In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
# Global plot configuration: every figure drawn below defaults to 20x10 inches.
sns.set(rc={'figure.figsize':(20, 10)})

In [None]:
# Read the raw listings export (comma-separated, UTF-8) into the working frame
# and preview its first rows.
df = pd.read_csv(
    "airbnb_nyc_final.csv",
    sep=",",
    encoding="utf-8",
)
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Summary statistics (pandas describes the numeric columns by default).
df.describe()

In [None]:
# Drop the columns that are not needed for the analysis.
# The frame is rebound instead of mutated with ``inplace=True``, and
# ``errors="ignore"`` keeps the cell safe to re-run once the columns are gone
# (the in-place version raised KeyError on a second execution).
df = df.drop(columns=["name", "id", "host_id", "host_name"], errors="ignore")
df.head()

In [None]:
# Count-like columns that must end up as int64.
integers = ["minimum_nights", "number_of_reviews", "calculated_host_listings_count", "availability_365"]

for column in integers:
    # Non-numeric entries become NaN so they can be imputed below.
    values = pd.to_numeric(df[column], errors="coerce")

    # Three-sigma band around the mean: [mean - 3*std, mean + 3*std].
    # (Negating the upper bound is only correct for a zero-mean column and
    # inverts the band when the mean is negative.)
    center = values.mean()
    spread = 3 * values.std()
    lower_bound = center - spread
    upper_bound = center + spread

    # Mean of the values strictly inside the band; used as the replacement value.
    inside_band = (values > lower_bound) & (values < upper_bound)
    fill_value = values[inside_band].mean()

    # The inlier mean is NaN when nothing lies strictly inside the band
    # (constant column, fewer than two numeric values). NaN must be detected
    # with pd.isna(); an identity test against np.nan is unreliable.
    if pd.isna(fill_value):
        fill_value = center

    # Replace outliers and missing values with the fill value. Working on a
    # Series and assigning it back avoids chained ``fillna(..., inplace=True)``,
    # which is deprecated and has no effect under Copy-on-Write.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    values = values.mask(is_outlier, fill_value).fillna(fill_value)

    # astype truncates the fractional part of the imputed mean; it raises if the
    # column had no numeric values at all (everything is still NaN).
    df[column] = values.astype("int64")

In [None]:
# Price distribution before cleaning ("price" is only coerced to numeric in a later cell).
sns.histplot(df["price"], kde=True)  # the x-axis readability is low, as there are string values in the dataset

In [None]:
# Continuous columns that must end up as float64.
# "calculated_host_listings_count" is intentionally not listed here: it is a
# count handled by the integer pass, and listing it again cast it back to float64.
floats = ["price", "reviews_per_month", "latitude", "longitude"]

for column in floats:
    # Non-numeric entries become NaN so they can be imputed below.
    values = pd.to_numeric(df[column], errors="coerce")

    # Three-sigma band around the mean: [mean - 3*std, mean + 3*std].
    # (Negating the upper bound inverts the band for columns with a negative
    # mean -- e.g. western-hemisphere longitudes -- so every row would be
    # flagged as an outlier.)
    center = values.mean()
    spread = 3 * values.std()
    lower_bound = center - spread
    upper_bound = center + spread

    # Mean of the values strictly inside the band; used as the replacement value.
    inside_band = (values > lower_bound) & (values < upper_bound)
    fill_value = values[inside_band].mean()

    # The inlier mean is NaN when nothing lies strictly inside the band
    # (constant column, fewer than two numeric values). NaN must be detected
    # with pd.isna(); an identity test against np.nan is unreliable.
    if pd.isna(fill_value):
        fill_value = center

    # Replace outliers and missing values with the fill value. Working on a
    # Series and assigning it back avoids chained ``fillna(..., inplace=True)``,
    # which is deprecated and has no effect under Copy-on-Write.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    values = values.mask(is_outlier, fill_value).fillna(fill_value)

    df[column] = values.astype("float64")  # store the cleaned column as float64

In [None]:
# Price distribution again, now that "price" has been coerced to numeric and imputed.
sns.histplot(df["price"], kde=True)


In [None]:
# Re-check dtypes and non-null counts after the numeric conversions.
df.info()


In [None]:
# Frequency of each distinct value in "room_type".
# value_counts() on the column already groups by value; grouping the column by
# itself first is redundant and makes the index shape depend on the pandas version.
df["room_type"].value_counts()

In [None]:
# Room types that occur exactly once are treated as noise and collapsed into a
# single "unknown_type" bucket.
# The counts come straight from the column, so the labels are plain values; the
# earlier self-groupby relied on ``key[0]`` of a MultiIndex tuple, which picks
# the first *character* of the label when the index has a single level.
room_type_counts = df["room_type"].value_counts()
rare_room_types = room_type_counts[room_type_counts == 1].index
df.loc[df["room_type"].isin(rare_room_types), "room_type"] = "unknown_type"

# Show the distribution after collapsing the rare labels.
df["room_type"].value_counts()

In [None]:
df.isna().sum()  # number of missing (NA) values in each column

In [None]:
# Replace missing entries in "date" with the placeholder label "unknown_date".
date_is_missing = df["date"].isna()
df.loc[date_is_missing, "date"] = "unknown_date"

In [None]:
# Re-count missing values per column after filling "date".
df.isna().sum()

In [None]:
# Names of the columns still stored with the generic object dtype.
object_columns = df.select_dtypes(include="object").columns
object_columns

In [None]:
# Cast every object column to the pandas categorical dtype in one astype call.
df = df.astype({column: "category" for column in object_columns})

In [None]:
# Final dtype check: the former object columns should now be reported as category.
df.info()

In [None]:
# Persist the cleaned frame. Categorical dtypes are not preserved by CSV; they are written as plain text.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# pass index=False if downstream readers do not expect it -- confirm before changing.
df.to_csv("airbnb_nyc_preprocessed.csv")  # turns the dataframe into a csv 