### Import Libraries

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt 
# for data preparation for ML
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder
from sklearn.preprocessing import MinMaxScaler , StandardScaler , RobustScaler
from sklearn.impute import KNNImputer

### Read Data

In [2]:
# Show every column when a frame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Load the dataset; the relative path is resolved against the current working directory.
# NOTE(review): the file name has no extension — presumably a CSV exported by an earlier step; confirm.
data = pd.read_csv('Airbnb_after_univariate')
# Preview the first rows.
data.head()

Unnamed: 0,id,host_identity_verified,neighbourhood group,neighbourhood,lat,long,country,instant_bookable,cancellation_policy,room type,construction year,price,service fee,minimum nights,number of reviews,reviews per month,review rate number,calculated host listings count,availability 365,road,suburb,last_review_(month),last_review_(year),last_review_(day),last_review_(day_name)
0,1001254,unconfirmed,Brooklyn,Kensington,40.64749,-73.97237,United States,False,strict,Private room,2020.0,966.0,193.0,10.0,9.0,0.21,4.0,6.0,286.0,Friel Place,Brooklyn,10.0,2021.0,19.0,Tuesday
1,1002102,verified,Manhattan,Midtown,40.75362,-73.98377,United States,False,moderate,Entire home/apt,2007.0,142.0,28.0,30.0,45.0,0.38,4.0,2.0,228.0,West 41st Street,Manhattan,5.0,2022.0,21.0,Saturday
2,1002403,,Manhattan,Harlem,40.80902,-73.9419,United States,True,flexible,Private room,2005.0,620.0,124.0,3.0,0.0,,5.0,1.0,352.0,West 128th Street,Manhattan,,,,
3,1002755,unconfirmed,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,True,moderate,Entire home/apt,2005.0,368.0,74.0,30.0,270.0,4.64,4.0,1.0,322.0,Gates Avenue,Brooklyn,7.0,2019.0,5.0,Friday
4,1003689,verified,Manhattan,East Harlem,40.79851,-73.94399,United States,False,moderate,Entire home/apt,2009.0,204.0,41.0,10.0,9.0,0.1,3.0,1.0,289.0,Park Avenue,Manhattan,11.0,2018.0,19.0,Monday


### Drop columns that won't contribute to the ML process

In [3]:
# Columns excluded from modelling; they are removed from `data` in place.
cols = [
    'id', 'host_identity_verified', 'neighbourhood', 'lat', 'long', 'country',
    'instant_bookable', 'reviews per month', 'road', 'suburb', 'last_review_(day)',
]
data.drop(columns=cols, inplace=True)

In [4]:
# Preview the frame after the column drop.
data.head()

Unnamed: 0,neighbourhood group,cancellation_policy,room type,construction year,price,service fee,minimum nights,number of reviews,review rate number,calculated host listings count,availability 365,last_review_(month),last_review_(year),last_review_(day_name)
0,Brooklyn,strict,Private room,2020.0,966.0,193.0,10.0,9.0,4.0,6.0,286.0,10.0,2021.0,Tuesday
1,Manhattan,moderate,Entire home/apt,2007.0,142.0,28.0,30.0,45.0,4.0,2.0,228.0,5.0,2022.0,Saturday
2,Manhattan,flexible,Private room,2005.0,620.0,124.0,3.0,0.0,5.0,1.0,352.0,,,
3,Brooklyn,moderate,Entire home/apt,2005.0,368.0,74.0,30.0,270.0,4.0,1.0,322.0,7.0,2019.0,Friday
4,Manhattan,moderate,Entire home/apt,2009.0,204.0,41.0,10.0,9.0,3.0,1.0,289.0,11.0,2018.0,Monday


### Show Descriptive statistics 
- for outliers

In [5]:
# Summary statistics of the numeric columns only (used here to eyeball outliers).
data.select_dtypes('number').describe()

Unnamed: 0,construction year,price,service fee,minimum nights,number of reviews,review rate number,calculated host listings count,availability 365,last_review_(month),last_review_(year)
count,101842.0,101807.0,101781.0,101655.0,101871.0,101735.0,101735.0,98853.0,86222.0,86222.0
mean,2012.48808,625.350467,125.038229,8.165688,27.516997,3.278744,7.93718,133.966202,5.889309,2018.994213
std,5.765851,331.676939,66.326766,30.606788,49.57169,1.285106,32.266966,129.802232,3.02031,1.661528
min,2003.0,50.0,10.0,1.0,0.0,1.0,1.0,0.0,1.0,2012.0
25%,2007.0,340.0,68.0,2.0,1.0,2.0,1.0,2.0,4.0,2018.0
50%,2012.0,624.0,125.0,3.0,7.0,3.0,1.0,90.0,6.0,2019.0
75%,2017.0,913.0,183.0,5.0,31.0,4.0,2.0,253.0,7.0,2019.0
max,2022.0,1200.0,240.0,5645.0,1024.0,5.0,332.0,365.0,12.0,2024.0


In [6]:
def get_outliers(df, col, whisker=1.5):
    """Return the index labels of the rows whose ``col`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - whisker * IQR`` or
    strictly above ``Q3 + whisker * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of ``df[col]`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to inspect.
    col : str
        Name of a numeric column in ``df``.
    whisker : float, default 1.5
        Multiplier applied to the IQR to place the whiskers. The default keeps
        the classic box-plot rule.

    Returns
    -------
    pandas.Index
        Index labels of the outlier rows (empty when there are none).

    Raises
    ------
    ValueError
        If ``whisker`` is negative.
    """
    if whisker < 0:
        raise ValueError("whisker must be non-negative")

    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1                      # interquartile range

    lower_whisker = q1 - whisker * iqr
    upper_whisker = q3 + whisker * iqr

    # Comparisons against NaN are False, so missing values are never flagged.
    is_outlier = (values > upper_whisker) | (values < lower_whisker)
    return df[is_outlier].index

In [7]:
numeric_columns = data.select_dtypes('number').columns
for col in numeric_columns:
    # Shape (rows, columns) of the sub-frame made of this column's outlier rows.
    val = data.loc[get_outliers(data, col)].shape

    # Outlier rows as a percentage of all rows in the frame.
    outlier_pct = val[0] / data.shape[0] * 100
    print(f"{col} : {val} and it's percentage : {outlier_pct}")

construction year : (0, 14) and it's percentage : 0.0
price : (0, 14) and it's percentage : 0.0
service fee : (0, 14) and it's percentage : 0.0
minimum nights : (18249, 14) and it's percentage : 17.881709683108944
number of reviews : (11119, 14) and it's percentage : 10.895212338565857
review rate number : (0, 14) and it's percentage : 0.0
calculated host listings count : (17702, 14) and it's percentage : 17.34571893311384
availability 365 : (0, 14) and it's percentage : 0.0
last_review_(month) : (5428, 14) and it's percentage : 5.318752817136025
last_review_(year) : (23431, 14) and it's percentage : 22.959413643757227


##### Hint >> Logarithmic transformation is a valuable technique for mitigating the influence of outliers while preserving the integrity of real-world data. It’s particularly useful when outliers are meaningful but skew the data distribution or adversely affect model performance.

In [8]:
# Columns whose IQR outliers are kept as genuine values and compressed with log1p.
cols_outliers = [
    'minimum nights',
    'number of reviews',
    'calculated host listings count',
    'last_review_(month)',
    'last_review_(year)',
]

# log1p(x) = log(1 + x), applied element-wise to all listed columns at once.
# NOTE(review): this overwrites the columns, so running the cell twice transforms them twice.
# NOTE(review): month and year are calendar values — confirm a log transform is intended for them.
data[cols_outliers] = np.log1p(data[cols_outliers])

In [9]:
# Summary statistics of the numeric columns after the log1p transform.
data.select_dtypes('number').describe()

Unnamed: 0,construction year,price,service fee,minimum nights,number of reviews,review rate number,calculated host listings count,availability 365,last_review_(month),last_review_(year)
count,101842.0,101807.0,101781.0,101655.0,101871.0,101735.0,101735.0,98853.0,86222.0,86222.0
mean,2012.48808,625.350467,125.038229,1.578545,2.211391,3.278744,1.157053,133.966202,1.810075,7.61085
std,5.765851,331.676939,66.326766,0.937551,1.56245,1.285106,0.924912,129.802232,0.527855,0.000823
min,2003.0,50.0,10.0,0.693147,0.0,1.0,0.693147,0.0,0.693147,7.607381
25%,2007.0,340.0,68.0,1.098612,0.693147,2.0,0.693147,2.0,1.609438,7.610358
50%,2012.0,624.0,125.0,1.386294,2.079442,3.0,0.693147,90.0,1.94591,7.610853
75%,2017.0,913.0,183.0,1.791759,3.465736,4.0,1.098612,253.0,2.079442,7.610853
max,2022.0,1200.0,240.0,8.638703,6.932448,5.0,5.808142,365.0,2.564949,7.613325


### Dealing with missing values

In [10]:
data.isnull().mean()*100       # percentage of missing values per column (mean of the null mask x 100).

neighbourhood group                0.028416
cancellation_policy                0.074470
room type                          0.000000
construction year                  0.207733
price                              0.242029
service fee                        0.267505
minimum nights                     0.390969
number of reviews                  0.179317
review rate number                 0.312580
calculated host listings count     0.312580
availability 365                   3.136575
last_review_(month)               15.513356
last_review_(year)                15.513356
last_review_(day_name)            15.513356
dtype: float64

### Splitting the data before filling the missing values

In [11]:
# y: target (dependent variable) = price; x: every remaining column (independent variables).
# NOTE(review): in the previewed rows `service fee` is roughly 20% of `price`
# (e.g. 193 vs 966) — confirm it is acceptable to keep it as a feature.
y = data['price']
x = data.drop(columns=['price'])

In [12]:
# Hold out 20% of the rows as the test set; random_state=0 makes the split reproducible.
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size=0.2 , random_state=0)

- I will fill the missing values in the categorical columns (neighbourhood group, cancellation_policy, room type, last_review_(day_name)) with the mode (most frequent value).

In [13]:
# Categorical columns to impute.
category_cols = ['neighbourhood group' , 'cancellation_policy' , 'room type' , 'last_review_(day_name)']
# Imputer that replaces missing entries with each column's most frequent value (the mode).
cate_imputer = SimpleImputer(strategy='most_frequent')

In [14]:
# Imputation, not encoding: the modes are learned from the training split only and reused on the test split.
x_train[category_cols] = cate_imputer.fit_transform(x_train[category_cols])    # learn modes on x_train and fill its missing values.
x_test[category_cols] = cate_imputer.transform(x_test[category_cols])          # fill x_test with the modes learned on x_train.

- I will fill the few missing values in the numeric columns with the mean.

In [15]:
# Numeric feature columns to impute (the two last_review columns are handled separately).
num_cols = ['construction year','service fee','minimum nights','number of reviews','review rate number'
         ,'calculated host listings count','availability 365']
# Imputer that replaces missing entries with each column's mean.
num_imputer = SimpleImputer(strategy='mean')

In [16]:
# Imputation, not encoding: the means are learned from the training split only and reused on the test split.
x_train[num_cols] = num_imputer.fit_transform(x_train[num_cols])  # learn column means on x_train and fill its missing values.
x_test[num_cols] = num_imputer.transform(x_test[num_cols])        # fill x_test with the means learned on x_train.

- I will fill the missing values in the (last_review_(month), last_review_(year)) columns with a KNN imputer.

In [17]:
knn_imputer = KNNImputer()     # KNNImputer created with its default settings.

In [18]:
month_col = 'last_review_(month)'
# Fit the imputer on the training column alone and fill its missing values.
# NOTE(review): only this one column is passed in, and the filled rows in the x_train
# preview all share a single value — this looks equivalent to mean imputation; confirm.
x_train[month_col] = knn_imputer.fit_transform(x_train[[month_col]])
# Fill the test column with the imputer fitted on the training column.
x_test[month_col] = knn_imputer.transform(x_test[[month_col]])

In [19]:
year_col = 'last_review_(year)'
# Re-fit the same imputer instance on the training year column, then fill both splits with it.
x_train[year_col] = knn_imputer.fit_transform(x_train[[year_col]])
x_test[year_col] = knn_imputer.transform(x_test[[year_col]])

- Fill the missing values in the target (price) with the mean.

In [20]:
# Convert each target Series to a 2-D NumPy column (n_rows, 1).
# Not re-runnable as is: after this cell the targets are arrays, which have no `.values`.
y_train = y_train.values.reshape(-1, 1)        # -1 : number of rows inferred from the array length.
y_test = y_test.values.reshape(-1, 1)          #  1 : exactly one column.

In [21]:
# Use a dedicated imputer for the target. Re-fitting `num_imputer` here would overwrite
# the column means it learned from the feature columns, so any later (or re-run)
# `num_imputer.transform` on the features would fail or use the wrong statistics.
target_imputer = SimpleImputer(strategy='mean')

y_train = target_imputer.fit_transform(y_train)   # learn the mean price on y_train and fill its missing values.
y_test = target_imputer.transform(y_test)         # fill y_test with the mean learned on y_train.
# NOTE(review): rows with an imputed test target are then scored against a filled-in value —
# confirm whether rows with a missing price should be dropped instead.

In [22]:
# Convert the 2-D arrays back to pandas Series.
# The new Series get a fresh 0..n-1 index; the original row labels are not restored.
y_train = pd.Series(y_train.ravel())         # ravel(): flattens the (n, 1) array to 1-D.
y_test = pd.Series(y_test.ravel())           # pd.Series(): wraps the 1-D array in a Series.

### Encoding Step

In [23]:
# Number of distinct values per categorical feature, used to pick an encoder per column.
# Counted on x_train (not on the unsplit `data` or on x_test) because the encoders
# are fitted on the training split only.
for col in x_train.select_dtypes('object').columns:
    print(f'{col} -- {x_train[col].nunique()}')

neighbourhood group -- 6
cancellation_policy -- 3
room type -- 4
last_review_(day_name) -- 7


- These columns contain nominal data.
- So, I will use a one-hot encoder for the (cancellation_policy, room type) columns.
- And a binary encoder for the (neighbourhood group, last_review_(day_name)) columns.

In [24]:
ohe_col = ['cancellation_policy','room type']   # columns to be encoded with the one-hot encoder.
be_col = ['neighbourhood group','last_review_(day_name)']      # columns to be encoded with the binary encoder.

In [25]:
ohe = OneHotEncoder(sparse_output=False , drop = 'first' )   # dense output; the first category of each column is dropped.
be = BinaryEncoder()                                         # binary encoder with its default settings.

In [26]:
# Fit the one-hot encoder on the training split, then reuse it for the test split.
train_ohe_values = ohe.fit_transform(x_train[ohe_col])
test_ohe_values = ohe.transform(x_test[ohe_col])

# Both frames get a fresh 0..n-1 index, so x_train / x_test must have their
# indexes reset before these frames are concatenated to them.
x_train_ohe = pd.DataFrame(train_ohe_values, columns=ohe.get_feature_names_out())
x_test_ohe = pd.DataFrame(test_ohe_values, columns=ohe.get_feature_names_out())

In [27]:
x_train_be = be.fit_transform(x_train[be_col])    # fit the binary encoder on x_train and encode it.
x_test_be = be.transform(x_test[be_col])          # encode x_test with the encoder fitted on x_train.

In [28]:
# Give all four frames a fresh 0..n-1 index (old labels discarded) so they line up
# row-by-row with the one-hot frames when concatenated.
for frame in (x_train, x_test, x_train_be, x_test_be):
    frame.reset_index(drop=True, inplace=True)

In [29]:
x_train = pd.concat([x_train , x_train_ohe , x_train_be] , axis = 1 ) # append the encoded columns to x_train side by side (aligned on the index).

In [30]:
x_test = pd.concat([x_test , x_test_ohe , x_test_be], axis = 1 )      # append the encoded columns to x_test side by side (aligned on the index).

In [31]:
# Remove the original categorical columns now that their encoded versions are present.
x_train = x_train.drop(columns=ohe_col + be_col)

In [32]:
# Remove the original categorical columns now that their encoded versions are present.
x_test = x_test.drop(columns=ohe_col + be_col)

In [33]:
# Display the encoded training features.
x_train

Unnamed: 0,construction year,service fee,minimum nights,number of reviews,review rate number,calculated host listings count,availability 365,last_review_(month),last_review_(year),cancellation_policy_moderate,cancellation_policy_strict,room type_Hotel room,room type_Private room,room type_Shared room,neighbourhood group_0,neighbourhood group_1,neighbourhood group_2,last_review_(day_name)_0,last_review_(day_name)_1,last_review_(day_name)_2
0,2011.0,54.0,2.397895,2.197225,4.0,0.693147,25.0,0.693147,7.612337,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,1
1,2022.0,239.0,0.693147,0.000000,3.0,0.693147,257.0,1.809018,7.610851,1.0,0.0,0.0,1.0,0.0,0,1,0,0,0,1
2,2008.0,139.0,1.098612,4.477337,5.0,0.693147,60.0,1.945910,7.610853,0.0,0.0,0.0,1.0,0.0,0,0,1,0,0,1
3,2011.0,101.0,0.693147,4.564348,3.0,1.098612,299.0,1.945910,7.610853,0.0,1.0,0.0,0.0,0.0,0,1,0,0,1,0
4,2003.0,188.0,0.693147,0.693147,2.0,0.693147,0.0,0.693147,7.609862,1.0,0.0,0.0,1.0,0.0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81638,2004.0,213.0,1.791759,2.197225,3.0,0.693147,81.0,1.945910,7.610853,0.0,1.0,0.0,0.0,0.0,0,1,0,0,1,0
81639,2010.0,175.0,1.098612,0.000000,4.0,1.386294,313.0,1.809018,7.610851,1.0,0.0,0.0,0.0,0.0,0,0,1,0,0,1
81640,2008.0,84.0,1.098612,2.484907,5.0,0.693147,90.0,2.079442,7.610853,0.0,0.0,0.0,0.0,0.0,0,1,0,1,1,1
81641,2021.0,11.0,1.098612,0.693147,3.0,1.098612,77.0,1.609438,7.610853,0.0,0.0,0.0,1.0,0.0,0,1,0,0,0,1


In [34]:
# Display the training target after imputation.
y_train

0         271.0
1        1197.0
2         693.0
3         504.0
4         938.0
          ...  
81638    1067.0
81639     873.0
81640     422.0
81641      56.0
81642    1141.0
Length: 81643, dtype: float64

### Feature Scaling

In [35]:
sclr = StandardScaler()  # StandardScaler created with its default settings.

In [36]:
# Scaling, not encoding. Every column is scaled, including the one-hot / binary indicator columns.
# The cell overwrites x_train, so running it twice re-fits the scaler on already-scaled data.
x_train[x_train.columns] = sclr.fit_transform(x_train)   # fit the scaler on x_train and scale it.

In [37]:
# Display the scaled training features.
x_train

Unnamed: 0,construction year,service fee,minimum nights,number of reviews,review rate number,calculated host listings count,availability 365,last_review_(month),last_review_(year),cancellation_policy_moderate,cancellation_policy_strict,room type_Hotel room,room type_Private room,room type_Shared room,neighbourhood group_0,neighbourhood group_1,neighbourhood group_2,last_review_(day_name)_0,last_review_(day_name)_1,last_review_(day_name)_2
0,-0.262083,-1.070424,0.876678,-0.010636,0.562176,-0.502157,-0.851301,-2.298904,1.963274e+00,-0.711841,-0.705600,-0.033951,-0.910856,-0.148087,-0.192901,-1.122253,0.881241,-0.792224,-0.840374,0.641717
1,1.645949,1.720382,-0.945449,-1.419168,-0.217397,-0.502157,0.965628,0.000000,-2.347015e-12,1.404808,-0.705600,-0.033951,1.097869,-0.148087,-0.192901,0.891065,-1.134763,-0.792224,-0.840374,0.641717
2,-0.782456,0.211839,-0.512066,1.451031,1.341749,-0.502157,-0.577195,0.282024,2.474977e-03,-0.711841,-0.705600,-0.033951,1.097869,-0.148087,-0.192901,-1.122253,0.881241,-0.792224,-0.840374,0.641717
3,-0.262083,-0.361408,-0.945449,1.506810,-0.217397,-0.061072,1.294554,0.282024,2.474977e-03,-0.711841,1.417233,-0.033951,-0.910856,-0.148087,-0.192901,0.891065,-1.134763,-0.792224,1.189947,-1.558320
4,-1.649743,0.951025,-0.945449,-0.974825,-0.996969,-0.502157,-1.047091,-2.298904,-1.306343e+00,1.404808,-0.705600,-0.033951,1.097869,-0.148087,-0.192901,0.891065,-1.134763,-0.792224,1.189947,0.641717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81638,-1.476286,1.328161,0.228808,-0.010636,-0.217397,-0.502157,-0.412732,0.282024,2.474977e-03,-0.711841,1.417233,-0.033951,-0.910856,-0.148087,-0.192901,0.891065,-1.134763,-0.792224,1.189947,-1.558320
81639,-0.435541,0.754914,-0.512066,-1.419168,0.562176,0.251883,1.404197,0.000000,-2.347015e-12,1.404808,-0.705600,-0.033951,-0.910856,-0.148087,-0.192901,-1.122253,0.881241,-0.792224,-0.840374,0.641717
81640,-0.782456,-0.617860,-0.512066,0.173783,1.341749,-0.502157,-0.342248,0.557123,2.474977e-03,-0.711841,-0.705600,-0.033951,-0.910856,-0.148087,-0.192901,0.891065,-1.134763,1.262269,1.189947,0.641717
81641,1.472492,-1.719097,-0.512066,-0.974825,-0.217397,-0.061072,-0.444058,-0.411172,2.474977e-03,-0.711841,-0.705600,-0.033951,1.097869,-0.148087,-0.192901,0.891065,-1.134763,-0.792224,-0.840374,0.641717


In [38]:
x_test[x_test.columns] = sclr.transform(x_test)        # scale x_test with the scaler fitted on x_train.

In [39]:
# Display the scaled test features.
x_test

Unnamed: 0,construction year,service fee,minimum nights,number of reviews,review rate number,calculated host listings count,availability 365,last_review_(month),last_review_(year),cancellation_policy_moderate,cancellation_policy_strict,room type_Hotel room,room type_Private room,room type_Shared room,neighbourhood group_0,neighbourhood group_1,neighbourhood group_2,last_review_(day_name)_0,last_review_(day_name)_1,last_review_(day_name)_2
0,0.258289,-1.326876,0.033932,0.991952,-0.996969,-0.061072,-0.444058,-0.035556,2.474977e-03,-0.711841,1.417233,-0.033951,1.097869,-0.148087,5.184016,-1.122253,0.881241,1.262269,-0.840374,0.641717
1,-0.262083,0.724743,-0.945449,1.458275,-1.776542,-0.502157,-0.937448,0.282024,2.474977e-03,-0.711841,1.417233,-0.033951,1.097869,-0.148087,-0.192901,0.891065,-1.134763,1.262269,1.189947,-1.558320
2,0.431747,-0.678202,-0.945449,0.272601,1.341749,-0.502157,1.654807,0.282024,2.474977e-03,-0.711841,-0.705600,-0.033951,-0.910856,-0.148087,-0.192901,0.891065,-1.134763,-0.792224,-0.840374,0.641717
3,-0.435541,0.875598,1.984110,-1.419168,-1.776542,-0.502157,-1.047091,0.000000,-2.347015e-12,1.404808,-0.705600,-0.033951,1.097869,-0.148087,-0.192901,-1.122253,0.881241,-0.792224,-0.840374,0.641717
4,0.258289,1.056623,-0.204576,1.665686,0.562176,-0.502157,-0.835638,0.282024,2.474977e-03,-0.711841,1.417233,-0.033951,-0.910856,-0.148087,-0.192901,-1.122253,0.881241,-0.792224,1.189947,0.641717
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20406,1.299034,1.147136,0.033932,1.088637,-0.996969,-0.502157,-0.647680,0.557123,2.474977e-03,1.404808,-0.705600,-0.033951,-0.910856,-0.148087,-0.192901,0.891065,-1.134763,1.262269,1.189947,0.641717
20407,-1.129371,1.388503,-0.945449,-0.530483,1.341749,-0.502157,1.114428,0.557123,-1.961239e+00,1.404808,-0.705600,-0.033951,1.097869,-0.148087,-0.192901,-1.122253,0.881241,1.262269,1.189947,0.641717
20408,-1.302828,-1.598414,-0.512066,-0.714902,1.341749,-0.502157,1.263228,0.282024,2.474977e-03,1.404808,-0.705600,-0.033951,-0.910856,-0.148087,-0.192901,0.891065,-1.134763,-0.792224,-0.840374,0.641717
20409,0.431747,0.106240,-0.204576,0.693630,0.562176,-0.502157,-0.522374,0.282024,2.474977e-03,1.404808,-0.705600,-0.033951,-0.910856,-0.148087,-0.192901,0.891065,-1.134763,-0.792224,-0.840374,0.641717
