## Feature Imputation using Machine Learning

In [103]:
# to prevent sklearn module not found error when import missforest
import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [104]:
# set scikit learn to older version for missingpy import

In [105]:
#pip install scikit-learn==1.1.2

In [106]:
print(sklearn.__version__)

1.1.2


In [138]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from missingpy import MissForest
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
from xgboost import cv
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from pprint import pprint

In [108]:
# setting display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

## ML Feature Imputation

### Feature Encoding

In [109]:
# read in data
trainpath = "train.csv"
df_train = pd.read_csv(trainpath, index_col='ID')
testpath = "test.csv"
df_test = pd.read_csv(testpath, index_col='ID')

# concatenate dataframes to reduce redundancies in operations
df = pd.concat([df_train, df_test])

In [110]:
# Data Cleaning

#Functions
def replace_string(df, c, s, r='', f='strip'):
    if f == 'find_replace':
        mask = (df[c].notnull()) & (df[c].astype(str).str.contains(s))
        df.loc[mask, c] = df.loc[mask, c].astype(str).str.replace(s, r)
    if f == 'replace':
        df[c] = df[c].replace(s, r)
    elif f == 'strip':
        df[c] = df[c].dropna().astype(str).str.replace(s, r, regex=True)
    return df

def replace_numeric(df, c, n, r=0, f='match'):
    if f == 'isgreater':
        df.loc[df[c] > n, c] = r
    elif f == 'isless':
        df.loc[df[c] < n, c] = r
    elif f == 'match':
        df.loc[df[c] == n, c] = r
    return df

def convert_numeric(df, c, t, d=1):
    df[c] = pd.to_numeric(df[c], errors='coerce')
    df[c] = df[c].astype(t)
    df[c] = df[c] / d
    return df

In [111]:
df2 = df

# price
df2 = replace_string(df2, 'price', '$','', 'strip')
df2 = replace_string(df2, 'price', ',','', 'strip')
df2 = convert_numeric(df2, 'price', 'float', 1)

# host_response_rate
df2 = replace_string(df2, 'host_response_rate', '%','', 'strip')
df2 = convert_numeric(df2, 'host_response_rate', 'float', 100)

# host_acceptance_rate
df2 = replace_string(df2, 'host_acceptance_rate', '%','', 'strip')
df2 = convert_numeric(df2, 'host_acceptance_rate', 'float', 100)

# bathrooms
df2 = replace_string(df2, 'bathrooms', 'Half-bath','0.5', 'find_replace')
df2 = replace_string(df2, 'bathrooms', 'half-bath','0.5', 'find_replace')
df2 = replace_string(df2, 'bathrooms', '[^0-9\.]','', 'strip')
df2 = convert_numeric(df2, 'bathrooms', 'float', 1)

# max/min nights - replace extreme values
df2 = replace_numeric(df2, 'maximum_nights', 9000, 1000, 'isgreater')
df2 = replace_numeric(df2, 'minimum_maximum_nights', 9000, 1000, 'isgreater')
df2 = replace_numeric(df2, 'maximum_maximum_nights', 9000, 1000, 'isgreater')
df2 = replace_numeric(df2, 'minimum_nights_avg_ntm', 9000, 1000, 'isgreater')
df2 = replace_numeric(df2, 'maximum_nights_avg_ntm', 9000, 1000, 'isgreater')

In [112]:
df3 = df2

# Create new features email, phone and work_email from host_verifications
df3 = replace_string(df3, 'host_verifications', "['email']","'1','0','0'", 'replace')
df3 = replace_string(df3, 'host_verifications', "['phone']","'0','1','0'", 'replace')
df3 = replace_string(df3, 'host_verifications', "['email', 'phone']","'1','1','0'", 'replace')
df3 = replace_string(df3, 'host_verifications', "['phone', 'work_email']","'0','1','1'", 'replace')
df3 = replace_string(df3, 'host_verifications', "['email', 'phone', 'work_email']","'1','1','1'", 'replace')

df3[['email', 'phone', 'work_email']] = df3['host_verifications'].str.split(',', expand=True)

df3 = replace_string(df3, 'email', "'",'', 'strip')
df3 = convert_numeric(df3, 'email','int', 1)

df3 = replace_string(df3, 'phone', "'",'', 'strip')
df3 = convert_numeric(df3, 'phone','int', 1)

df3 = replace_string(df3, 'work_email', "'",'', 'strip')
df3 = convert_numeric(df3, 'work_email','int', 1)

df3.drop(['host_verifications'], axis=1, inplace=True)

In [113]:
# Create new features smoke_alarm, kitchen, essential, hangers, wifi from amenities
# These are the top 5 ammenities in the dataset

from collections import Counter

amenity_count = Counter()
amenity_count_total = Counter()
count_total = []

for amenities_str in df3['amenities']:
    amenity_count_total = 0
    amenities_list = amenities_str.strip('][').replace('"', '').split(', ')
    for amenity in amenities_list:
        amenity_count[amenity] += 1
        amenity_count_total  += 1
    count_total.append(amenity_count_total)

df_amenities = pd.DataFrame(columns=[ 'amenity_count'])
df_amenities['amenity_count'] = amenity_count
df_amenities = df_amenities.sort_values('amenity_count', ascending=False)
df_amenities.head(5)

#df_acc = pd.DataFrame(columns=[ 'total_amenity_counts'])
#df_acc['total_amenity_counts'] = count_total
#df_acc
#print(acc)

#Smoke alarm	9548
#Kitchen	9383
#Essentials	9327
#Hangers	8702
#Wifi	8618

df3['amenity_count'] = count_total

df3[['smoke_alarm','kitchen','essentials','hangers','wifi']] = 0

for idx, amenities_str in df3['amenities'].items():
    amenities_list = amenities_str.strip('][').replace('"', '').split(', ')
    if 'Smoke alarm' in amenities_list:
        df3.loc[idx, 'smoke_alarm'] = 1
    if 'Kitchen' in amenities_list:
        df3.loc[idx, 'kitchen'] = 1        
    if 'Essentials' in amenities_list:
        df3.loc[idx, 'essentials'] = 1      
    if 'Hangers' in amenities_list:
        df3.loc[idx, 'hangers'] = 1      
    if 'Wifi' in amenities_list:
        df3.loc[idx, 'wifi'] = 1              


df3.drop(['amenities'], axis=1, inplace=True)

In [114]:
df_amenities.iloc[0:50,]

Unnamed: 0,amenity_count
Smoke alarm,9548
Kitchen,9383
Essentials,9327
Hangers,8702
Wifi,8618
Iron,8359
Hair dryer,8328
Dishes and silverware,8111
Hot water,7986
Refrigerator,7479


In [115]:
df_amenities.loc[df_amenities.index == 'Kitchen']

Unnamed: 0,amenity_count
Kitchen,9383


In [116]:
df5 = df3


#onehot encoder function
def onehot(df, c):
    for col in c:
        df = df.join(pd.get_dummies(df[[col]], drop_first=True))
        df.drop([col], axis=1, inplace=True)
    return df

#encode binary classifiers
# 'host_is_superhost','host_has_profile_pic','host_identity_verified','has_availability','instant_bookable'
df5 = onehot(df5, ['source', 'host_is_superhost','host_has_profile_pic','host_identity_verified','has_availability','instant_bookable'])

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

#encode source
#le = encoder.fit_transform(df5['source'].values)
#df5['source'] = le
#print('source:', encoder.classes_)

#encode room_type alternative (1): ordinal --> more space/privacy = more value
# create dictionary to map values
room_mapping = {'Entire home/apt':4, 'Private room':3, 'Hotel room':2, 'Shared room':1}
df5['room_type'] = df5['room_type'].map(room_mapping) # use map function

# encode room type alternative (2): onehot --> evaluate if prevalence of room type equals a difference in price
#df5 = onehot(df5, ['room_type'])

#le = encoder.fit_transform(df5['room_type'].values)
#df5['room_type'] = le
#room_type_classes = encoder.classes_


#encode top 5 property_type and other
top_5_property_type = df5['property_type'].value_counts().nlargest(5).index.tolist()  
encoder.fit(top_5_property_type + ['other'])  
#df5['property_type_encoded'] = df5['property_type'].apply(lambda x: x if x in top_5_property_type else 'other')
df5['property_type'] = df5['property_type'].apply(lambda x: x if x in top_5_property_type else 'other')
df5 = onehot(df5, ['property_type'])


#encode top 5 neighbourhood_cleansed and other
top_5_neighbourhood_cleansed = df5['neighbourhood_cleansed'].value_counts().nlargest(5).index.tolist()  
encoder.fit(top_5_neighbourhood_cleansed + ['other'])  

df5['neighbourhood_cleansed'] = df5['neighbourhood_cleansed'].apply(lambda x: x if x in top_5_neighbourhood_cleansed else 'other')
df5 = onehot(df5, ['neighbourhood_cleansed'])


# map/rank host_response_time
host_response_mapping = {'within an hour':1, 'within a few hours':2, 'within a day':3, 'a few days or more':4}
df5['host_response_time'] = df5['host_response_time'].map(host_response_mapping)

# convert host_since into days based on current date
from datetime import datetime
today = datetime.today()
df5['host_since'] = pd.to_datetime(df5['host_since'], format='%Y/%m/%d')
df5['host_since'] = (today - df5['host_since']).dt.days

# convert first_review into days based on current date
df5['first_review'] = pd.to_datetime(df5['first_review'], format='%Y/%m/%d')
df5['first_review'] = (today - df5['first_review']).dt.days

# convert last_review into days based on current date
df5['last_review'] = pd.to_datetime(df5['last_review'], format='%Y/%m/%d')
df5['last_review'] = (today - df5['last_review']).dt.days

In [117]:
df5.number_of_reviews/(df5.first_review/30 - df5.last_review/30)

ID
0       6.152782
1       1.367683
2       0.094712
3       0.582524
4       1.432396
          ...   
9995    8.571429
9996    0.892984
9997         NaN
9998    3.000000
9999    3.325301
Length: 10000, dtype: float64

In [118]:
df7 = df5
#df7.drop(['name', 'description','neighborhood_overview','host_name',
#          'host_about','neighbourhood','latitude','longitude'], axis=1, inplace=True)
df7.drop(['name', 'description','neighborhood_overview','host_name',
          'host_about','neighbourhood'], axis=1, inplace=True)

df7.drop(['host_location', #'host_response_rate','host_acceptance_rate',
          'host_neighbourhood',
          #'host_listings_count'
         ], 
          axis=1, inplace=True)
df7.drop(['minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm'], axis=1, inplace=True)
#df7.drop(['number_of_reviews','number_of_reviews_ltm', 'number_of_reviews_l30d'], axis=1, inplace=True)
df7.drop([#'first_review', 
    'last_review'], axis=1, inplace=True)
#df7.drop(['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value'], axis=1, inplace=True) 
df7.drop(['calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms'], axis=1, inplace=True)
#df7.drop(['reviews_per_month'], axis=1, inplace=True)
 
df7

Unnamed: 0_level_0,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_listings_count,latitude,longitude,room_type,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,price,email,phone,work_email,amenity_count,smoke_alarm,kitchen,essentials,hangers,wifi,source_previous scrape,host_is_superhost_t,host_has_profile_pic_t,host_identity_verified_t,has_availability_t,instant_bookable_t,property_type_Entire home,property_type_Entire rental unit,property_type_Private room in home,property_type_Private room in rental unit,property_type_other,neighbourhood_cleansed_Port Phillip,neighbourhood_cleansed_Stonnington,neighbourhood_cleansed_Yarra,neighbourhood_cleansed_Yarra Ranges,neighbourhood_cleansed_other
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1
0,3724,1.0,1.00,0.98,2.0,-37.82030,144.99016,4.0,2,1.0,1.0,1.0,2,14,0,0,0,12.0,741,37,1,3711.0,4.88,4.91,4.97,4.94,4.93,4.93,4.82,6.11,132.0,1.0,1.0,0.0,38,1,0,1,1,1,0,0,1,1,1,0,0,0,0,0,1,0,0,1,0,0
1,5001,2.0,1.00,0.98,1.0,-37.76606,144.97951,3.0,2,1.0,1.0,1.0,4,27,0,12,22,112.0,169,25,3,3787.0,4.48,4.64,3.97,4.72,4.69,4.65,4.60,1.37,39.0,1.0,1.0,0.0,57,1,1,1,1,0,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,1
2,4198,2.0,1.00,0.78,2.0,-37.90546,145.39447,4.0,4,2.5,2.0,4.0,2,365,30,60,90,365.0,8,2,0,2882.0,4.75,4.88,4.75,4.88,4.50,5.00,4.75,0.09,270.0,1.0,1.0,0.0,21,1,1,0,0,1,0,1,1,1,1,0,0,1,0,0,0,0,0,0,1,0
3,4731,3.0,0.75,0.92,4.0,-37.82163,144.96672,3.0,2,2.5,1.0,1.0,1,730,30,60,90,365.0,2,0,0,4241.0,4.50,4.00,4.50,4.00,4.00,5.00,4.00,0.01,1000.0,1.0,1.0,0.0,13,0,1,0,0,1,0,0,1,1,1,0,0,0,0,1,0,0,0,0,0,0
4,4702,2.0,1.00,0.87,1.0,-38.05725,145.33936,4.0,5,1.0,3.0,3.0,1,14,17,21,51,312.0,214,39,4,4567.0,4.86,4.91,4.98,4.91,4.93,4.90,4.87,1.43,116.0,1.0,1.0,1.0,49,1,1,1,1,1,0,1,1,1,1,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3218,,,,1.0,-37.82025,145.03088,3.0,2,1.0,1.0,1.0,1,1125,0,0,0,0.0,2,0,0,2504.0,4.50,4.00,5.00,5.00,5.00,4.50,4.50,0.02,,1.0,1.0,0.0,13,1,1,1,1,1,1,0,1,1,1,0,0,0,0,1,0,0,0,0,0,1
9996,1202,,,,36.0,-37.83624,144.99299,4.0,2,1.0,1.0,1.0,1,90,1,1,1,268.0,42,9,0,1543.0,4.21,4.50,4.29,4.74,4.64,4.74,4.19,0.86,,1.0,1.0,1.0,40,1,1,1,1,1,0,0,1,0,1,1,0,1,0,0,0,0,1,0,0,0
9997,3124,1.0,1.00,0.92,36.0,-37.86326,144.75456,4.0,16,3.5,6.0,8.0,2,365,9,39,69,340.0,0,0,0,,,,,,,,,,,1.0,1.0,0.0,42,1,1,1,1,1,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,1
9998,1764,,,,2.0,-37.80913,144.96058,3.0,2,0.5,1.0,,1,1125,0,0,0,0.0,3,0,0,1217.0,3.00,4.00,3.00,2.33,3.33,4.00,3.33,0.08,,0.0,1.0,0.0,10,1,1,1,1,1,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0


In [119]:
df_impute = df7.drop(['price', 'source_previous scrape'], axis=1)

In [120]:
df_impute.isnull().sum()

host_since                                     0
host_response_time                           737
host_response_rate                           737
host_acceptance_rate                         721
host_listings_count                            0
latitude                                       0
longitude                                      0
room_type                                    175
accommodates                                   0
bathrooms                                      6
bedrooms                                     438
beds                                          84
minimum_nights                                 0
maximum_nights                                 0
availability_30                                0
availability_60                                0
availability_90                                0
availability_365                              67
number_of_reviews                              0
number_of_reviews_ltm                          0
number_of_reviews_l3

### Miss Forest Classifier

In [121]:
%%capture --no-display
imputer = MissForest()
X_imputed = imputer.fit_transform(df_impute)

In [122]:
X_imputed = pd.DataFrame(X_imputed, columns = df_impute.columns)

In [123]:
X_imputed.isnull().sum()

host_since                                   0
host_response_time                           0
host_response_rate                           0
host_acceptance_rate                         0
host_listings_count                          0
latitude                                     0
longitude                                    0
room_type                                    0
accommodates                                 0
bathrooms                                    0
bedrooms                                     0
beds                                         0
minimum_nights                               0
maximum_nights                               0
availability_30                              0
availability_60                              0
availability_90                              0
availability_365                             0
number_of_reviews                            0
number_of_reviews_ltm                        0
number_of_reviews_l30d                       0
first_review 

In [124]:
X_imputed

Unnamed: 0,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_listings_count,latitude,longitude,room_type,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,email,phone,work_email,amenity_count,smoke_alarm,kitchen,essentials,hangers,wifi,host_is_superhost_t,host_has_profile_pic_t,host_identity_verified_t,has_availability_t,instant_bookable_t,property_type_Entire home,property_type_Entire rental unit,property_type_Private room in home,property_type_Private room in rental unit,property_type_other,neighbourhood_cleansed_Port Phillip,neighbourhood_cleansed_Stonnington,neighbourhood_cleansed_Yarra,neighbourhood_cleansed_Yarra Ranges,neighbourhood_cleansed_other
0,3724.0,1.00,1.0000,0.9800,2.0,-37.82030,144.99016,4.0,2.0,1.0,1.0,1.0,2.0,14.0,0.0,0.0,0.0,12.0,741.0,37.0,1.0,3711.00,4.88,4.9100,4.9700,4.940,4.9300,4.9300,4.8200,6.11,1.0,1.0,0.0,38.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,5001.0,2.00,1.0000,0.9800,1.0,-37.76606,144.97951,3.0,2.0,1.0,1.0,1.0,4.0,27.0,0.0,12.0,22.0,112.0,169.0,25.0,3.0,3787.00,4.48,4.6400,3.9700,4.720,4.6900,4.6500,4.6000,1.37,1.0,1.0,0.0,57.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4198.0,2.00,1.0000,0.7800,2.0,-37.90546,145.39447,4.0,4.0,2.5,2.0,4.0,2.0,365.0,30.0,60.0,90.0,365.0,8.0,2.0,0.0,2882.00,4.75,4.8800,4.7500,4.880,4.5000,5.0000,4.7500,0.09,1.0,1.0,0.0,21.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4731.0,3.00,0.7500,0.9200,4.0,-37.82163,144.96672,3.0,2.0,2.5,1.0,1.0,1.0,730.0,30.0,60.0,90.0,365.0,2.0,0.0,0.0,4241.00,4.50,4.0000,4.5000,4.000,4.0000,5.0000,4.0000,0.01,1.0,1.0,0.0,13.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4702.0,2.00,1.0000,0.8700,1.0,-38.05725,145.33936,4.0,5.0,1.0,3.0,3.0,1.0,14.0,17.0,21.0,51.0,312.0,214.0,39.0,4.0,4567.00,4.86,4.9100,4.9800,4.910,4.9300,4.9000,4.8700,1.43,1.0,1.0,1.0,49.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3218.0,2.18,0.9895,0.3122,1.0,-37.82025,145.03088,3.0,2.0,1.0,1.0,1.0,1.0,1125.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2504.00,4.50,4.0000,5.0000,5.000,5.0000,4.5000,4.5000,0.02,1.0,1.0,0.0,13.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
9996,1202.0,1.00,0.9719,0.9794,36.0,-37.83624,144.99299,4.0,2.0,1.0,1.0,1.0,1.0,90.0,1.0,1.0,1.0,268.0,42.0,9.0,0.0,1543.00,4.21,4.5000,4.2900,4.740,4.6400,4.7400,4.1900,0.86,1.0,1.0,1.0,40.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9997,3124.0,1.00,1.0000,0.9200,36.0,-37.86326,144.75456,4.0,16.0,3.5,6.0,8.0,2.0,365.0,9.0,39.0,69.0,340.0,0.0,0.0,0.0,1553.19,0.03,2.2754,2.0714,2.772,2.6391,3.0659,2.3577,0.02,1.0,1.0,0.0,42.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9998,1764.0,2.10,0.9673,0.5288,2.0,-37.80913,144.96058,3.0,2.0,0.5,1.0,1.1,1.0,1125.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1217.00,3.00,4.0000,3.0000,2.330,3.3300,4.0000,3.3300,0.08,0.0,1.0,0.0,10.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
# add price back
X_imputed['price'] = df7.price

# create logarithmic price column
X_imputed['log_price'] = np.log(X_imputed['price'])

X_imputed.to_csv("missforest_imputed.csv")

In [126]:
X_imputed

Unnamed: 0,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_listings_count,latitude,longitude,room_type,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,email,phone,work_email,amenity_count,smoke_alarm,kitchen,essentials,hangers,wifi,host_is_superhost_t,host_has_profile_pic_t,host_identity_verified_t,has_availability_t,instant_bookable_t,property_type_Entire home,property_type_Entire rental unit,property_type_Private room in home,property_type_Private room in rental unit,property_type_other,neighbourhood_cleansed_Port Phillip,neighbourhood_cleansed_Stonnington,neighbourhood_cleansed_Yarra,neighbourhood_cleansed_Yarra Ranges,neighbourhood_cleansed_other,price,log_price
0,3724.0,1.00,1.0000,0.9800,2.0,-37.82030,144.99016,4.0,2.0,1.0,1.0,1.0,2.0,14.0,0.0,0.0,0.0,12.0,741.0,37.0,1.0,3711.00,4.88,4.9100,4.9700,4.940,4.9300,4.9300,4.8200,6.11,1.0,1.0,0.0,38.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,132.0,4.882802
1,5001.0,2.00,1.0000,0.9800,1.0,-37.76606,144.97951,3.0,2.0,1.0,1.0,1.0,4.0,27.0,0.0,12.0,22.0,112.0,169.0,25.0,3.0,3787.00,4.48,4.6400,3.9700,4.720,4.6900,4.6500,4.6000,1.37,1.0,1.0,0.0,57.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,39.0,3.663562
2,4198.0,2.00,1.0000,0.7800,2.0,-37.90546,145.39447,4.0,4.0,2.5,2.0,4.0,2.0,365.0,30.0,60.0,90.0,365.0,8.0,2.0,0.0,2882.00,4.75,4.8800,4.7500,4.880,4.5000,5.0000,4.7500,0.09,1.0,1.0,0.0,21.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,270.0,5.598422
3,4731.0,3.00,0.7500,0.9200,4.0,-37.82163,144.96672,3.0,2.0,2.5,1.0,1.0,1.0,730.0,30.0,60.0,90.0,365.0,2.0,0.0,0.0,4241.00,4.50,4.0000,4.5000,4.000,4.0000,5.0000,4.0000,0.01,1.0,1.0,0.0,13.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,6.907755
4,4702.0,2.00,1.0000,0.8700,1.0,-38.05725,145.33936,4.0,5.0,1.0,3.0,3.0,1.0,14.0,17.0,21.0,51.0,312.0,214.0,39.0,4.0,4567.00,4.86,4.9100,4.9800,4.910,4.9300,4.9000,4.8700,1.43,1.0,1.0,1.0,49.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,116.0,4.753590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3218.0,2.18,0.9895,0.3122,1.0,-37.82025,145.03088,3.0,2.0,1.0,1.0,1.0,1.0,1125.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2504.00,4.50,4.0000,5.0000,5.000,5.0000,4.5000,4.5000,0.02,1.0,1.0,0.0,13.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,
9996,1202.0,1.00,0.9719,0.9794,36.0,-37.83624,144.99299,4.0,2.0,1.0,1.0,1.0,1.0,90.0,1.0,1.0,1.0,268.0,42.0,9.0,0.0,1543.00,4.21,4.5000,4.2900,4.740,4.6400,4.7400,4.1900,0.86,1.0,1.0,1.0,40.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,
9997,3124.0,1.00,1.0000,0.9200,36.0,-37.86326,144.75456,4.0,16.0,3.5,6.0,8.0,2.0,365.0,9.0,39.0,69.0,340.0,0.0,0.0,0.0,1553.19,0.03,2.2754,2.0714,2.772,2.6391,3.0659,2.3577,0.02,1.0,1.0,0.0,42.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
9998,1764.0,2.10,0.9673,0.5288,2.0,-37.80913,144.96058,3.0,2.0,0.5,1.0,1.1,1.0,1125.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1217.00,3.00,4.0000,3.0000,2.330,3.3300,4.0000,3.3300,0.08,0.0,1.0,0.0,10.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,


## More Feature Engineering

In [155]:
df11 = X_imputed.copy()

In [156]:
df11

Unnamed: 0,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_listings_count,latitude,longitude,room_type,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,reviews_per_month,email,phone,work_email,amenity_count,smoke_alarm,kitchen,essentials,hangers,wifi,host_is_superhost_t,host_has_profile_pic_t,host_identity_verified_t,has_availability_t,instant_bookable_t,property_type_Entire home,property_type_Entire rental unit,property_type_Private room in home,property_type_Private room in rental unit,property_type_other,neighbourhood_cleansed_Port Phillip,neighbourhood_cleansed_Stonnington,neighbourhood_cleansed_Yarra,neighbourhood_cleansed_Yarra Ranges,neighbourhood_cleansed_other,price,log_price
0,3724.0,1.00,1.0000,0.9800,2.0,-37.82030,144.99016,4.0,2.0,1.0,1.0,1.0,2.0,14.0,0.0,0.0,0.0,12.0,741.0,37.0,1.0,3711.00,4.88,4.9100,4.9700,4.940,4.9300,4.9300,4.8200,6.11,1.0,1.0,0.0,38.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,132.0,4.882802
1,5001.0,2.00,1.0000,0.9800,1.0,-37.76606,144.97951,3.0,2.0,1.0,1.0,1.0,4.0,27.0,0.0,12.0,22.0,112.0,169.0,25.0,3.0,3787.00,4.48,4.6400,3.9700,4.720,4.6900,4.6500,4.6000,1.37,1.0,1.0,0.0,57.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,39.0,3.663562
2,4198.0,2.00,1.0000,0.7800,2.0,-37.90546,145.39447,4.0,4.0,2.5,2.0,4.0,2.0,365.0,30.0,60.0,90.0,365.0,8.0,2.0,0.0,2882.00,4.75,4.8800,4.7500,4.880,4.5000,5.0000,4.7500,0.09,1.0,1.0,0.0,21.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,270.0,5.598422
3,4731.0,3.00,0.7500,0.9200,4.0,-37.82163,144.96672,3.0,2.0,2.5,1.0,1.0,1.0,730.0,30.0,60.0,90.0,365.0,2.0,0.0,0.0,4241.00,4.50,4.0000,4.5000,4.000,4.0000,5.0000,4.0000,0.01,1.0,1.0,0.0,13.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,6.907755
4,4702.0,2.00,1.0000,0.8700,1.0,-38.05725,145.33936,4.0,5.0,1.0,3.0,3.0,1.0,14.0,17.0,21.0,51.0,312.0,214.0,39.0,4.0,4567.00,4.86,4.9100,4.9800,4.910,4.9300,4.9000,4.8700,1.43,1.0,1.0,1.0,49.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,116.0,4.753590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3218.0,2.18,0.9895,0.3122,1.0,-37.82025,145.03088,3.0,2.0,1.0,1.0,1.0,1.0,1125.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2504.00,4.50,4.0000,5.0000,5.000,5.0000,4.5000,4.5000,0.02,1.0,1.0,0.0,13.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,
9996,1202.0,1.00,0.9719,0.9794,36.0,-37.83624,144.99299,4.0,2.0,1.0,1.0,1.0,1.0,90.0,1.0,1.0,1.0,268.0,42.0,9.0,0.0,1543.00,4.21,4.5000,4.2900,4.740,4.6400,4.7400,4.1900,0.86,1.0,1.0,1.0,40.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,
9997,3124.0,1.00,1.0000,0.9200,36.0,-37.86326,144.75456,4.0,16.0,3.5,6.0,8.0,2.0,365.0,9.0,39.0,69.0,340.0,0.0,0.0,0.0,1553.19,0.03,2.2754,2.0714,2.772,2.6391,3.0659,2.3577,0.02,1.0,1.0,0.0,42.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
9998,1764.0,2.10,0.9673,0.5288,2.0,-37.80913,144.96058,3.0,2.0,0.5,1.0,1.1,1.0,1125.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1217.00,3.00,4.0000,3.0000,2.330,3.3300,4.0000,3.3300,0.08,0.0,1.0,0.0,10.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,


### Variable Ratios

#### Re-weighting Review Scores
`review scores` should be higher weighted if these resulted from more reviews as this would be more accurate. We could first account for different timeframes by subtracting first_review from today_date and divide number_of_reviews by this difference to get the amount of reviews per day (could also do month by dividing by 30 to have smaller numbers) since the first review was made. This could then be multiplied by the review_scores to give them more weight if there were more number of reviews per day. Actually, there is the feature reviews_per_month which calculates the difference between first review and today. Hence we can use this to multiply. Doing so might have the following effects:
- we would penalise old listings that haven't been reviewed in a long time (there are some which had their last review in 2012, so their reviews might not be accurate anymore and should get less weight)

Based on feature importance from initial XGBoost models, we will scale `review_scores_location`, `review_scores_rating`, and `review_scores_value`.

In [157]:
columns = ['review_scores_rating', 'review_scores_location', 'review_scores_value']

for f in columns:
    if f in df11.columns:
        new_column = f + "_scaled"
        df11[new_column] = df11[f] * df11.reviews_per_month
        df11.drop(columns=[f], inplace=True)

df11.drop(columns=['reviews_per_month'], inplace=True)
df11

Unnamed: 0,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_listings_count,latitude,longitude,room_type,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,email,phone,work_email,amenity_count,smoke_alarm,kitchen,essentials,hangers,wifi,host_is_superhost_t,host_has_profile_pic_t,host_identity_verified_t,has_availability_t,instant_bookable_t,property_type_Entire home,property_type_Entire rental unit,property_type_Private room in home,property_type_Private room in rental unit,property_type_other,neighbourhood_cleansed_Port Phillip,neighbourhood_cleansed_Stonnington,neighbourhood_cleansed_Yarra,neighbourhood_cleansed_Yarra Ranges,neighbourhood_cleansed_other,price,log_price,review_scores_rating_scaled,review_scores_location_scaled,review_scores_value_scaled
0,3724.0,1.00,1.0000,0.9800,2.0,-37.82030,144.99016,4.0,2.0,1.0,1.0,1.0,2.0,14.0,0.0,0.0,0.0,12.0,741.0,37.0,1.0,3711.00,4.9100,4.9700,4.940,4.9300,1.0,1.0,0.0,38.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,132.0,4.882802,29.8168,30.122300,29.450200
1,5001.0,2.00,1.0000,0.9800,1.0,-37.76606,144.97951,3.0,2.0,1.0,1.0,1.0,4.0,27.0,0.0,12.0,22.0,112.0,169.0,25.0,3.0,3787.00,4.6400,3.9700,4.720,4.6900,1.0,1.0,0.0,57.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,39.0,3.663562,6.1376,6.370500,6.302000
2,4198.0,2.00,1.0000,0.7800,2.0,-37.90546,145.39447,4.0,4.0,2.5,2.0,4.0,2.0,365.0,30.0,60.0,90.0,365.0,8.0,2.0,0.0,2882.00,4.8800,4.7500,4.880,4.5000,1.0,1.0,0.0,21.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,270.0,5.598422,0.4275,0.450000,0.427500
3,4731.0,3.00,0.7500,0.9200,4.0,-37.82163,144.96672,3.0,2.0,2.5,1.0,1.0,1.0,730.0,30.0,60.0,90.0,365.0,2.0,0.0,0.0,4241.00,4.0000,4.5000,4.000,4.0000,1.0,1.0,0.0,13.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,6.907755,0.0450,0.050000,0.040000
4,4702.0,2.00,1.0000,0.8700,1.0,-38.05725,145.33936,4.0,5.0,1.0,3.0,3.0,1.0,14.0,17.0,21.0,51.0,312.0,214.0,39.0,4.0,4567.00,4.9100,4.9800,4.910,4.9300,1.0,1.0,1.0,49.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,116.0,4.753590,6.9498,7.007000,6.964100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3218.0,2.18,0.9895,0.3122,1.0,-37.82025,145.03088,3.0,2.0,1.0,1.0,1.0,1.0,1125.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2504.00,4.0000,5.0000,5.000,5.0000,1.0,1.0,0.0,13.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,,0.0900,0.090000,0.090000
9996,1202.0,1.00,0.9719,0.9794,36.0,-37.83624,144.99299,4.0,2.0,1.0,1.0,1.0,1.0,90.0,1.0,1.0,1.0,268.0,42.0,9.0,0.0,1543.00,4.5000,4.2900,4.740,4.6400,1.0,1.0,1.0,40.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,3.6206,4.076400,3.603400
9997,3124.0,1.00,1.0000,0.9200,36.0,-37.86326,144.75456,4.0,16.0,3.5,6.0,8.0,2.0,365.0,9.0,39.0,69.0,340.0,0.0,0.0,0.0,1553.19,2.2754,2.0714,2.772,2.6391,1.0,1.0,0.0,42.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,0.0006,0.061318,0.047154
9998,1764.0,2.10,0.9673,0.5288,2.0,-37.80913,144.96058,3.0,2.0,0.5,1.0,1.1,1.0,1125.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1217.00,4.0000,3.0000,2.330,3.3300,0.0,1.0,0.0,10.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.2400,0.320000,0.266400


Allow us to capture information on multiple features as a ratio, thereby also reducing dimensionality of the data. We will create the following new features:\
- bathrooms/bedrooms: larger ratio would indicate more comfort --> should equal higher price
- bathrooms/accommodates: larger ratio would indicate more comfort --> should equal higher price
- bedrooms/accommodates : larger ratio would indicate more comfort --> should equal higher price
- beds/accommodates : larger ratio would indicate more comfort --> should equal higher price
- bedrooms/beds: larger ratio would indicate low comfort --> should equal lower price
- maximum_nights/minimum_nights not sure if relevant, but we could analyse. both of these variables were deemed relevant in my XGBoost model (this also is feature selection since we capture both variables at once).
- availability_365 /365: gives a ratio of how booked out the place is over the next year --> lower ratio would indicate high demand  --> should equal higher price
- room type / accommodates as proxy for space per person

#### Bathrooms / Bedrooms

In [158]:
df11['bath_bed_ratio'] = df11.bathrooms / df11.bedrooms

#### Bathrooms / Accommodates

In [159]:
df11['bath_guests_ratio'] = df11.bathrooms / df11.accommodates

#### Bedrooms / Accommodates

In [160]:
df11['bedrooms_guests_ratio'] = df11.bedrooms / df11.accommodates

#### Beds / Accommodates

In [161]:
df11['beds_guests_ratio'] = df11.beds / df11.accommodates

#### Bedrooms / Beds

In [162]:
df11['bedrooms_beds_ratio'] = df11.bedrooms / df11.beds

#### Maximum Nights / Minimum Nights

In [163]:
df11['maxnights_minnights_ratio'] = df11.maximum_nights / df11.minimum_nights

#### Room Type / Accommodates 

In [164]:
df11['roomtype_guests_ratio'] = df11.room_type / df11.accommodates

In [201]:
# show df again and save
df11
df11.to_csv("df11.csv")

### Test new Features Importances without dropping original Features and without outliers

In [166]:
test_ids = df11.index[7000:].values # save IDs for later output
X_train = df11.iloc[:7000]
X_test = df11.iloc[7000:]

In [167]:
# Standardize continuous columns
# Initialize a StandardScaler object for feature standardization
scaler = StandardScaler()

X_train_std = X_train.drop(['log_price', 'price'], axis=1).copy()
X_test_std = X_test.drop(['log_price', 'price'], axis=1).copy()

# Iterate through columns
for column in X_train_std.columns:
    # Check if the column has more than 10 unique values (threshold for categorical variables)
    if X_train_std[column].nunique() > 10:
        # Fit the StandardScaler on the training data
        scaler.fit(X_train_std[column].values.reshape(-1, 1))
        # Apply feature standardization to both training and test data
        X_train_std[column] = scaler.transform(X_train_std[column].values.reshape(-1, 1))
        X_test_std[column] = scaler.transform(X_test_std[column].values.reshape(-1, 1))
        
# put prices back
X_train_std[['log_price', 'price']] = X_train[['log_price', 'price']]
X_test_std[['log_price', 'price']] = X_test[['log_price', 'price']]

In [168]:
def get_outliers(data, threshold=3):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data.values.reshape(-1, 1))
    z_scores = scaled_data.flatten()
    outlier_locations = np.where(np.abs(z_scores) > threshold)[0]
    outlier_values = data[outlier_locations]
    outlier_df = pd.DataFrame({'Outlier': outlier_values})
    return outlier_df

In [169]:
# Remove price outliers
price_outlier = get_outliers(X_train_std['price'], threshold=2)
X_train_std = X_train_std[~X_train_std['price'].isin(price_outlier['Outlier'].values)]
X_train_std.reset_index(drop=True, inplace=True)

In [170]:
y_train = X_train_std['log_price'].iloc[:7000].values
X_train_std = X_train_std.drop(['log_price', 'price'], axis=1)
X_test_std = X_test_std.drop(['log_price', 'price'], axis=1)

In [171]:
model = xgb.XGBRegressor()

feature_names = X_train_std.columns

model.fit(X_train_std, y_train)

importance = model.feature_importances_

df_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importance })

df_importances['Rank'] = df_importances['Importance'].rank(ascending=False)

# Sort the importances by rank
df_importances = df_importances.sort_values(by='Rank')

# Print the feature importances with ranks
print(len(importance))
print(len(feature_names))
df_importances

60
60


Unnamed: 0,Feature,Importance,Rank
8,accommodates,0.254028,1.0
7,room_type,0.179687,2.0
10,bedrooms,0.174557,3.0
42,property_type_Private room in home,0.057314,4.0
49,neighbourhood_cleansed_other,0.037697,5.0
9,bathrooms,0.034607,6.0
59,roomtype_guests_ratio,0.027882,7.0
14,availability_30,0.016865,8.0
39,instant_bookable_t,0.013511,9.0
44,property_type_other,0.012853,10.0


#### XGBoost

In [173]:
# xgboost on all predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std, y_train)


y_pred = xgb.predict(X_test_std)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.34122548543996434


Unnamed: 0,ID,price
0,7000,172.033112
1,7001,140.648758
2,7002,110.486366
3,7003,160.410461
4,7004,148.604355
...,...,...
2995,9995,95.687950
2996,9996,139.338654
2997,9997,770.766235
2998,9998,163.107880


In [175]:
# top 30 features
df_importances_top30 = [x for x in df_importances.Feature][:30]

X_train_std_top30 = X_train_std.copy()
X_test_std_top30 = X_test_std.copy()

for f in X_train_std_top30.columns:
    if f not in df_importances_top30:
        X_train_std_top30.drop(columns=f, inplace=True)
        X_test_std_top30.drop(columns=f, inplace=True)
        
# xgboost on top 30 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top30, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top30, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top30, y_train)

ypred_train = xgb.predict(X_train_std_top30)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_test_std_top30)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3497282783211024
Root Mean Squared Error: 55.767071221746164


Unnamed: 0,ID,price
0,7000,214.0
1,7001,166.0
2,7002,89.0
3,7003,170.0
4,7004,164.0
...,...,...
2995,9995,88.0
2996,9996,156.0
2997,9997,639.0
2998,9998,119.0


In [180]:
# top 15 features
df_importances_top15 = [x for x in df_importances.Feature][:15]

X_train_std_top15 = X_train_std.copy()
X_test_std_top15 = X_test_std.copy()

for f in X_train_std_top15.columns:
    if f not in df_importances_top15:
        X_train_std_top15.drop(columns=f, inplace=True)
        X_test_std_top15.drop(columns=f, inplace=True)
        
# xgboost on top 15 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top15, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top15, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top15, y_train)

ypred_train = xgb.predict(X_train_std_top15)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top15)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top15)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3731078176669536
Root Mean Squared Error: 82.76576923387525


Unnamed: 0,ID,price
0,7000,206.339050
1,7001,161.059860
2,7002,66.624374
3,7003,174.551117
4,7004,178.516312
...,...,...
2995,9995,71.737259
2996,9996,187.560028
2997,9997,463.088867
2998,9998,69.701622


In [181]:
# top 10 features
df_importances_top10 = [x for x in df_importances.Feature][:10]

X_train_std_top10 = X_train_std.copy()
X_test_std_top10 = X_test_std.copy()

for f in X_train_std_top10.columns:
    if f not in df_importances_top10:
        X_train_std_top10.drop(columns=f, inplace=True)
        X_test_std_top10.drop(columns=f, inplace=True)
        
# xgboost on top 10 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top10, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top10, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top10, y_train)

ypred_train = xgb.predict(X_train_std_top10)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top10)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top10)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.4032907625708707
Root Mean Squared Error: 105.43441873858445


Unnamed: 0,ID,price
0,7000,218.025101
1,7001,170.046051
2,7002,52.022713
3,7003,211.393158
4,7004,172.832474
...,...,...
2995,9995,70.724083
2996,9996,132.522202
2997,9997,443.341919
2998,9998,54.599815


#### RandomForest

In [182]:
# on all predictors
rfr = RandomForestRegressor(random_state=42)

rfr.fit(X_train_std, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std, y_train)


y_pred = rfr.predict(X_test_std)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3540751702663731


Unnamed: 0,ID,price
0,7000,198.813792
1,7001,169.654201
2,7002,130.997747
3,7003,177.638626
4,7004,168.811830
...,...,...
2995,9995,88.456658
2996,9996,147.616131
2997,9997,506.967327
2998,9998,126.102810


In [183]:
# on top 30 predictors
rfr = XGBRegressor(random_state=42)

rfr.fit(X_train_std_top30, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std_top30, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std_top30, y_train)

ypred_train = rfr.predict(X_train_std_top30)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = rfr.predict(X_test_std_top30)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3497282783211024
Root Mean Squared Error: 55.767071221746164


Unnamed: 0,ID,price
0,7000,214.0
1,7001,166.0
2,7002,89.0
3,7003,170.0
4,7004,164.0
...,...,...
2995,9995,88.0
2996,9996,156.0
2997,9997,639.0
2998,9998,119.0


In [184]:
# on top 15 predictors
rfr = XGBRegressor(random_state=42)

rfr.fit(X_train_std_top15, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std_top15, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std_top15, y_train)

ypred_train = rfr.predict(X_train_std_top15)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = rfr.predict(X_train_std_top15)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = rfr.predict(X_test_std_top15)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3731078176669536
Root Mean Squared Error: 82.76576923387525


Unnamed: 0,ID,price
0,7000,206.339050
1,7001,161.059860
2,7002,66.624374
3,7003,174.551117
4,7004,178.516312
...,...,...
2995,9995,71.737259
2996,9996,187.560028
2997,9997,463.088867
2998,9998,69.701622


In [181]:
# on top 10 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top10, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top10, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top10, y_train)

ypred_train = xgb.predict(X_train_std_top10)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top10)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top10)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.4032907625708707
Root Mean Squared Error: 105.43441873858445


Unnamed: 0,ID,price
0,7000,218.025101
1,7001,170.046051
2,7002,52.022713
3,7003,211.393158
4,7004,172.832474
...,...,...
2995,9995,70.724083
2996,9996,132.522202
2997,9997,443.341919
2998,9998,54.599815


### Test new Features Importances with dropping original Features and without outliers

In [192]:
test_ids = df11.index[7000:].values # save IDs for later output
X_train = df11.drop(columns=['accommodates', 'maximum_nights', 'minimum_nights', 'beds', 'number_of_reviews']).iloc[:7000]
X_test = df11.drop(columns=['accommodates', 'maximum_nights', 'minimum_nights', 'beds', 'number_of_reviews']).iloc[7000:]

In [193]:
# Standardize continuous columns
# Initialize a StandardScaler object for feature standardization
scaler = StandardScaler()

X_train_std = X_train.drop(['log_price', 'price'], axis=1).copy()
X_test_std = X_test.drop(['log_price', 'price'], axis=1).copy()

# Iterate through columns
for column in X_train_std.columns:
    # Check if the column has more than 10 unique values (threshold for categorical variables)
    if X_train_std[column].nunique() > 10:
        # Fit the StandardScaler on the training data
        scaler.fit(X_train_std[column].values.reshape(-1, 1))
        # Apply feature standardization to both training and test data
        X_train_std[column] = scaler.transform(X_train_std[column].values.reshape(-1, 1))
        X_test_std[column] = scaler.transform(X_test_std[column].values.reshape(-1, 1))
        
# put prices back
X_train_std[['log_price', 'price']] = X_train[['log_price', 'price']]
X_test_std[['log_price', 'price']] = X_test[['log_price', 'price']]

In [194]:
# Remove price outliers
price_outlier = get_outliers(X_train_std['price'], threshold=2)
X_train_std = X_train_std[~X_train_std['price'].isin(price_outlier['Outlier'].values)]
X_train_std.reset_index(drop=True, inplace=True)

In [195]:
y_train = X_train_std['log_price'].iloc[:7000].values
X_train_std = X_train_std.drop(['log_price', 'price'], axis=1)
X_test_std = X_test_std.drop(['log_price', 'price'], axis=1)

In [196]:
model = XGBRegressor()

feature_names = X_train_std.columns

model.fit(X_train_std, y_train)

importance = model.feature_importances_

df_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importance })

df_importances['Rank'] = df_importances['Importance'].rank(ascending=False)

# Sort the importances by rank
df_importances = df_importances.sort_values(by='Rank')

# Print the feature importances with ranks
print(len(importance))
print(len(feature_names))
df_importances

55
55


Unnamed: 0,Feature,Importance,Rank
9,bedrooms,0.338392,1.0
7,room_type,0.141061,2.0
54,roomtype_guests_ratio,0.126645,3.0
8,bathrooms,0.048369,4.0
37,property_type_Private room in home,0.045569,5.0
44,neighbourhood_cleansed_other,0.040256,6.0
10,availability_30,0.017963,7.0
34,instant_bookable_t,0.014433,8.0
47,review_scores_value_scaled,0.010343,9.0
32,host_identity_verified_t,0.01022,10.0


#### XGBoost

In [197]:
# xgboost on all predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std, y_train)


y_pred = xgb.predict(X_test_std)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3458281885750061


Unnamed: 0,ID,price
0,7000,169.921799
1,7001,149.479141
2,7002,109.312057
3,7003,195.346054
4,7004,181.509293
...,...,...
2995,9995,90.054008
2996,9996,165.235748
2997,9997,506.504578
2998,9998,89.082420


In [198]:
# top 30 features
df_importances_top30 = [x for x in df_importances.Feature][:30]

X_train_std_top30 = X_train_std.copy()
X_test_std_top30 = X_test_std.copy()

for f in X_train_std_top30.columns:
    if f not in df_importances_top30:
        X_train_std_top30.drop(columns=f, inplace=True)
        X_test_std_top30.drop(columns=f, inplace=True)
        
# xgboost on top 30 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top30, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top30, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top30, y_train)

ypred_train = xgb.predict(X_train_std_top30)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_test_std_top30)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3529845390203185
Root Mean Squared Error: 53.73930996392197


Unnamed: 0,ID,price
0,7000,163.0
1,7001,145.0
2,7002,58.0
3,7003,161.0
4,7004,164.0
...,...,...
2995,9995,110.0
2996,9996,160.0
2997,9997,452.0
2998,9998,102.0


In [199]:
# top 15 features
df_importances_top15 = [x for x in df_importances.Feature][:15]

X_train_std_top15 = X_train_std.copy()
X_test_std_top15 = X_test_std.copy()

for f in X_train_std_top15.columns:
    if f not in df_importances_top15:
        X_train_std_top15.drop(columns=f, inplace=True)
        X_test_std_top15.drop(columns=f, inplace=True)
        
# xgboost on top 15 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top15, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top15, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top15, y_train)

ypred_train = xgb.predict(X_train_std_top15)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top15)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top15)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3973189535680781
Root Mean Squared Error: 86.58316157854772


Unnamed: 0,ID,price
0,7000,194.483459
1,7001,191.338394
2,7002,59.721870
3,7003,163.009918
4,7004,185.632019
...,...,...
2995,9995,74.733978
2996,9996,174.820999
2997,9997,788.041992
2998,9998,64.106506


In [200]:
# top 10 features
df_importances_top10 = [x for x in df_importances.Feature][:10]

X_train_std_top10 = X_train_std.copy()
X_test_std_top10 = X_test_std.copy()

for f in X_train_std_top10.columns:
    if f not in df_importances_top10:
        X_train_std_top10.drop(columns=f, inplace=True)
        X_test_std_top10.drop(columns=f, inplace=True)
        
# xgboost on top 10 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top10, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top10, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top10, y_train)

ypred_train = xgb.predict(X_train_std_top10)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top10)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top10)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.40503444657183246
Root Mean Squared Error: 87.83138462285274


Unnamed: 0,ID,price
0,7000,180.944565
1,7001,175.679291
2,7002,58.403881
3,7003,193.497391
4,7004,170.860550
...,...,...
2995,9995,59.160019
2996,9996,166.807800
2997,9997,427.999512
2998,9998,88.153595


#### RandomForest

In [None]:
# on all predictors
rfr = RandomForestRegressor(random_state=42)

rfr.fit(X_train_std, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std, y_train)


y_pred = rfr.predict(X_test_std)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3540751702663731


Unnamed: 0,ID,price
0,7000,198.813792
1,7001,169.654201
2,7002,130.997747
3,7003,177.638626
4,7004,168.811830
...,...,...
2995,9995,88.456658
2996,9996,147.616131
2997,9997,506.967327
2998,9998,126.102810


In [None]:
# on top 30 predictors
rfr = XGBRegressor(random_state=42)

rfr.fit(X_train_std_top30, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std_top30, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std_top30, y_train)

ypred_train = rfr.predict(X_train_std_top30)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = rfr.predict(X_test_std_top30)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3497282783211024
Root Mean Squared Error: 55.767071221746164


Unnamed: 0,ID,price
0,7000,214.0
1,7001,166.0
2,7002,89.0
3,7003,170.0
4,7004,164.0
...,...,...
2995,9995,88.0
2996,9996,156.0
2997,9997,639.0
2998,9998,119.0


In [None]:
# on top 15 predictors
rfr = XGBRegressor(random_state=42)

rfr.fit(X_train_std_top15, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std_top15, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std_top15, y_train)

ypred_train = rfr.predict(X_train_std_top15)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = rfr.predict(X_train_std_top15)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = rfr.predict(X_test_std_top15)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3731078176669536
Root Mean Squared Error: 82.76576923387525


Unnamed: 0,ID,price
0,7000,206.339050
1,7001,161.059860
2,7002,66.624374
3,7003,174.551117
4,7004,178.516312
...,...,...
2995,9995,71.737259
2996,9996,187.560028
2997,9997,463.088867
2998,9998,69.701622


In [None]:
# on top 10 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top10, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top10, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top10, y_train)

ypred_train = xgb.predict(X_train_std_top10)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top10)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top10)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.4032907625708707
Root Mean Squared Error: 105.43441873858445


Unnamed: 0,ID,price
0,7000,218.025101
1,7001,170.046051
2,7002,52.022713
3,7003,211.393158
4,7004,172.832474
...,...,...
2995,9995,70.724083
2996,9996,132.522202
2997,9997,443.341919
2998,9998,54.599815


### Test new Features Importances with dropping original Features including outliers

In [None]:
test_ids = df11.index[7000:].values # save IDs for later output
X_train = df11.iloc[:7000]
X_test = df11.iloc[7000:]

In [None]:
# Standardize continuous columns
# Initialize a StandardScaler object for feature standardization
scaler = StandardScaler()

X_train_std = X_train.drop(['log_price', 'price'], axis=1).copy()
X_test_std = X_test.drop(['log_price', 'price'], axis=1).copy()

# Iterate through columns
for column in X_train_std.columns:
    # Check if the column has more than 10 unique values (threshold for categorical variables)
    if X_train_std[column].nunique() > 10:
        # Fit the StandardScaler on the training data
        scaler.fit(X_train_std[column].values.reshape(-1, 1))
        # Apply feature standardization to both training and test data
        X_train_std[column] = scaler.transform(X_train_std[column].values.reshape(-1, 1))
        X_test_std[column] = scaler.transform(X_test_std[column].values.reshape(-1, 1))
        
# put prices back
X_train_std[['log_price', 'price']] = X_train[['log_price', 'price']]
X_test_std[['log_price', 'price']] = X_test[['log_price', 'price']]

In [None]:
def get_outliers(data, threshold=3):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data.values.reshape(-1, 1))
    z_scores = scaled_data.flatten()
    outlier_locations = np.where(np.abs(z_scores) > threshold)[0]
    outlier_values = data[outlier_locations]
    outlier_df = pd.DataFrame({'Outlier': outlier_values})
    return outlier_df

In [None]:
# Remove price outliers
price_outlier = get_outliers(X_train_std['price'], threshold=2)
X_train_std = X_train_std[~X_train_std['price'].isin(price_outlier['Outlier'].values)]
X_train_std.reset_index(drop=True, inplace=True)

In [None]:
y_train = X_train_std['log_price'].iloc[:7000].values
X_train_std = X_train_std.drop(['log_price', 'price'], axis=1)
X_test_std = X_test_std.drop(['log_price', 'price'], axis=1)

In [None]:
model = xgb.XGBRegressor()

feature_names = X_train_std.columns

model.fit(X_train_std, y_train)

importance = model.feature_importances_

df_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importance })

df_importances['Rank'] = df_importances['Importance'].rank(ascending=False)

# Sort the importances by rank
df_importances = df_importances.sort_values(by='Rank')

# Print the feature importances with ranks
print(len(importance))
print(len(feature_names))
df_importances

60
60


Unnamed: 0,Feature,Importance,Rank
8,accommodates,0.254028,1.0
7,room_type,0.179687,2.0
10,bedrooms,0.174557,3.0
42,property_type_Private room in home,0.057314,4.0
49,neighbourhood_cleansed_other,0.037697,5.0
9,bathrooms,0.034607,6.0
59,roomtype_guests_ratio,0.027882,7.0
14,availability_30,0.016865,8.0
39,instant_bookable_t,0.013511,9.0
44,property_type_other,0.012853,10.0


#### XGBoost

In [None]:
# xgboost on all predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std, y_train)


y_pred = xgb.predict(X_test_std)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.34122548543996434


Unnamed: 0,ID,price
0,7000,172.033112
1,7001,140.648758
2,7002,110.486366
3,7003,160.410461
4,7004,148.604355
...,...,...
2995,9995,95.687950
2996,9996,139.338654
2997,9997,770.766235
2998,9998,163.107880


In [None]:
# top 30 features
df_importances_top30 = [x for x in df_importances.Feature][:30]

X_train_std_top30 = X_train_std.copy()
X_test_std_top30 = X_test_std.copy()

for f in X_train_std_top30.columns:
    if f not in df_importances_top30:
        X_train_std_top30.drop(columns=f, inplace=True)
        X_test_std_top30.drop(columns=f, inplace=True)
        
# xgboost on top 30 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top30, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top30, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top30, y_train)

ypred_train = xgb.predict(X_train_std_top30)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_test_std_top30)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3497282783211024
Root Mean Squared Error: 55.767071221746164


Unnamed: 0,ID,price
0,7000,214.0
1,7001,166.0
2,7002,89.0
3,7003,170.0
4,7004,164.0
...,...,...
2995,9995,88.0
2996,9996,156.0
2997,9997,639.0
2998,9998,119.0


In [None]:
# top 15 features
df_importances_top15 = [x for x in df_importances.Feature][:15]

X_train_std_top15 = X_train_std.copy()
X_test_std_top15 = X_test_std.copy()

for f in X_train_std_top15.columns:
    if f not in df_importances_top15:
        X_train_std_top15.drop(columns=f, inplace=True)
        X_test_std_top15.drop(columns=f, inplace=True)
        
# xgboost on top 15 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top15, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top15, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top15, y_train)

ypred_train = xgb.predict(X_train_std_top15)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top15)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top15)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3731078176669536
Root Mean Squared Error: 82.76576923387525


Unnamed: 0,ID,price
0,7000,206.339050
1,7001,161.059860
2,7002,66.624374
3,7003,174.551117
4,7004,178.516312
...,...,...
2995,9995,71.737259
2996,9996,187.560028
2997,9997,463.088867
2998,9998,69.701622


In [None]:
# top 10 features
df_importances_top10 = [x for x in df_importances.Feature][:10]

X_train_std_top10 = X_train_std.copy()
X_test_std_top10 = X_test_std.copy()

for f in X_train_std_top10.columns:
    if f not in df_importances_top10:
        X_train_std_top10.drop(columns=f, inplace=True)
        X_test_std_top10.drop(columns=f, inplace=True)
        
# xgboost on top 10 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top10, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top10, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top10, y_train)

ypred_train = xgb.predict(X_train_std_top10)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top10)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top10)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.4032907625708707
Root Mean Squared Error: 105.43441873858445


Unnamed: 0,ID,price
0,7000,218.025101
1,7001,170.046051
2,7002,52.022713
3,7003,211.393158
4,7004,172.832474
...,...,...
2995,9995,70.724083
2996,9996,132.522202
2997,9997,443.341919
2998,9998,54.599815


#### RandomForest

In [None]:
# on all predictors
rfr = RandomForestRegressor(random_state=42)

rfr.fit(X_train_std, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std, y_train)


y_pred = rfr.predict(X_test_std)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3540751702663731


Unnamed: 0,ID,price
0,7000,198.813792
1,7001,169.654201
2,7002,130.997747
3,7003,177.638626
4,7004,168.811830
...,...,...
2995,9995,88.456658
2996,9996,147.616131
2997,9997,506.967327
2998,9998,126.102810


In [None]:
# on top 30 predictors
rfr = XGBRegressor(random_state=42)

rfr.fit(X_train_std_top30, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std_top30, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std_top30, y_train)

ypred_train = rfr.predict(X_train_std_top30)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = rfr.predict(X_test_std_top30)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3497282783211024
Root Mean Squared Error: 55.767071221746164


Unnamed: 0,ID,price
0,7000,214.0
1,7001,166.0
2,7002,89.0
3,7003,170.0
4,7004,164.0
...,...,...
2995,9995,88.0
2996,9996,156.0
2997,9997,639.0
2998,9998,119.0


In [None]:
# on top 15 predictors
rfr = XGBRegressor(random_state=42)

rfr.fit(X_train_std_top15, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(rfr, X_train_std_top15, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

rfr.fit(X_train_std_top15, y_train)

ypred_train = rfr.predict(X_train_std_top15)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = rfr.predict(X_train_std_top15)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = rfr.predict(X_test_std_top15)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("RandomForestPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.3731078176669536
Root Mean Squared Error: 82.76576923387525


Unnamed: 0,ID,price
0,7000,206.339050
1,7001,161.059860
2,7002,66.624374
3,7003,174.551117
4,7004,178.516312
...,...,...
2995,9995,71.737259
2996,9996,187.560028
2997,9997,463.088867
2998,9998,69.701622


In [None]:
# on top 10 predictors
xgb = XGBRegressor(random_state=42)

xgb.fit(X_train_std_top10, y_train) # fit regressor

n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

cv_scores = -cross_val_score(xgb, X_train_std_top10, y_train, cv=kf, scoring='neg_mean_squared_error')

mse = np.mean(cv_scores)
rmse = np.sqrt(mse)
print("CV Root Mean Squared Error:", rmse)

xgb.fit(X_train_std_top10, y_train)

ypred_train = xgb.predict(X_train_std_top10)
ypred_train_dollar = np.round(np.exp(ypred_train))
y_train_dollar = np.round(np.exp(y_train))

y_pred = xgb.predict(X_train_std_top10)
y_pred_dollar = np.round(np.exp(y_pred))

rmse = np.sqrt(mean_squared_error(y_train_dollar, ypred_train_dollar))
print(f'Root Mean Squared Error: {rmse}')

y_pred = xgb.predict(X_test_std_top10)
y_pred_dollar = np.exp(y_pred)

# save predictions to csv
out = pd.DataFrame({"ID":test_ids, "price":y_pred_dollar})
out.to_csv("XGBRegressorPredictions.csv", index=False, header=True)
out

CV Root Mean Squared Error: 0.4032907625708707
Root Mean Squared Error: 105.43441873858445


Unnamed: 0,ID,price
0,7000,218.025101
1,7001,170.046051
2,7002,52.022713
3,7003,211.393158
4,7004,172.832474
...,...,...
2995,9995,70.724083
2996,9996,132.522202
2997,9997,443.341919
2998,9998,54.599815
