## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
## Base models
import statsmodels.api as sm
from sklearn.linear_model import Lasso,Ridge,LassoCV,RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

In [3]:
## Ensemble models
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [4]:
data=pd.read_csv(r"C:\Users\Akshaya\Desktop\python-great lakes\CAPSTONE\final_data_used-EDA.csv")

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,host_is_superhost,host_identity_verified,zipcode,latitude,longitude,is_location_exact,property_type,room_type,accommodates,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,No_of_amenities
0,0,1,1,10119,52.534537,13.402557,f,Guesthouse,Entire home/apt,3,...,93.0,10.0,9.0,10.0,10.0,10.0,9.0,0,strict_14_with_grace_period,28
1,1,0,1,10437,52.548513,13.404553,t,Apartment,Private room,2,...,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0,flexible,10
2,2,0,1,10405,52.534996,13.417579,t,Apartment,Entire home/apt,4,...,92.0,9.0,9.0,9.0,9.0,10.0,9.0,1,strict_14_with_grace_period,19
3,3,0,1,10777,52.498855,13.349065,t,Apartment,Private room,2,...,88.0,9.0,9.0,9.0,10.0,9.0,9.0,0,strict_14_with_grace_period,28
4,4,1,1,10437,52.543157,13.415091,t,Apartment,Private room,2,...,96.0,10.0,10.0,10.0,10.0,10.0,9.0,0,moderate,13


In [6]:
data1=pd.get_dummies(data,columns=['zipcode','property_type', 'room_type','bed_type','cancellation_policy'],drop_first=True)

In [7]:
data1.drop('amenities',axis=1,inplace=True)

In [8]:
data1['is_location_exact'] = data1['is_location_exact'].map({'f':0,'t':1})

## Splitting the data

In [9]:
y=data1['price']
X=data1.drop('price',axis=1)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## Base Models

In [11]:
regressors = [
    LinearRegression(fit_intercept=True),
    DecisionTreeRegressor(max_depth= 4, max_features= 'auto', min_samples_leaf= 1,min_samples_split= 5),
    RandomForestRegressor(n_estimators=100),
    KNeighborsRegressor(n_neighbors= 11, weights= 'uniform')]

In [12]:
for i in regressors:
    
    model = i
    model.fit(X_train, y_train)
    print(i,"\n")
    print('train score', model.score(X_train,y_train))
    print('test score', model.score(X_test,y_test))
    y_pred_train= model.predict(X_train)
    y_pred_test= model.predict(X_test)
    rmse_train=np.sqrt(mean_squared_error(y_train,y_pred_train))
    rmse_test=np.sqrt(mean_squared_error(y_test,y_pred_test))
    print('rmse_train:',rmse_train)
    print('rmse_test:',rmse_test)
    print('***************************************')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) 

train score 0.313385873715357
test score 0.40484985765858406
rmse_train: 171.83620325329724
rmse_test: 172.04925364515606
***************************************
DecisionTreeRegressor(criterion='mse', max_depth=4, max_features='auto',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best') 

train score 0.7405150963442103
test score 0.7607471421078947
rmse_train: 105.63658413720022
rmse_test: 109.08576689473712
***************************************
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_s

The models suffer from either bias or variance error. Hence we proceed to ensemble them.

## Ensembling the base models 

In [14]:
## Bagging
for i in regressors:
    
    model = BaggingRegressor(base_estimator=i)
    model.fit(X_train, y_train)
    print(i,"\n")
    print('train score', model.score(X_train,y_train))
    print('test score', model.score(X_test,y_test))
    y_pred_train= model.predict(X_train)
    y_pred_test= model.predict(X_test)
    rmse_train=np.sqrt(mean_squared_error(y_train,y_pred_train))
    rmse_test=np.sqrt(mean_squared_error(y_test,y_pred_test))
    print('rmse_train:',rmse_train)
    print('rmse_test:',rmse_test)
    print('***************************************')

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) 

train score 0.3097989747642954
test score 0.40021702095160183
rmse_train: 172.28445810155458
rmse_test: 172.71759838395644
***************************************
DecisionTreeRegressor(criterion='mse', max_depth=4, max_features='auto',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best') 

train score 0.7579959748823537
test score 0.7533386419906043
rmse_train: 102.01630650961783
rmse_test: 110.76181945684768
***************************************
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_

Bagging helps to minimise the variance error of the models, the variance in random forest model has been reduced, but to keep in mind random forest is already bagged, hence an external bagging may not be always advisable.

In [16]:
## Ada-boost regressor
for i in regressors:
    
    model = AdaBoostRegressor(base_estimator=i, random_state=0) 
    model.fit(X_train, y_train)
    print(i,"\n")
    print('train score', model.score(X_train,y_train))
    print('test score', model.score(X_test,y_test))
    y_pred_train= model.predict(X_train)
    y_pred_test= model.predict(X_test)
    rmse_train=np.sqrt(mean_squared_error(y_train,y_pred_train))
    rmse_test=np.sqrt(mean_squared_error(y_test,y_pred_test))
    print('rmse_train:',rmse_train)
    print('rmse_test:',rmse_test)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) 

train score -17.3230524024292
test score -16.08517210766211
rmse_train: 887.6815900981265
rmse_test: 921.8265261909281
DecisionTreeRegressor(criterion='mse', max_depth=4, max_features='auto',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=5, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best') 

train score -2.690281834751203
test score -2.4006156092756816
rmse_train: 398.3713238890769
rmse_test: 411.2617261939069
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0

Boosting helps reduce bias error, the models random forest and kNN have performed well on train sets compared to the non-boost models, but variance is also increasing.

In [17]:
# Gradient boost regressor

model= GradientBoostingRegressor(max_depth=7, max_features= 'sqrt', min_samples_leaf=1, min_samples_split= 2)
model.fit(X_train, y_train)

print('train score', model.score(X_train,y_train))
print('test score', model.score(X_test,y_test))
y_pred_train= model.predict(X_train)
y_pred_test= model.predict(X_test)
rmse_train=np.sqrt(mean_squared_error(y_train,y_pred_train))
rmse_test=np.sqrt(mean_squared_error(y_test,y_pred_test))
print('rmse_train:',rmse_train)
print('rmse_test:',rmse_test)

train score 0.9397619261396146
test score 0.7323971609814497
rmse_train: 50.89719393995624
rmse_test: 115.3678715682833


Gradient boost has helped to improve the performance of decision trees, but the model still suffers from variance error.


Conclusion:
Decision Tree model has been consistent among all the models, but it's performance is not enough. The Random Forest and Gradient Boost have better performances but suffer from lack consistency i.e., suffer with variance. By tuning the hyperparameters we may achieve better results. Other boosting methods may also be applied to see if there is an improvement in the performance.