In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc
from sklearn.model_selection import KFold

import statsmodels.formula.api as smf
import statsmodels.api as sm


from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier,BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier, GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import itertools as it
from pyearth import Earth

#Libraries for visualizing trees
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus
import time as time

# Data wrangling

In [2]:
data = pd.read_csv('data6.csv')

In [3]:
data.dropna(axis = 0, inplace = True)

In [4]:
data = pd.get_dummies(data)

In [5]:
X = data.drop(columns = ['y', 'review_scores_rating'])
y = data['y']

In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, train_size=0.8,random_state=1)

# Classification

In [154]:
#Function to compute confusion matrix and prediction accuracy on test/train data
def confusion_matrix_data(data,actual_values,model,cutoff=0.5):
#Predict the values using the Logit model
    pred_values = model.predict_proba(data)[:,1]
# Specify the bins
    bins=np.array([0,cutoff,1])
#Confusion matrix
    cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]
    cm_df = pd.DataFrame(cm)
    cm_df.columns = ['Predicted 0','Predicted 1']
    cm_df = cm_df.rename(index={0: 'Actual 0',1:'Actual 1'})
# Calculate the accuracy
    accuracy = 100*(cm[0,0]+cm[1,1])/cm.sum()
    fnr = 100*(cm[1,0])/(cm[1,0]+cm[1,1])
    precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
    fpr = 100*(cm[0,1])/(cm[0,0]+cm[0,1])
    tpr = 100*(cm[1,1])/(cm[1,0]+cm[1,1])
    print("Accuracy = ", accuracy)
    print("Precision = ", precision)
    print("FNR = ", fnr)
    print("FPR = ", fpr)
    print("TPR or Recall = ", tpr)
    print("Confusion matrix = \n", cm_df)
    return (" ")

# Base Non Linear Model - Decision Tree

In [9]:
model = DecisionTreeClassifier(random_state=1)

model.fit(Xtrain, ytrain)

DecisionTreeClassifier(random_state=1)

In [10]:
print(confusion_matrix_data(Xtrain,ytrain,model,cutoff = 0.5))

Accuracy =  100.0
Precision =  100.0
FNR =  0.0
FPR =  0.0
TPR or Recall =  100.0
Confusion matrix = 
           Predicted 0  Predicted 1
Actual 0       1536.0          0.0
Actual 1          0.0       1540.0
 


In [11]:
print(confusion_matrix_data(Xtest,ytest,model,cutoff = 0.5))

Accuracy =  65.32467532467533
Precision =  63.97849462365591
FNR =  35.84905660377358
FPR =  33.583959899749374
TPR or Recall =  64.15094339622641
Confusion matrix = 
           Predicted 0  Predicted 1
Actual 0        265.0        134.0
Actual 1        133.0        238.0
 


# Bagging

In [16]:
n_samples = Xtrain.shape[0]
n_features = Xtrain.shape[1]

params = {'n_estimators': [150,200,250],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
bagging_classifier_grid = GridSearchCV(BaggingClassifier(random_state=1, n_jobs=-1), 
                                      param_grid =params, cv=cv, n_jobs=-1, verbose=1)
bagging_classifier_grid.fit(Xtrain, ytrain)

print('Train accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(Xtrain, ytrain))
print('Test accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(Xtest, ytest))
print('Best accuracy Through Grid Search : %.3f'%bagging_classifier_grid.best_score_)
print('Best Parameters : ',bagging_classifier_grid.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Train accuracy : 0.966
Test accuracy : 0.748
Best accuracy Through Grid Search : 0.731
Best Parameters :  {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 200}


In [7]:
model = BaggingClassifier(n_estimators=200,
                          random_state=1,
                          max_features=1.0,
                          max_samples=0.5,
                          n_jobs=-1,
                          bootstrap=False,
                          bootstrap_features=False).fit(Xtrain, ytrain)

In [9]:
feature_importances = np.mean([
    tree.feature_importances_ for tree in model.estimators_
], axis=0)

importances = pd.concat([pd.Series(X.columns), pd.Series(feature_importances)], axis = 1)

importances.sort_values(by=1, ascending = False)

Unnamed: 0,0,1
2,host_is_superhost,0.162003
16,number_of_reviews,0.127762
6,price,0.078815
20,reviews_per_month,0.068303
15,availability_365,0.062974
...,...,...
73,property_type_Entire loft,0.001105
85,room_type_Shared room,0.000271
80,property_type_Room in boutique hotel,0.000203
83,room_type_Hotel room,0.000007


# Random Forest

In [32]:
params = {'n_estimators': [300, 500, 700],
          'max_features': range(1,6),
         }

param_list=list(it.product(*(params[Name] for Name in list(params.keys()))))
accuracy = [0]*len(param_list)

i=0
for pr in param_list:
    model = RandomForestClassifier(random_state=1,oob_score=True,verbose=False,n_estimators = pr[0],
                                  max_features=pr[1], n_jobs=-1).fit(Xtrain,ytrain)
    oob_pred = model.oob_decision_function_[:,1]
    bins=np.array([0,0.5,1])
    cm = np.histogram2d(ytrain, oob_pred, bins=bins)[0]
    accuracy = 100*(cm[0,0]+cm[1,1])/cm.sum()
    i=i+1
    
print("max accuracy = ", np.max(accuracy))
print("params= ", param_list[np.argmax(accuracy)])

max accuracy =  73.17945383615084
params=  (300, 1)


In [52]:
model = RandomForestClassifier(random_state=1,
                               n_jobs=-1,
                               max_features=1,
                               n_estimators=300).fit(Xtrain, ytrain)

In [54]:
feature_importances = np.mean([
    tree.feature_importances_ for tree in model.estimators_
], axis=0)

importances = pd.concat([pd.Series(X.columns), pd.Series(feature_importances)], axis = 1)

importances.sort_values(by=1, ascending = False)

Unnamed: 0,0,1
6,price,0.052431
20,reviews_per_month,0.051812
16,number_of_reviews,0.049996
15,availability_365,0.048253
2,host_is_superhost,0.047360
...,...,...
76,property_type_Entire serviced apartment,0.001239
80,property_type_Room in boutique hotel,0.001144
85,room_type_Shared room,0.000988
83,room_type_Hotel room,0.000842


# AdaBoost

In [44]:
model = AdaBoostClassifier(random_state=1)
grid = dict()
grid['n_estimators'] = [10, 50, 100,200,500]
grid['learning_rate'] = [0.0001, 0.001, 0.01,0.1, 1.0]
grid['base_estimator'] = [DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=4),DecisionTreeClassifier(max_depth=5)]
# define the evaluation procedure
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,scoring='accuracy',refit='accuracy')
# execute the grid search
grid_result = grid_search.fit(Xtrain, ytrain)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
# print("%f (%f) with: %r" % (mean, stdev, param)

Best: 0.735698 using {'base_estimator': DecisionTreeClassifier(max_depth=4), 'learning_rate': 0.01, 'n_estimators': 200}


In [55]:
#Model based on the optimal parameters
model = AdaBoostClassifier(random_state=1,
                           base_estimator=DecisionTreeClassifier(max_depth=4),
                           learning_rate=0.01,
                           n_estimators=200).fit(Xtrain,ytrain)

# Gradient Boosting

In [64]:
model = GradientBoostingClassifier(random_state=1)
grid = dict()
grid['n_estimators'] = [10, 50, 100,200,500]
grid['learning_rate'] = [0.0001, 0.001, 0.01,0.1, 1.0]
grid['max_depth'] = [1,2,3,4,5]
grid['subsample'] = [0.5,1.0]
# define the evaluation procedure
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# define the grid search procedure
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',refit='accuracy')
# execute the grid search
grid_result = grid_search.fit(Xtrain, ytrain)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#    print("%f (%f) with: %r" % (mean, stdev, param)

Best: 0.741548 using {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.5}


In [65]:
#Model based on the optimal parameters
model = GradientBoostingClassifier(random_state=1,
                                   max_depth=5,
                                   learning_rate=0.01,
                                   subsample=0.5,
                                   n_estimators=500).fit(Xtrain,ytrain)

# XGBoost

In [72]:
param_grid = {'n_estimators':[100,200,300],
                'max_depth': [6,7,8],
              'learning_rate': [0.01,0.1],
               'gamma': [0.1,0.25],
               'reg_lambda':[0,0.01],
                'scale_pos_weight':[1.5,1.75]}

cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
optimal_params = GridSearchCV(estimator=xgb.XGBClassifier(objective = 'binary:logistic',random_state=1,
                                                         use_label_encoder=False),
                             param_grid = param_grid,
                             scoring = 'accuracy',
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv)
optimal_params.fit(Xtrain,ytrain)
print(optimal_params.best_params_,optimal_params.best_score_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
{'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 0, 'scale_pos_weight': 1.5} 0.7389473128497519


In [75]:
model = xgb.XGBClassifier(objective = 'binary:logistic',
                           random_state=1,
                           gamma=0.1,
                           learning_rate = 0.1,
                           max_depth=6,
                           n_estimators = 100,
                           reg_lambda = 0,
                           scale_pos_weight=1.5,
                           use_label_encoder=False,
                           eval_metric='error')
model.fit(Xtrain,ytrain)
model.score(Xtest,ytest)

0.7376623376623377

# Ensemble Model

## Finding accuracies of the 5 models

In [77]:
#Bagging
model1 = BaggingClassifier(n_estimators=200,
                          random_state=1,
                          max_features=1.0,
                          max_samples=0.5,
                          n_jobs=-1,
                          bootstrap=False,
                          bootstrap_features=False).fit(Xtrain, ytrain)
test_accuracy1=model1.score(Xtest,ytest) #Returns the classification accuracy of the model on test data

#Random forest
model2 = RandomForestClassifier(random_state=1,
                               n_jobs=-1,
                               max_features=1,
                               n_estimators=300).fit(Xtrain, ytrain)
test_accuracy2=model2.score(Xtest,ytest) #Returns the classification accuracy of the model on test data
    
#Ada boost
model3 = AdaBoostClassifier(random_state=1,
                           base_estimator=DecisionTreeClassifier(max_depth=4),
                           learning_rate=0.01,
                           n_estimators=200).fit(Xtrain,ytrain)
test_accuracy3=model3.score(Xtest,ytest) #Returns the classification accuracy of the model on test data

#Gradient boost
model4 = GradientBoostingClassifier(random_state=1,
                                   max_depth=5,
                                   learning_rate=0.01,
                                   subsample=0.5,
                                   n_estimators=500).fit(Xtrain,ytrain)
test_accuracy4=model4.score(Xtest,ytest) #Returns the classification accuracy of the model on test data


#XGBoost
model5 = xgb.XGBClassifier(objective = 'binary:logistic',
                           random_state=1,
                           gamma=0.1,
                           learning_rate = 0.1,
                           max_depth=6,
                           n_estimators = 100,
                           reg_lambda = 0,
                           scale_pos_weight=1.5,
                           use_label_encoder=False,
                           eval_metric='error')
model5.fit(Xtrain,ytrain)
test_accuracy5=model5.score(Xtest,ytest) #Returns the classification accuracy of the model on test data


print("Bagging accuracy = ",test_accuracy1)
print("Random forest accuracy = ",test_accuracy2)
print("AdaBoost accuracy = ",test_accuracy3)
print("GradientBoost accuracy = ",test_accuracy4)
print("XGBoost accuracy = ",test_accuracy5)

Bagging accuracy =  0.7545454545454545
Random forest accuracy =  0.7480519480519481
AdaBoost accuracy =  0.7506493506493507
GradientBoost accuracy =  0.7545454545454545
XGBoost accuracy =  0.7376623376623377


## Hard voting

In [155]:
#Bagging
model1 = BaggingClassifier(n_estimators=200,
                          random_state=1,
                          max_features=1.0,
                          max_samples=0.5,
                          n_jobs=-1,
                          bootstrap=False,
                          bootstrap_features=False)
#Random forest
model2 = RandomForestClassifier(random_state=1,
                               n_jobs=-1,
                               max_features=1,
                               n_estimators=300)
    
#Ada boost
model3 = AdaBoostClassifier(random_state=1,
                           base_estimator=DecisionTreeClassifier(max_depth=4),
                           learning_rate=0.01,
                           n_estimators=200)

#Gradient boost
model = GradientBoostingClassifier(random_state=1,
                                   max_depth=5,
                                   learning_rate=0.01,
                                   subsample=0.5,
                                   n_estimators=500)

#XGBoost
model5 = xgb.XGBClassifier(objective = 'binary:logistic',
                           random_state=1,
                           gamma=0.1,
                           learning_rate = 0.1,
                           max_depth=6,
                           n_estimators = 100,
                           reg_lambda = 0,
                           scale_pos_weight=1.5,
                           use_label_encoder=False,
                           eval_metric='error')


ensemble_model = VotingClassifier(estimators=[('bag', model1),('rf',model2),
                                              ('ada',model3),('gb',model4),('xgb',model5)],
                                 voting = "hard")
ensemble_model.fit(Xtrain,ytrain)
print("")




In [156]:
np.mean(ensemble_model.predict(Xtest)==ytest)

0.7649350649350649

In [157]:
pred_values = ensemble_model.predict(Xtest)
bins=np.array([0,0.5,1])
cm = np.histogram2d(ytest, pred_values, bins=bins)[0]
cm_df = pd.DataFrame(cm)
cm_df.columns = ['Predicted 0','Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0',1:'Actual 1'})
precision = 100*(cm[1,1])/(cm[0,1]+cm[1,1])
print("Precision = ", precision)

Precision =  73.75


## Soft voting

In [83]:
ensemble_model = VotingClassifier(estimators=[('bag', model1),('rf',model2),
                                              ('ada',model3),('gb',model4),('xgb',model5)],
                                 voting = "soft")
ensemble_model.fit(Xtrain,ytrain)
np.mean(ensemble_model.predict(Xtest)==ytest)

0.7545454545454545

In [84]:
print(confusion_matrix_data(Xtrain,ytrain,ensemble_model,cutoff=0.5))
print(confusion_matrix_data(Xtest,ytest,ensemble_model,cutoff=0.5))

Accuracy =  96.32639791937581
Precision =  95.53286534779834
FNR =  2.792207792207792
FPR =  4.557291666666667
TPR or Recall =  97.20779220779221
Confusion matrix = 
           Predicted 0  Predicted 1
Actual 0       1466.0         70.0
Actual 1         43.0       1497.0
 
Accuracy =  75.45454545454545
Precision =  72.75
FNR =  21.5633423180593
FPR =  27.318295739348372
TPR or Recall =  78.4366576819407
Confusion matrix = 
           Predicted 0  Predicted 1
Actual 0        290.0        109.0
Actual 1         80.0        291.0
 


## Stacking

In [85]:
#Using Logistic regression as the meta model (final_estimator)
ensemble_model = StackingClassifier(estimators=[('bag', model1),('rf',model2),
                                              ('ada',model3),('gb',model4),('xgb',model5)],
                                   final_estimator=LogisticRegression(random_state=1,max_iter=10000),n_jobs=-1,
                                   cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1))
ensemble_model.fit(Xtrain,ytrain)
np.mean(ensemble_model.predict(Xtest)==ytest)

0.7506493506493507

In [86]:
print(confusion_matrix_data(Xtrain,ytrain,ensemble_model,cutoff=0.5))
print(confusion_matrix_data(Xtest,ytest,ensemble_model,cutoff=0.5))

Accuracy =  94.66840052015604
Precision =  94.61738002594034
FNR =  5.259740259740259
FPR =  5.403645833333333
TPR or Recall =  94.74025974025975
Confusion matrix = 
           Predicted 0  Predicted 1
Actual 0       1453.0         83.0
Actual 1         81.0       1459.0
 
Accuracy =  75.06493506493507
Precision =  73.1266149870801
FNR =  23.71967654986523
FPR =  26.06516290726817
TPR or Recall =  76.28032345013477
Confusion matrix = 
           Predicted 0  Predicted 1
Actual 0        295.0        104.0
Actual 1         88.0        283.0
 


In [115]:
#Using a tuned random forest as the meta model (final_estimator)

ensemble_model = StackingClassifier(estimators=[('bag', model1),('rf',model2),
                                                ('ada',model3),('gb',model4),('xgb',model5)],
                                   final_estimator=RandomForestClassifier(n_estimators=300, max_features=3,
                                                                          random_state=1,oob_score=True),n_jobs=-1,
                                   cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1))
ensemble_model.fit(Xtrain,ytrain)
np.mean(ensemble_model.predict(Xtest)==ytest)

0.7376623376623377

In [117]:
#The final predictor - random forest obtains the maximum oob_score for max_features = 1
ensemble_model.final_estimator_.oob_score_

0.7213914174252276

In [118]:
print(confusion_matrix_data(Xtrain,ytrain,ensemble_model,cutoff=0.5))
print(confusion_matrix_data(Xtest,ytest,ensemble_model,cutoff=0.5))

Accuracy =  83.55006501950585
Precision =  80.70071258907363
FNR =  11.753246753246753
FPR =  21.158854166666668
TPR or Recall =  88.24675324675324
Confusion matrix = 
           Predicted 0  Predicted 1
Actual 0       1211.0        325.0
Actual 1        181.0       1359.0
 
Accuracy =  73.8961038961039
Precision =  71.68367346938776
FNR =  24.258760107816713
FPR =  27.81954887218045
TPR or Recall =  75.74123989218329
Confusion matrix = 
           Predicted 0  Predicted 1
Actual 0        288.0        111.0
Actual 1         90.0        281.0
 


# Best model (for test accuracy)  is hard voting