In [125]:
import pandas as pd
import numpy as np
import datetime as dt
import pickle
from matplotlib import pyplot as plt

## PREPARATION

### Functions

In [126]:
import time
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score, make_scorer
from collections import Counter
from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from hyperopt import hp, fmin, tpe, rand, STATUS_OK, Trials, space_eval
import xgboost as xgb

In [127]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [128]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    clf = model
    clf.fit(X_train, y_train)
    print_result(clf, X_train, X_test, y_train, y_test)
    return(clf.predict(X_test))

In [129]:
def print_result(clf, X_train, X_test, y_train, y_test):
    print('Accuracy Test :', f'{accuracy_score(clf.predict(X_test), y_test):.4f}', 
          '| F1 Test :', f'{f1_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}',
          '| Precision Test :', f'{precision_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}', 
          '| Recall Test :', f'{recall_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}', 
          '| H Test :', f'{H_score(clf.predict(X_test), y_test):.4f}')
    
    print('Accuracy Train:', f'{accuracy_score(clf.predict(X_train), y_train):.4f}', 
          '| F1 Train:', f'{f1_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}',
          '| Precision Train:', f'{precision_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}', 
          '| Recall Train:', f'{recall_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}', 
          '| H Train:', f'{H_score(clf.predict(X_train), y_train):.4f}')

In [130]:
def H_score(X_train, y_train):
    acc = accuracy_score(X_train, y_train)
    f1 = f1_score(X_train, y_train, pos_label = "Bad")
    return(2 / ((1/(acc+0.00001))+(1/(f1+0.00001))))

In [131]:
def bayesian(space, X, y, modelo, nevals):
    
    f1 = make_scorer(f1_score, pos_label = "Bad")
    
    def objective(space):        
        global best_score
        model = modelo(**space, random_state = 1)   
        cv =  StratifiedKFold(n_splits = 5, random_state = 1)
        score = -cross_val_score(model, X, y, cv = cv, scoring = f1, verbose = False).mean()
        if (score < best_score):
            best_score = score
        return score

    start = time.time()
    rstate = np.random.RandomState(1)
    best = fmin(objective, space = space, algo = tpe.suggest, max_evals = nevals,trials = Trials(), rstate = rstate)

    print("Hyperopt search took %.2f seconds" % ((time.time() - start)))
    print("Best score: %.4f " % (-best_score))
    print("Best space: ", space_eval(params, best))
    return(space_eval(params, best))

### Create Data Frame

In [132]:
df_balanced = pd.read_csv('./data/df_balanced.gz')
df_model = df_balanced.sample(n=10000, random_state=1)
df_stacking = df_balanced.drop(df_model.index).sample(n=10000, random_state=11)

In [133]:
x_categorical = ['Review_Month','City','Pet','Purpose','Whom','Room_Recode','Nationality_Recode','Length_Recode','Stars']
x_numerical = ['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Close_Landmarks', 'Dist_Center', 
               'Dist_Train', 'Dist_Airport','food_Neg_Hotel','staff_Neg_Hotel', 'location_Neg_Hotel', 'value_Neg_Hotel',
               'comfort_Neg_Hotel', 'room_Neg_Hotel', 'facilities_Neg_Hotel','cleanliness_Neg_Hotel', 
               'food_Pos_Hotel', 'staff_Pos_Hotel','location_Pos_Hotel', 'value_Pos_Hotel', 'comfort_Pos_Hotel',
               'room_Pos_Hotel', 'facilities_Pos_Hotel', 'cleanliness_Pos_Hotel','count', 'mean', 'std', 'min', '25%', 
               '50%', '75%', 'max']

x_col = x_categorical + x_numerical
y_col = 'Category'

In [134]:
X_numerical = df_stacking[x_numerical]
X_numerical_std = X_numerical.apply(lambda x: ((x-np.mean(x)) / np.std(x)))

In [135]:
df_stacking['Review_Month'] = df_stacking['Review_Month'].astype(str)
X_categorical = pd.get_dummies(df_stacking[x_categorical], prefix_sep='_', drop_first=False)
X_categorical = X_categorical.fillna('Not Available')

In [136]:
X = pd.concat([X_numerical_std, X_categorical], axis=1, sort=False)
y = df_stacking[y_col]

In [137]:
X = X[['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'mean','std', 'max', 'Review_Month_1', 
       'Review_Month_10', 'Review_Month_2','Review_Month_3', 'Review_Month_4', 'Review_Month_7', 'Review_Month_8',
       'Review_Month_9', 'City_Amsterdam', 'City_London', 'Pet_With a pet','Purpose_Leisure trip', 
       'Whom_Family with older children','Whom_Family with young children', 'Whom_Travelers with friends',
       'Room_Recode_Deluxe', 'Room_Recode_Other (Standard)','Room_Recode_Studio', 'Nationality_Recode_Arab States',
       'Nationality_Recode_Asia & Pacific','Nationality_Recode_Eastern Europe', 'Nationality_Recode_Middle east',
       'Nationality_Recode_North America', 'Nationality_Recode_Oceania','Nationality_Recode_UK & Ireland', 
       'Nationality_Recode_Western Europe','Length_Recode_Stayed 2 nights', 'Length_Recode_Stayed 5 nights',
       'Length_Recode_Stayed 6 nights', 'Length_Recode_Stayed 7 nights','Length_Recode_Stayed 8 nights', 'Stars_Pension',
       'Stars_hotel de 3 estrellas']]

### Add Predictions

In [138]:
models = pickle.load(open('./sav/model_f1.sav', 'rb'))

In [139]:
pred_knn = [Counter([y.iloc[k] for k in x]).most_common(1)[0][0] for x in models[0].query(X, k = 114)[1]]
pred_log = models[1].predict(X)
pred_svm = models[2].predict(X)
pred_tree = models[3].predict(X)
pred_rf = models[4].predict(X)
pred_gbt = models[5].predict(X)
pred_xgb = models[6].predict(X)

And I append the predictions of the model to the dataset

In [140]:
X['logistic'] = pred_log
X['gbt'] = pred_gbt
X['knn'] = pred_knn
X['svm'] = pred_svm
X['tree'] = pred_tree
X['xgb'] = pred_xgb
X['rf'] = pred_rf

In [141]:
X.iloc[:,-7:].head(2)

Unnamed: 0,logistic,gbt,knn,svm,tree,xgb,rf
231987,Bad,Bad,Bad,Bad,Bad,Bad,Bad
178130,Good,Good,Good,Good,Good,Good,Good


In [142]:
X.iloc[:,-7:] = X.iloc[:,-7:].apply(lambda x: [1 if i=='Good' else 0 for i in x])

In [143]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

## MODELS

### Gradient Boosting Trees

In [144]:
params = {'learning_rate':     hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 
                                                          0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.50, 0.75, 1]), 
          'n_estimators':      hp.choice('n_estimators', range(1,400)),
          'max_depth':         hp.choice('max_depth',range(1,20)),
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 10, endpoint=True)),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'subsample':         hp.choice('subsample',[1]), 
          'max_features':      hp.choice('max_features',['sqrt'])}

best_score = 1
gbt_params = bayesian(params, X_train, y_train, GradientBoostingClassifier, 50)
pred_gbt_stck = evaluate_model(GradientBoostingClassifier(**gbt_params), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 50/50 [02:48<00:00,  3.37s/trial, best loss: -0.6999080416837604]
Hyperopt search took 168.80 seconds
Best score: 0.6999 
Best space:  {'learning_rate': 0.001, 'max_depth': 13, 'max_features': 'sqrt', 'min_samples_leaf': 0.11, 'min_samples_split': 0.12, 'n_estimators': 69, 'subsample': 1}
Accuracy Test : 0.6085 | F1 Test : 0.6983 | Precision Test : 0.8900 | Recall Test : 0.5745 | H Test : 0.6503
Accuracy Train: 0.6132 | F1 Train: 0.7011 | Precision Train: 0.8907 | Recall Train: 0.5780 | H Train: 0.6542


### XGBoosting

In [145]:
params = {'learning_rate':    hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                                         0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75]), 
          'max_depth':        hp.choice('max_depth',range(1,20)),
          'min_child_weight': hp.choice('min_child_weight',np.linspace(0.01, 1.0, 100, endpoint=True)),
          'gamma':            hp.choice('gamma',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'colsample_bytree': hp.choice('colsample_bytree',np.linspace(0.0, 1, 101, endpoint=True)), 
          'n_estimators':     hp.choice('n_estimators', range(1,200))}

best_score = 1
xgb_params = bayesian(params, X_train, y_train, xgb.XGBClassifier, 50)
pred_xgb_stck = evaluate_model(xgb.XGBClassifier(**xgb_params), X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████| 50/50 [08:18<00:00,  9.97s/trial, best loss: -0.669229118889345]
Hyperopt search took 498.75 seconds
Best score: 0.6692 
Best space:  {'colsample_bytree': 0.37, 'gamma': 0.37, 'learning_rate': 0.0075, 'max_depth': 8, 'min_child_weight': 0.72, 'n_estimators': 10}
Accuracy Test : 0.6515 | F1 Test : 0.6833 | Precision Test : 0.7387 | Recall Test : 0.6357 | H Test : 0.6670
Accuracy Train: 0.6721 | F1 Train: 0.6981 | Precision Train: 0.7444 | Recall Train: 0.6571 | H Train: 0.6849


### SVM

In [146]:
params = {"degree": hp.choice('degree', [2, 3, 4]),
          "kernel": hp.choice('kernel', ['poly']), 
          "C":      hp.choice('C', [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                    0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75])}
best_score = 1
svm_params = bayesian(params, X_train, y_train, SVC, 10)
pred_svm_stck = evaluate_model(SVC(**svm_params), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 10/10 [03:58<00:00, 23.80s/trial, best loss: -0.6935612049703657]
Hyperopt search took 238.02 seconds
Best score: 0.6936 
Best space:  {'C': 0.0005, 'degree': 4, 'kernel': 'poly'}
Accuracy Test : 0.5695 | F1 Test : 0.6982 | Precision Test : 0.9784 | Recall Test : 0.5428 | H Test : 0.6273
Accuracy Train: 0.5670 | F1 Train: 0.6953 | Precision Train: 0.9705 | Recall Train: 0.5417 | H Train: 0.6247


### Logistic Regression

In [147]:
params = {"C":   hp.choice('C',[0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1]),
          ""
          "tol": hp.choice('tol',[0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 
                                  0.05, 0.1])}

best_score = 1
log_params = bayesian(params, X_train, y_train, LogisticRegression, 50)
pred_log_stck = evaluate_model(LogisticRegression(**log_params), X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.07trial/s, best loss: -0.662091903074624]
Hyperopt search took 24.22 seconds
Best score: 0.6621 
Best space:  {'C': 0.01, 'tol': 0.025}
Accuracy Test : 0.6515 | F1 Test : 0.6644 | Precision Test : 0.6778 | Recall Test : 0.6516 | H Test : 0.6579
Accuracy Train: 0.6560 | F1 Train: 0.6640 | Precision Train: 0.6676 | Recall Train: 0.6604 | H Train: 0.6600


### Random Forest

In [148]:
params = {'bootstrap':         hp.choice('bootstrap',[True, False]),
          'max_depth':         hp.choice('max_depth', range(1, 20)),
          'max_features':      hp.choice('max_features',['auto', 'sqrt']),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'n_estimators':      hp.choice('n_estimators',range(1,400))}

best_score = 1
rf_params = bayesian(params, X_train, y_train, RandomForestClassifier, 50)
pred_rf_stck = evaluate_model(RandomForestClassifier(**rf_params), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 50/50 [02:45<00:00,  3.31s/trial, best loss: -0.6747286789792464]
Hyperopt search took 165.69 seconds
Best score: 0.6747 
Best space:  {'bootstrap': True, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 0.4, 'min_samples_split': 0.8200000000000001, 'n_estimators': 387}
Accuracy Test : 0.5090 | F1 Test : 0.6746 | Precision Test : 1.0000 | Recall Test : 0.5090 | H Test : 0.5802
Accuracy Train: 0.5091 | F1 Train: 0.6747 | Precision Train: 1.0000 | Recall Train: 0.5091 | H Train: 0.5804


### Best Model

In [149]:
svm_params = {'C': 0.0005, 'degree': 4, 'kernel': 'poly'}
pred_svm_stck = evaluate_model(SVC(**svm_params), X_train, X_test, y_train, y_test)

Accuracy Test : 0.5695 | F1 Test : 0.6982 | Precision Test : 0.9784 | Recall Test : 0.5428 | H Test : 0.6273
Accuracy Train: 0.5670 | F1 Train: 0.6953 | Precision Train: 0.9705 | Recall Train: 0.5417 | H Train: 0.6247


In [150]:
pd.crosstab(pred_svm_stck, y_test)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,996,839
Good,22,143
