### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('./data/df_features.gz')

I delete the rows without a distance to city center (because in the origin hotel don't have latitude and longitude)

In [44]:
df = df.dropna(subset=['Dist_Center'])
df_original = df.copy()

In [43]:
top_hotel = df[['Hotel_Address','City','Country']].groupby(['Hotel_Address','City']).count().reset_index().\
            sort_values('Country', ascending = False).iloc[[0],[0]].values[0][0]

In [47]:
df = df[df.Hotel_Address == top_hotel]

In [48]:
df.shape

(4789, 69)

### Create Predicted Category for final models (2 categories)

In [49]:
diff_hotels = df[['Hotel_Address','Diff']].groupby('Hotel_Address').describe()
diff_hotels = diff_hotels.Diff.reset_index()

In [50]:
df = pd.merge(df, diff_hotels, on='Hotel_Address')

In [51]:
category = np.array(['Bad' if i < 7 else 'Good' for i in df.Reviewer_Score])
df.loc[:, 'Category'] = category
df.Category.value_counts() / len(df)

Good    0.524535
Bad     0.475465
Name: Category, dtype: float64

### Balance Nationalities and / or Categories

In [52]:
def balance_df(df, Balance_Nationality, Balance_Category, cut):
    df_balance_country = df.copy()
    if Balance_Nationality:
        df_balance_country = None
        for i in list(df.Nationality_Recode.value_counts().index):
            nationality = df[df.Nationality_Recode == i]
            n = cut
            if len(nationality) < n:
                n = len(nationality)
            nationality = nationality.sample(n, random_state=1)
            df_balance_country = pd.concat([df_balance_country, nationality])
    
    df_balance_class = df_balance_country.copy()
    if Balance_Category:
        df_balance_class = None
        minclass = np.min(df_balance_country.Category.value_counts())
        classes = list(df_balance_country.Category.value_counts().index)
        for i in classes:
            selected_class = df_balance_country[df_balance_country.Category == i].sample(minclass, random_state=1)
            df_balance_class = pd.concat([df_balance_class, selected_class])
    
    return(df_balance_class)

In [53]:
df_balanced_1 = balance_df(df, Balance_Nationality=False, Balance_Category=False, cut=10000)
df_balanced_2 = balance_df(df_balanced_1, Balance_Nationality=False, Balance_Category=False, 
                         cut=int(np.median(df_balanced_1.Nationality_Recode.value_counts())*1.5))
df_balanced_2.shape

(4789, 78)

In [54]:
df_balanced_2.Nationality_Recode.value_counts()

UK & Ireland           3966
Western Europe          404
Eastern Europe           98
Asia & Pacific           77
North America            73
Oceania                  51
Middle east              44
Sub-Saharian Africa      40
South/Latin America      14
China                    11
Arab States               8
Name: Nationality_Recode, dtype: int64

### Prepare Data to run Models

In [55]:
if len(df_balanced_2) > 20000:
    df_model = df_balanced_2.sample(n=20000, random_state=1)
else:
    df_model = df_balanced_2.copy()

df_model.shape

(4789, 78)

In [71]:
x_categorical = ['Review_Month','City','Pet','Purpose','Whom','Room_Recode','Nationality_Recode','Length_Recode']
x_numerical = ['Total_Number_of_Reviews_Reviewer_Has_Given']
x_col = x_categorical + x_numerical
y_col = 'Category'

In [72]:
X_numerical = df_model[x_numerical]
X_numerical_std = X_numerical.apply(lambda x: ((x-np.mean(x)) / np.std(x)))

In [73]:
df_model['Review_Month'] = df_model['Review_Month'].astype(str)
X_categorical = pd.get_dummies(df_model[x_categorical], prefix_sep='_', drop_first=True)
X_categorical = X_categorical.fillna('Not Available')

In [74]:
X = pd.concat([X_numerical_std, X_categorical], axis=1, sort=False)
y = df_model[y_col]

Split into Train and Test

In [75]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, stratify=y, random_state=1)

In [76]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((3832, 38), (3832,), (957, 38), (957,))

## MODELS

### Evaluate Model

In [77]:
import time
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score, make_scorer
from hyperopt import hp, fmin, tpe, rand, STATUS_OK, Trials, space_eval

In [78]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [79]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    clf = model
    clf.fit(X_train, y_train)
    print_result(clf, X_train, X_test, y_train, y_test)
    return(clf.predict(X_test))

In [80]:
def evaluate_grid(model, params, X_train, X_test, y_train, y_test, verbose = 1):
    f1 = make_scorer(f1_score, pos_label = "Bad")
    clf = GridSearchCV(estimator = model, param_grid = params, n_jobs = -1, cv = 5, verbose = verbose)    
    clf.fit(X_train, y_train)
    print(clf.best_params_, clf.best_score_)
    print_result(clf, X_train, X_test, y_train, y_test)
    return(clf.predict(X_test))

In [81]:
def print_result(clf, X_train, X_test, y_train, y_test):
    print('Accuracy Test :', f'{accuracy_score(clf.predict(X_test), y_test):.4f}', 
          '| F1 Test :', f'{f1_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}',
          '| Precision Test :', f'{precision_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}', 
          '| H Test :', f'{H_score(clf.predict(X_test), y_test):.4f}')
    
    print('Accuracy Train:', f'{accuracy_score(clf.predict(X_train), y_train):.4f}', 
          '| F1 Train:', f'{f1_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}',
          '| Precision Train:', f'{precision_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}', 
          '| H Train:', f'{H_score(clf.predict(X_train), y_train):.4f}')

In [82]:
def H_score(X_train, y_train):
    acc = accuracy_score(X_train, y_train)
    f1 = f1_score(X_train, y_train, pos_label = "Bad")
    return(2 / ((1/acc)+(1/f1)))

In [83]:
def bayesian(space, X, y, modelo, nevals):
    
    #H = make_scorer(H_score, greater_is_better=True) 
        
    def objective(space):
        
        global best_score
        
        model = modelo(**space)   
        kfold = KFold(n_splits=5, random_state=1985, shuffle=True)
        score = -cross_val_score(model, X, y, cv=kfold, scoring='accuracy', verbose=False).mean()
        if (score < best_score):
            best_score = score
        return score

    start = time.time()

    best = fmin(
      objective, 
      space = space,
      algo = tpe.suggest, 
      max_evals = nevals,
      trials = Trials())

    print("Hyperopt search took %.2f seconds for 200 candidates" % ((time.time() - start)))
    print("Best score: %.4f " % (-best_score))
    print("Best space: ", space_eval(params, best))
    return(space_eval(params, best))

### KNN

In [85]:
pred_knn = evaluate_model(KNeighborsClassifier(n_neighbors=5), X_train, X_test, y_train, y_test)

Accuracy Test : 0.5253 | F1 Test : 0.4886 | Precision Test : 0.4769 | H Test : 0.5063
Accuracy Train: 0.6980 | F1 Train: 0.6792 | Precision Train: 0.6725 | H Train: 0.6885


### Gradient Boosted Trees

In [86]:
params = {'learning_rate':     hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 
                                                          0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.50]), 
          'n_estimators':      hp.choice('n_estimators', range(1,400)),
          'max_depth':         hp.choice('max_depth',range(1,20)),
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 10, endpoint=True)),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'subsample':         hp.choice('subsample',[1]), 
          'max_features':      hp.choice('max_features',['sqrt'])}

best_score = 1
gbt_params = bayesian(params, X_train, y_train, GradientBoostingClassifier, 50)
pred_gbt = evaluate_model(GradientBoostingClassifier(**gbt_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:50<00:00,  1.00s/trial, best loss: -0.5799738219895288]
Hyperopt search took 50.59 seconds for 200 candidates
Best score: 0.5800 
Best space:  {'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.17, 'min_samples_split': 0.78, 'n_estimators': 358, 'subsample': 1}
Accuracy Test : 0.5785 | F1 Test : 0.4923 | Precision Test : 0.4297 | H Test : 0.5320
Accuracy Train: 0.5799 | F1 Train: 0.4820 | Precision Train: 0.4110 | H Train: 0.5264


### Random Forest

In [87]:
params = {'bootstrap':         hp.choice('bootstrap',[True, False]),
          'max_depth':         hp.choice('max_depth', range(1, 20)),
          'max_features':      hp.choice('max_features',['auto', 'sqrt']),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'n_estimators':      hp.choice('n_estimators',range(1,400))}

best_score = 1
rf_params = bayesian(params, X_train, y_train, RandomForestClassifier, 50)
pred_rf = evaluate_model(RandomForestClassifier(**rf_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:15<00:00,  2.72s/trial, best loss: -0.5809609511343804]
Hyperopt search took 136.05 seconds for 200 candidates
Best score: 0.5810 
Best space:  {'bootstrap': True, 'max_depth': 7, 'max_features': 'auto', 'min_samples_leaf': 0.02, 'min_samples_split': 0.02, 'n_estimators': 388}
Accuracy Test : 0.5759 | F1 Test : 0.5083 | Precision Test : 0.4610 | H Test : 0.5400
Accuracy Train: 0.6123 | F1 Train: 0.5380 | Precision Train: 0.4747 | H Train: 0.5728


### XGBoosting

In [88]:
params = {'learning_rate':    hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                                         0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75]), 
          'max_depth':        hp.choice('max_depth',range(1,20)),
          'min_child_weight': hp.choice('min_child_weight',np.linspace(0.01, 1.0, 100, endpoint=True)),
          'gamma':            hp.choice('gamma',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'colsample_bytree': hp.choice('colsample_bytree',np.linspace(0.0, 1, 101, endpoint=True)), 
          'n_estimators':     hp.choice('n_estimators', range(1,200))}

best_score = 1
xgb_params = bayesian(params, X_train, y_train, xgb.XGBClassifier, 50)
pred_xgb = evaluate_model(xgb.XGBClassifier(**xgb_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:06<00:00,  1.33s/trial, best loss: -0.5987892670157068]
Hyperopt search took 66.71 seconds for 200 candidates
Best score: 0.5988 
Best space:  {'colsample_bytree': 0.9, 'gamma': 0.77, 'learning_rate': 0.025, 'max_depth': 2, 'min_child_weight': 0.3, 'n_estimators': 81}
Accuracy Test : 0.5723 | F1 Test : 0.4974 | Precision Test : 0.4451 | H Test : 0.5322
Accuracy Train: 0.6165 | F1 Train: 0.5349 | Precision Train: 0.4637 | H Train: 0.5728


### Logistic Regresion

In [89]:
params = {"C":   hp.choice('C',[0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1]),
          "tol": hp.choice('tol',[0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 
                                  0.05, 0.1])}

best_score = 1
log_params = bayesian(params, X_train, y_train, LogisticRegression, 50)
pred_log = evaluate_model(LogisticRegression(**log_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  9.22trial/s, best loss: -0.5851767015706807]
Hyperopt search took 5.44 seconds for 200 candidates
Best score: 0.5852 
Best space:  {'C': 0.025, 'tol': 5e-05}
Accuracy Test : 0.5801 | F1 Test : 0.4939 | Precision Test : 0.4308 | H Test : 0.5335
Accuracy Train: 0.6040 | F1 Train: 0.5033 | Precision Train: 0.4220 | H Train: 0.5490


### Decision Trees

In [96]:
iterations = 100
params = {"max_depth":        hp.choice('max_depth', range(1, 20)),
          "max_features":     hp.choice('max_features', range(1, 38)),
          "min_samples_leaf": hp.choice('min_samples_leaf', range(1, 50)),
          "criterion":        hp.choice('criterion', ["gini", "entropy"])}

best_score = 1
tree_params = bayesian(params, X_train, y_train, DecisionTreeClassifier, 150)
pred_tree = evaluate_model(DecisionTreeClassifier(**tree_params), X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [00:10<00:00, 14.43trial/s, best loss: -0.5924901832460733]
Hyperopt search took 10.43 seconds for 200 candidates
Best score: 0.5925 
Best space:  {'criterion': 'gini', 'max_depth': 2, 'max_features': 33, 'min_samples_leaf': 41}
Accuracy Test : 0.5814 | F1 Test : 0.5391 | Precision Test : 0.5148 | H Test : 0.5595
Accuracy Train: 0.5904 | F1 Train: 0.5344 | Precision Train: 0.4945 | H Train: 0.5610


### SVM

In [98]:
params = {"degree": hp.choice('degree', [2, 3, 4]),
          "kernel": hp.choice('kernel', ['poly']), 
          "C":      hp.choice('C', [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                    0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75])}
best_score = 1
svm_params = bayesian(params, X_train, y_train, SVC, 30)
pred_svm = evaluate_model(SVC(**svm_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:11<00:00,  2.62trial/s, best loss: -0.5914594240837696]
Hyperopt search took 11.48 seconds for 200 candidates
Best score: 0.5915 
Best space:  {'C': 0.25, 'degree': 2, 'kernel': 'poly'}
Accuracy Test : 0.5793 | F1 Test : 0.5278 | Precision Test : 0.4945 | H Test : 0.5524
Accuracy Train: 0.6134 | F1 Train: 0.5521 | Precision Train: 0.5011 | H Train: 0.5811


### Naive Bayes

In [99]:
params = {"alpha":     hp.choice('alpha', [0.025, 0.05, 0.075, 0.1, 0.15, 0.20, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 
                                           2, 2.5, 5, 10]), 
          "fit_prior": hp.choice('fit_prior', [True, False])}

best_score = 1
nb_params = bayesian(params, X_train, y_train, BernoulliNB, 50)
pred_nb = evaluate_model(BernoulliNB(**nb_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 10.86trial/s, best loss: -0.5893433682373473]
Hyperopt search took 4.62 seconds for 200 candidates
Best score: 0.5893 
Best space:  {'alpha': 0.75, 'fit_prior': True}
Accuracy Test : 0.5712 | F1 Test : 0.5023 | Precision Test : 0.4550 | H Test : 0.5345
Accuracy Train: 0.6176 | F1 Train: 0.5448 | Precision Train: 0.4813 | H Train: 0.5789
