### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('./data/df_features.gz')

### Create Predicted Category for final models (2 categories)

In [3]:
diff_hotels = df[['Hotel_Address','Diff']].groupby('Hotel_Address').describe()
diff_hotels = diff_hotels.Diff.reset_index()

In [4]:
df = pd.merge(df, diff_hotels, on='Hotel_Address')

In [5]:
category = np.array(['Bad' if i < 7 else 'Good' for i in df.Reviewer_Score])
df.loc[:, 'Category'] = category
df.Category.value_counts() / len(df)

Good    0.831599
Bad     0.168401
Name: Category, dtype: float64

### Balance Nationalities and / or Categories

In [6]:
def balance_df(df, Balance_Nationality, Balance_Category, cut):
    df_balance_country = df.copy()
    if Balance_Nationality:
        df_balance_country = None
        for i in list(df.Nationality_Recode.value_counts().index):
            nationality = df[df.Nationality_Recode == i]
            n = cut
            if len(nationality) < n:
                n = len(nationality)
            nationality = nationality.sample(n, random_state=1)
            df_balance_country = pd.concat([df_balance_country, nationality])
    
    df_balance_class = df_balance_country.copy()
    if Balance_Category:
        df_balance_class = None
        minclass = np.min(df_balance_country.Category.value_counts())
        classes = list(df_balance_country.Category.value_counts().index)
        for i in classes:
            selected_class = df_balance_country[df_balance_country.Category == i].sample(minclass, random_state=1)
            df_balance_class = pd.concat([df_balance_class, selected_class])
    
    return(df_balance_class)

In [7]:
df_balanced_1 = balance_df(df, Balance_Nationality=False, Balance_Category=True, cut=10000)
df_balanced_2 = balance_df(df_balanced_1, Balance_Nationality=True, Balance_Category=True, 
                         cut=int(np.median(df_balanced_1.Nationality_Recode.value_counts())*1.5))
df_balanced_2.shape

(77270, 78)

In [8]:
df_balanced_2.Nationality_Recode.value_counts()

North America          11829
UK & Ireland           11810
Western Europe         11809
Middle east            11761
Eastern Europe          9058
Asia & Pacific          7869
Oceania                 7555
Sub-Saharian Africa     2023
South/Latin America     1614
China                   1127
Arab States              815
Name: Nationality_Recode, dtype: int64

### Prepare Data to run Models

In [9]:
df_model = df_balanced_2.sample(n=20000, random_state=1)
df_model.shape

(20000, 78)

In [10]:
x_categorical = ['Review_Month','City','Pet','Purpose','Whom','Room_Recode','Nationality_Recode','Length_Recode','Stars']
x_numerical = ['Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given', 'Close_Landmarks', 'Dist_Center', 
               'Dist_Train', 'Dist_Airport','food_Neg_Hotel','staff_Neg_Hotel', 'location_Neg_Hotel', 'value_Neg_Hotel',
               'comfort_Neg_Hotel', 'room_Neg_Hotel', 'facilities_Neg_Hotel','cleanliness_Neg_Hotel', 
               'food_Pos_Hotel', 'staff_Pos_Hotel','location_Pos_Hotel', 'value_Pos_Hotel', 'comfort_Pos_Hotel',
               'room_Pos_Hotel', 'facilities_Pos_Hotel', 'cleanliness_Pos_Hotel','count', 'mean', 'std', 'min', '25%', 
               '50%', '75%', 'max']
x_col = x_categorical + x_numerical
y_col = 'Category'

In [11]:
X_numerical = df_model[x_numerical]
X_numerical_std = X_numerical.apply(lambda x: ((x-np.mean(x)) / np.std(x)))

In [12]:
df_model['Review_Month'] = df_model['Review_Month'].astype(str)
X_categorical = pd.get_dummies(df_model[x_categorical], prefix_sep='_', drop_first=True)
X_categorical = X_categorical.fillna('Not Available')

In [13]:
X = pd.concat([X_numerical_std, X_categorical], axis=1, sort=False)
y = df_model[y_col]

Split into Train and Test

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=1)

In [15]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((10000, 77), (10000,), (10000, 77), (10000,))

## MODELS

### Evaluate Model

In [16]:
import time
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score, make_scorer
from hyperopt import hp, fmin, tpe, rand, STATUS_OK, Trials, space_eval

In [17]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [18]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    clf = model
    clf.fit(X_train, y_train)
    print_result(clf, X_train, X_test, y_train, y_test)
    return(clf.predict(X_test))

In [19]:
def evaluate_grid(model, params, X_train, X_test, y_train, y_test, verbose = 1):
    f1 = make_scorer(f1_score, pos_label = "Bad")
    clf = GridSearchCV(estimator = model, param_grid = params, n_jobs = -1, cv = 5, verbose = verbose)    
    clf.fit(X_train, y_train)
    print(clf.best_params_, clf.best_score_)
    print_result(clf, X_train, X_test, y_train, y_test)
    return(clf.predict(X_test))

In [20]:
def print_result(clf, X_train, X_test, y_train, y_test):
    print('Accuracy Test :', f'{accuracy_score(clf.predict(X_test), y_test):.4f}', 
          '| F1 Test :', f'{f1_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}',
          '| Precision Test :', f'{precision_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}', 
          '| H Test :', f'{H_score(clf.predict(X_test), y_test):.4f}')
    
    print('Accuracy Train:', f'{accuracy_score(clf.predict(X_train), y_train):.4f}', 
          '| F1 Train:', f'{f1_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}',
          '| Precision Train:', f'{precision_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}', 
          '| H Train:', f'{H_score(clf.predict(X_train), y_train):.4f}')

In [21]:
def H_score(X_train, y_train):
    acc = accuracy_score(X_train, y_train)
    f1 = f1_score(X_train, y_train, pos_label = "Bad")
    return(2 / ((1/acc)+(1/f1)))

In [22]:
def bayesian(space, X, y, modelo, nevals):
    
    #H = make_scorer(H_score, greater_is_better=True) 
    f1 = make_scorer(f1_score, pos_label = "Bad")
        
    def objective(space):
        
        global best_score
        
        model = modelo(**space)   
        kfold = KFold(n_splits=5, random_state=1985, shuffle=True)
        score = -cross_val_score(model, X, y, cv=kfold, scoring=f1, verbose=False).mean()
        if (score < best_score):
            best_score = score
        return score

    start = time.time()

    best = fmin(
      objective, 
      space = space,
      algo = tpe.suggest, 
      max_evals = nevals,
      trials = Trials())

    print("Hyperopt search took %.2f seconds for 200 candidates" % ((time.time() - start)))
    print("Best score: %.4f " % (-best_score))
    print("Best space: ", space_eval(params, best))
    return(space_eval(params, best))

### KNN

### Gradient Boosted Trees

In [None]:
params = {'learning_rate':     hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 
                                                          0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.50]), 
          'n_estimators':      hp.choice('n_estimators', range(1,400)),
          'max_depth':         hp.choice('max_depth',range(1,20)),
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 10, endpoint=True)),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'subsample':         hp.choice('subsample',[1]), 
          'max_features':      hp.choice('max_features',['sqrt'])}

best_score = 1
gbt_params = bayesian(params, X_train, y_train, GradientBoostingClassifier, 50)
pred_gbt = evaluate_model(GradientBoostingClassifier(**gbt_params), X_train, X_test, y_train, y_test)

 16%|████████████████                                                                                    | 8/50 [00:49<04:39,  6.65s/trial, best loss: -0.691895769399164]

### Random Forest

In [26]:
params = {'bootstrap':         hp.choice('bootstrap',[True, False]),
          'max_depth':         hp.choice('max_depth', range(1, 20)),
          'max_features':      hp.choice('max_features',['auto', 'sqrt']),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'n_estimators':      hp.choice('n_estimators',range(1,400))}

best_score = 1
rf_params = bayesian(params, X_train, y_train, RandomForestClassifier, 50)
pred_rf = evaluate_model(RandomForestClassifier(**rf_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:45<00:00,  3.30s/trial, best loss: -0.6696542675503666]
Hyperopt search took 165.22 seconds for 200 candidates
Best score: 0.6697 
Best space:  {'bootstrap': True, 'max_depth': 4, 'max_features': 'auto', 'min_samples_leaf': 0.37, 'min_samples_split': 0.6, 'n_estimators': 111}
Accuracy Test : 0.5033 | F1 Test : 0.6696 | Precision Test : 1.0000 | H Test : 0.5747
Accuracy Train: 0.5034 | F1 Train: 0.6697 | Precision Train: 1.0000 | H Train: 0.5748


### XGBoosting

In [27]:
params = {'learning_rate':    hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                                         0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75]), 
          'max_depth':        hp.choice('max_depth',range(1,20)),
          'min_child_weight': hp.choice('min_child_weight',np.linspace(0.01, 1.0, 100, endpoint=True)),
          'gamma':            hp.choice('gamma',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'colsample_bytree': hp.choice('colsample_bytree',np.linspace(0.0, 1, 101, endpoint=True)), 
          'n_estimators':     hp.choice('n_estimators', range(1,200))}

best_score = 1
xgb_params = bayesian(params, X_train, y_train, xgb.XGBClassifier, 50)
pred_xgb = evaluate_model(xgb.XGBClassifier(**xgb_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [13:10<00:00, 15.80s/trial, best loss: -0.6705076855956935]
Hyperopt search took 790.21 seconds for 200 candidates
Best score: 0.6705 
Best space:  {'colsample_bytree': 0.74, 'gamma': 0.64, 'learning_rate': 0.025, 'max_depth': 4, 'min_child_weight': 0.86, 'n_estimators': 138}
Accuracy Test : 0.6512 | F1 Test : 0.6600 | Precision Test : 0.6726 | H Test : 0.6556
Accuracy Train: 0.6894 | F1 Train: 0.6997 | Precision Train: 0.7187 | H Train: 0.6945


### Logistic Regresion

In [28]:
params = {"C":   hp.choice('C',[0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1]),
          "tol": hp.choice('tol',[0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 
                                  0.05, 0.1])}

best_score = 1
log_params = bayesian(params, X_train, y_train, LogisticRegression, 50)
pred_log = evaluate_model(LogisticRegression(**log_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:28<00:00,  1.75trial/s, best loss: -0.6659037353441525]
Hyperopt search took 28.60 seconds for 200 candidates
Best score: 0.6659 
Best space:  {'C': 0.01, 'tol': 0.0005}
Accuracy Test : 0.6555 | F1 Test : 0.6566 | Precision Test : 0.6545 | H Test : 0.6561
Accuracy Train: 0.6670 | F1 Train: 0.6719 | Precision Train: 0.6772 | H Train: 0.6694


### Decision Trees

In [29]:
iterations = 100
params = {"max_depth":        hp.choice('max_depth', range(1, 20)),
          "max_features":     hp.choice('max_features', range(1, 50)),
          "min_samples_leaf": hp.choice('min_samples_leaf', range(1, 50)),
          "criterion":        hp.choice('criterion', ["gini", "entropy"])}

best_score = 1
tree_params = bayesian(params, X_train, y_train, DecisionTreeClassifier, 150)
pred_tree = evaluate_model(DecisionTreeClassifier(**tree_params), X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [01:05<00:00,  2.31trial/s, best loss: -0.6729893201948508]
Hyperopt search took 65.07 seconds for 200 candidates
Best score: 0.6730 
Best space:  {'criterion': 'entropy', 'max_depth': 4, 'max_features': 28, 'min_samples_leaf': 42}
Accuracy Test : 0.6396 | F1 Test : 0.6215 | Precision Test : 0.5879 | H Test : 0.6304
Accuracy Train: 0.6471 | F1 Train: 0.6327 | Precision Train: 0.6039 | H Train: 0.6398


### SVM

In [30]:
params = {"degree": hp.choice('degree', [2, 3, 4]),
          "kernel": hp.choice('kernel', ['poly']), 
          "C":      hp.choice('C', [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                    0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75])}
best_score = 1
svm_params = bayesian(params, X_train, y_train, SVC, 10)
pred_svm = evaluate_model(SVC(**svm_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [07:09<00:00, 42.95s/trial, best loss: -0.7003136174133103]
Hyperopt search took 429.48 seconds for 200 candidates
Best score: 0.7003 
Best space:  {'C': 0.00075, 'degree': 3, 'kernel': 'poly'}
Accuracy Test : 0.6022 | F1 Test : 0.6934 | Precision Test : 0.8937 | H Test : 0.6446
Accuracy Train: 0.6134 | F1 Train: 0.7021 | Precision Train: 0.9050 | H Train: 0.6548


### Naive Bayes

In [31]:
params = {"alpha":     hp.choice('alpha', [0.025, 0.05, 0.075, 0.1, 0.15, 0.20, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 
                                           2, 2.5, 5, 10]), 
          "fit_prior": hp.choice('fit_prior', [True, False])}

best_score = 1
nb_params = bayesian(params, X_train, y_train, BernoulliNB, 50)
pred_nb = evaluate_model(BernoulliNB(**nb_params), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.05trial/s, best loss: -0.6421491418560226]
Hyperopt search took 24.42 seconds for 200 candidates
Best score: 0.6421 
Best space:  {'alpha': 1, 'fit_prior': True}
Accuracy Test : 0.6465 | F1 Test : 0.6391 | Precision Test : 0.6219 | H Test : 0.6428
Accuracy Train: 0.6457 | F1 Train: 0.6417 | Precision Train: 0.6301 | H Train: 0.6437


## STACKING

Predictions using Random Forest adding the predicions of other models to the original dataset.
I had to split the test set in 2 to create the new train/test set, otherwise i fall into overfitting because my original training set is biased vs the test set

First I append the predictions of the model to the dataset

In [32]:
X_2 = X_test.copy()
y_2 = y_test.copy()

In [33]:
X_2['logistic'] = pred_log
X_2['gbt'] = pred_gbt
X_2['knn'] = pred_knn
X_2['svm'] = pred_svm
X_2['tree'] = pred_tree
X_2['xgb'] = pred_xgb
X_2['nb'] = pred_nb
X_2['rf'] = pred_rf

In [34]:
X_2.iloc[:,-8:].head(2)

Unnamed: 0,logistic,gbt,knn,svm,tree,xgb,nb,rf
333373,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad
127076,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad


In [35]:
X_2.iloc[:,-8:] = X_2.iloc[:,-8:].apply(lambda x: [1 if i=='Good' else 0 for i in x])

In [36]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=100)

### Gradient Boosting Trees

In [37]:
params = {'learning_rate':     hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 
                                                          0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.50]), 
          'n_estimators':      hp.choice('n_estimators', range(1,400)),
          'max_depth':         hp.choice('max_depth',range(1,20)),
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 10, endpoint=True)),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'subsample':         hp.choice('subsample',[1]), 
          'max_features':      hp.choice('max_features',['sqrt'])}

best_score = 1
gbt_params = bayesian(params, X_train_2, y_train_2, GradientBoostingClassifier, 50)
pred_gbt_stck = evaluate_model(GradientBoostingClassifier(**gbt_params), X_train_2, X_test_2, y_train_2, y_test_2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:11<00:00,  3.83s/trial, best loss: -0.6888292216284582]
Hyperopt search took 191.75 seconds for 200 candidates
Best score: 0.6888 
Best space:  {'learning_rate': 0.0005, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 0.060000000000000005, 'min_samples_split': 0.12, 'n_estimators': 192, 'subsample': 1}
Accuracy Test : 0.6385 | F1 Test : 0.6850 | Precision Test : 0.8103 | H Test : 0.6609
Accuracy Train: 0.6472 | F1 Train: 0.6988 | Precision Train: 0.8058 | H Train: 0.6720


### XGB

In [38]:
params = {'learning_rate':    hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                                         0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75]), 
          'max_depth':        hp.choice('max_depth',range(1,20)),
          'min_child_weight': hp.choice('min_child_weight',np.linspace(0.01, 1.0, 100, endpoint=True)),
          'gamma':            hp.choice('gamma',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'colsample_bytree': hp.choice('colsample_bytree',np.linspace(0.0, 1, 101, endpoint=True)), 
          'n_estimators':     hp.choice('n_estimators', range(1,200))}

best_score = 1
xgb_params = bayesian(params, X_train_2, y_train_2, xgb.XGBClassifier, 50)
pred_xgb_stck = evaluate_model(xgb.XGBClassifier(**xgb_params), X_train_2, X_test_2, y_train_2, y_test_2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [11:31<00:00, 13.83s/trial, best loss: -0.6762949391020145]
Hyperopt search took 691.56 seconds for 200 candidates
Best score: 0.6763 
Best space:  {'colsample_bytree': 0.02, 'gamma': 0.67, 'learning_rate': 0.0005, 'max_depth': 4, 'min_child_weight': 0.9600000000000001, 'n_estimators': 91}
Accuracy Test : 0.6410 | F1 Test : 0.6604 | Precision Test : 0.7196 | H Test : 0.6505
Accuracy Train: 0.6620 | F1 Train: 0.6867 | Precision Train: 0.7293 | H Train: 0.6741


### SVM

In [39]:
params = {"degree": hp.choice('degree', [2, 3, 4]),
          "kernel": hp.choice('kernel', ['poly']), 
          "C":      hp.choice('C', [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                    0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75])}
best_score = 1
svm_params = bayesian(params, X_train_2, y_train_2, SVC, 10)
pred_svm_stck = evaluate_model(SVC(**svm_params), X_train_2, X_test_2, y_train_2, y_test_2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [05:13<00:00, 31.33s/trial, best loss: -0.6936698168236641]
Hyperopt search took 313.32 seconds for 200 candidates
Best score: 0.6937 
Best space:  {'C': 0.075, 'degree': 4, 'kernel': 'poly'}
Accuracy Test : 0.6155 | F1 Test : 0.6908 | Precision Test : 0.8856 | H Test : 0.6510
Accuracy Train: 0.6306 | F1 Train: 0.7070 | Precision Train: 0.8774 | H Train: 0.6666


### Logistic Regression

In [40]:
params = {"C":   hp.choice('C',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 
                                0.075, 0.1]),
          "tol": hp.choice('tol',[0.00001, 0.000025, 0.00005, 0.000075, 0.0001, 0.00025, 0.0005, 0.00075, 
                                  0.001, 0.0025, 0.005, 0.0075, 0.01, 0.025, 0.05, 0.075, 0.1])}

best_score = 1
log_params = bayesian(params, X_train_2, y_train_2, LogisticRegression, 50)
pred_log_stck = evaluate_model(LogisticRegression(**log_params), X_train_2, X_test_2, y_train_2, y_test_2)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:24<00:00,  2.07trial/s, best loss: -0.6619575242257707]
Hyperopt search took 24.19 seconds for 200 candidates
Best score: 0.6620 
Best space:  {'C': 0.005, 'tol': 0.005}
Accuracy Test : 0.6665 | F1 Test : 0.6663 | Precision Test : 0.6866 | H Test : 0.6664
Accuracy Train: 0.6596 | F1 Train: 0.6676 | Precision Train: 0.6729 | H Train: 0.6636


### Decision Trees

In [41]:
iterations = 100
params = {"max_depth":        hp.choice('max_depth', range(1, 20)),
          "max_features":     hp.choice('max_features', range(1, 50)),
          "min_samples_leaf": hp.choice('min_samples_leaf', range(1, 50)),
          "criterion":        hp.choice('criterion', ["gini", "entropy"])}

best_score = 1
tree_params = bayesian(params, X_train_2, y_train_2, DecisionTreeClassifier, 150)
pred_tree_stck = evaluate_model(DecisionTreeClassifier(**tree_params), X_train_2, X_test_2, y_train_2, y_test_2)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [00:54<00:00,  2.77trial/s, best loss: -0.6766150494939945]
Hyperopt search took 54.24 seconds for 200 candidates
Best score: 0.6766 
Best space:  {'criterion': 'entropy', 'max_depth': 3, 'max_features': 27, 'min_samples_leaf': 25}
Accuracy Test : 0.6540 | F1 Test : 0.6802 | Precision Test : 0.7588 | H Test : 0.6669
Accuracy Train: 0.6524 | F1 Train: 0.6837 | Precision Train: 0.7396 | H Train: 0.6676


In [45]:
pd.crosstab(pred_gbt_stck, y_test_2)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,786,539
Good,184,491


## BACKWARD ELIMINATION

In [46]:
import random
from random import sample

In [47]:
def try_seed(seed, verbose=True):
    score = check_model('', X_train_2, X_test_2, y_train_2, y_test_2)[1]
    varout = []
    varin = list(X_test_2.columns)

    for n in range(len(varin)):
        max_score = score
        max_feature = []
        random.seed(seed)
        
        for i in sample(varin, len(varin)):
            var_test = varin.copy()
            var_test.remove(i)
            X_train_vartest = X_train_2[var_test]
            X_test_vartest = X_test_2[var_test]
            check = check_model(i, X_train_vartest, X_test_vartest, y_train_2, y_test_2)
            if check[1] > max_score:
                max_feature = check[0]
                max_score = check[1] 
                varin.remove(max_feature)   
                varout.append(max_feature)
                if verbose:
                    print('{0:0=2d}'.format(n), 'Original Score:', f'{score:.4f}', '| New score:', f'{max_score:.4f}', 
                          end='\r', flush=True)
                break

        if max_score > score:
            score = max_score
        else:
            print('Seed:',seed, '<-', f'{score:.4f}','                                                                       ')
            return(varin, score)
            break

### GBT

In [50]:
def check_model(variable, X_train, X_test, y_train, y_test):
    clf = GradientBoostingClassifier(**gbt_params, random_state=1)            
    clf.fit(X_train, y_train)
    score = f1_score(clf.predict(X_train), y_train, pos_label="Bad")
    return(variable, score)

In [51]:
max_score = 0
var_selec = []
for seed in range(4):
    varin, score = try_seed(seed, verbose=True)
    if score > max_score:
        max_score = score
        var_selec = varin

Seed: 0 <- 0.7020                                                                        
Seed: 1 <- 0.7008                                                                        
Seed: 2 <- 0.7003                                                                        
Seed: 3 <- 0.7004                                                                        


In [57]:
X_train_varin = X_train_2[var_selec]
X_test_varin = X_test_2[var_selec] 

clf = SVC(**svm_params, random_state=1)         
# clf = GradientBoostingClassifier(**gbt_params, random_state=1)            

clf.fit(X_train_varin, y_train_2)
print("F1-Score: ", f1_score(clf.predict(X_test_varin), y_test_2, pos_label='Bad'))
print("Accuracy: ", accuracy_score(clf.predict(X_test_varin), y_test_2))
print("Recall:   ", recall_score(clf.predict(X_test_varin), y_test_2, pos_label='Bad'))
print("Precision:", precision_score(clf.predict(X_test_varin), y_test_2, pos_label='Bad'))
print("H-Score  :", H_score(clf.predict(X_test_varin), y_test_2))

F1-Score:  0.6962127316680096
Accuracy:  0.623
Recall:    0.5714285714285714
Precision: 0.8907216494845361
H-Score  : 0.6575748117299465


In [55]:
pd.crosstab(clf.predict(X_test_varin), y_test_2)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,864,648
Good,106,382


## CONFIDENCE INTERVAL

In [None]:
pred_f1 = []
pred_ac = []
pred_H = []

for i in range(100):
    sample = np.random.randint(0, len(X_test_varin), size=len(X_test_varin))
    X_sample = X_test_varin.iloc[sample]
    y_sample = y_test_2.iloc[sample]
    print(i, end='\r', flush=True)
    pred_f1.append(f1_score(clf.predict(X_sample), y_sample, pos_label='Bad'))
    pred_ac.append(accuracy_score(clf.predict(X_sample), y_sample))
    pred_H.append(H_score(clf.predict(X_sample), y_sample))

In [None]:
plt.hist(pred_ac, bins=25, alpha=0.4)
plt.hist(pred_H, bins=25, alpha=0.4)
plt.hist(pred_f1, bins=25, alpha=0.4)

In [None]:
print('            2.5%    50%  97.5%')
print('H-Score: ',f'{np.percentile(pred_H, 2.5):.4f}',f'{np.percentile(pred_H, 50):.4f}',f'{np.percentile(pred_H, 97.5):.4f}')
print('Accuracy:',f'{np.percentile(pred_ac,2.5):.4f}',f'{np.percentile(pred_ac,50):.4f}',f'{np.percentile(pred_ac,97.5):.4f}')
print('F1-Score:',f'{np.percentile(pred_f1,2.5):.4f}',f'{np.percentile(pred_f1,50):.4f}',f'{np.percentile(pred_f1,97.5):.4f}')