In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import pickle
from matplotlib import pyplot as plt

## PREPARATION

### Functions

In [2]:
import time
from collections import Counter

import xgboost as xgb
from catboost import CatBoostClassifier
from hyperopt import hp, fmin, tpe, rand, STATUS_OK, Trials, space_eval
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import (KFold, cross_val_score, RandomizedSearchCV, GridSearchCV,
                                     StratifiedKFold, train_test_split)
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
import warnings


def warn(*args, **kwargs):
    """Stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Rebind the module attribute so later ``warnings.warn(...)`` lookups reach the no-op above.
warnings.warn = warn

In [4]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split, print its metrics, return its test predictions.

    ``model`` itself is fitted (no copy is made). The metric report comes from
    ``print_result``; the return value is ``model.predict(X_test)``.
    """
    model.fit(X_train, y_train)
    print_result(model, X_train, X_test, y_train, y_test)
    test_predictions = model.predict(X_test)
    return test_predictions

In [5]:
def print_result(clf, X_train, X_test, y_train, y_test):
    """Print one metrics line for the test split, then one for the training split.

    Each line reports accuracy, F1, precision and recall (the last three with "Bad"
    as the positive label) plus ``H_score``, all formatted to four decimals.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, X_test : feature sets of the two splits.
    y_train, y_test : true labels of the two splits.
    """
    def _report(split, features, y_true):
        # Predict once per split and reuse the result for every metric.
        y_pred = clf.predict(features)
        print(f'Accuracy {split} :', f'{accuracy_score(y_true, y_pred):.4f}',
              f'| F1 {split} :', f'{f1_score(y_true, y_pred, pos_label="Bad"):.4f}',
              f'| Precision {split} :', f'{precision_score(y_true, y_pred, pos_label="Bad"):.4f}',
              f'| Recall {split} :', f'{recall_score(y_true, y_pred, pos_label="Bad"):.4f}',
              f'| H {split} :', f'{H_score(y_true, y_pred):.4f}')

    _report('Test', X_test, y_test)
    # The training line is labelled "Train" so the two lines can be told apart.
    _report('Train', X_train, y_train)

In [6]:
def H_score(X_train, y_train):
    """Harmonic mean of accuracy and F1 ("Bad" as positive label) of two label sequences.

    Despite their names, both arguments are label sequences: ``print_result`` calls this
    as ``H_score(true_labels, predictions)``. ``y_train`` is passed to the metric functions
    as their first argument and ``X_train`` as their second. A 1e-7 offset is added to each
    metric before taking reciprocals, so a metric of exactly 0 does not divide by zero.
    """
    eps = 0.0000001
    inv_accuracy = 1 / (accuracy_score(y_train, X_train) + eps)
    inv_f1 = 1 / (f1_score(y_train, X_train, pos_label = "Bad") + eps)
    return 2 / (inv_accuracy + inv_f1)

In [7]:
def bayesian(space, X, y, modelo, nevals):
    """Tune ``modelo`` with hyperopt's TPE search and return the best hyper-parameters.

    Every candidate is scored by the mean F1 ("Bad" as positive label) over a 5-fold
    stratified cross-validation; hyperopt minimises the negated mean.

    Parameters
    ----------
    space : dict
        Hyperopt search space; each sampled point is unpacked into ``modelo`` as
        keyword arguments.
    X, y :
        Features and labels handed to ``cross_val_score``.
    modelo : callable
        Estimator class (or factory); it must also accept ``random_state``.
    nevals : int
        Number of evaluations (``max_evals`` of ``fmin``).

    Returns
    -------
    The result of ``space_eval(space, best)``: the winning point expressed as actual
    parameter values rather than ``hp.choice`` indices.
    """
    f1 = make_scorer(f1_score, pos_label = "Bad")
    # Folds are not shuffled, so no random_state is passed: recent scikit-learn raises
    # when random_state is set while shuffle is False.
    cv = StratifiedKFold(n_splits = 5)
    # Losses are collected locally rather than through a module-level ``best_score``.
    losses = []

    def objective(candidate):
        model = modelo(**candidate, random_state = 1)
        loss = -cross_val_score(model, X, y, cv = cv, scoring = f1, verbose = False).mean()
        losses.append(loss)
        return loss

    start = time.time()
    # NOTE(review): newer hyperopt releases may expect a numpy Generator for ``rstate`` -
    # confirm against the installed version.
    rstate = np.random.RandomState(1)
    best = fmin(objective, space = space, algo = tpe.suggest, max_evals = nevals, trials = Trials(), rstate = rstate)

    print("Hyperopt search took %.2f seconds" % ((time.time() - start)))
    best_loss = min(losses) if losses else float('nan')
    print("Best score: %.4f " % (-best_loss))
    # Resolve against the space that was actually searched (the ``space`` argument).
    best_params = space_eval(space, best)
    print("Best space: ", best_params)
    return best_params

### Create Data Frame

In [8]:
# Load the two pickled frames; the context managers close the file handles.
# NOTE: pickle.load can execute arbitrary code - only use it on trusted files.
with open('./sav/df_balanced.sav', 'rb') as fh:
    df_balanced = pickle.load(fh)
with open('./sav/df_subset.sav', 'rb') as fh:
    df_subset = pickle.load(fh)
# Stacking sample: 10,000 rows of df_balanced whose index labels are not in df_subset.
df_stacking = df_balanced.drop(df_subset.index).sample(n=10000, random_state=11)

In [9]:
# Features: every column of df_subset except the first and the last; target: 'Category'.
X = df_subset.loc[:, df_subset.columns[1:-1]]
y = df_subset['Category']

In [10]:
# Stratified 80/20 split of the current X / y (train_test_split comes from the import cell).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

In [11]:
# Shapes in the order: X_test, y_test, X_train, y_train.
tuple(part.shape for part in (X_test, y_test, X_train, y_train))

((2000, 38), (2000,), (8000, 38), (8000,))

In [12]:
# From here on X / y refer to the stacking sample instead of df_subset.
y = df_stacking['Category']
# Explicit copy: X gets new columns later, and assigning into a slice of df_stacking
# is the pattern pandas flags with SettingWithCopyWarning.
X = df_stacking[df_stacking.columns[1:-1]].copy()

### Add Predictions

In [13]:
# Load the previously fitted base models; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code - only use it on trusted files.
with open('./sav/model_f1.sav', 'rb') as fh:
    models = pickle.load(fh)

In [14]:
# models[0] is queried for the 100 nearest neighbours of every row of X; the prediction is
# the most common label among them, with labels looked up positionally in `y`.
# NOTE(review): the positions returned by query() index the data models[0] was built on,
# while `y` is the stacking target at this point - confirm that `y` (rather than the
# labels of the data the tree was fitted on) is the intended lookup.
pred_knn = [
    Counter(y.iloc[list(neighbours)]).most_common(1)[0][0]
    for neighbours in models[0].query(X, k = 100)[1]
]

# models[1] ... models[8] produce, in this order, the predictions named on the left.
(pred_log, pred_svm, pred_svm_2, pred_tree,
 pred_rf, pred_gbt, pred_xgb, pred_cat) = (models[i].predict(X) for i in range(1, 9))

Next, append each base model's predictions to the dataset as new feature columns.

In [15]:
# Add one column per base model, in the same order as before (logistic ... cat).
X = X.assign(
    logistic=pred_log,
    gbt=pred_gbt,
    knn=pred_knn,
    svm=pred_svm,
    svm_2=pred_svm_2,
    tree=pred_tree,
    xgb=pred_xgb,
    rf=pred_rf,
    cat=pred_cat,
)

In [16]:
# First five rows of the last nine columns (the appended predictions).
X.iloc[:5, -9:]

Unnamed: 0,logistic,gbt,knn,svm,svm_2,tree,xgb,rf,cat
231987,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad
178130,Good,Good,Good,Good,Good,Good,Good,Good,Good
266651,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad
428706,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad
408969,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad,Bad


In [17]:
# Recode the last nine columns (the prediction columns appended above) to integers:
# 'Good' -> 1, any other value -> 0.
X.iloc[:,-9:] = X.iloc[:,-9:].apply(lambda x: [1 if i=='Good' else 0 for i in x])

In [18]:
# Stratified 80/20 split of the stacking features (original columns plus the encoded
# base-model predictions); train_test_split comes from the import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

## MODELS

### Gradient Boosting Trees

In [19]:
# Hyperopt search space for the gradient-boosting meta-model (every entry is a discrete choice).
params = dict(
    learning_rate=hp.choice('learning_rate', [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005,
                                              0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.50, 0.75, 1]),
    n_estimators=hp.choice('n_estimators', range(1, 400)),
    max_depth=hp.choice('max_depth', range(1, 20)),
    min_samples_split=hp.choice('min_samples_split', np.linspace(0.01, 1.0, 10)),
    min_samples_leaf=hp.choice('min_samples_leaf', np.linspace(0.01, 0.5, 50)),
    subsample=hp.choice('subsample', [1]),
    max_features=hp.choice('max_features', ['sqrt']),
)

best_score = 1  # reset the module-level value before each search
# 50 search evaluations, then refit with the best parameters and report test / train metrics.
gbt_params = bayesian(params, X_train, y_train, GradientBoostingClassifier, 50)
pred_gbt_stck = evaluate_model(GradientBoostingClassifier(random_state=22, **gbt_params),
                               X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:46<00:00,  3.32s/trial, best loss: -0.699247423326867]
Hyperopt search took 166.28 seconds
Best score: 0.6992 
Best space:  {'learning_rate': 0.00075, 'max_depth': 18, 'max_features': 'sqrt', 'min_samples_leaf': 0.15000000000000002, 'min_samples_split': 0.12, 'n_estimators': 82, 'subsample': 1}
Accuracy Test : 0.5950 | F1 Test : 0.6980 | Precision Test : 0.5625 | Recall Test : 0.9194 | H Test : 0.6424
Accuracy Test : 0.5964 | F1 Test : 0.6993 | Precision Test : 0.5633 | Recall Test : 0.9217 | H Test : 0.6437


### XGBoost

In [20]:
# Hyperopt search space for the XGBoost meta-model (every entry is a discrete choice).
# NOTE(review): the colsample_bytree grid starts at 0.0, while XGBoost documents the
# range as (0, 1] - confirm that endpoint is intended.
params = dict(
    learning_rate=hp.choice('learning_rate', [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075,
                                              0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75]),
    max_depth=hp.choice('max_depth', range(1, 20)),
    min_child_weight=hp.choice('min_child_weight', np.linspace(0.01, 1.0, 100)),
    gamma=hp.choice('gamma', np.linspace(0.01, 1.0, 100)),
    colsample_bytree=hp.choice('colsample_bytree', np.linspace(0.0, 1, 101)),
    n_estimators=hp.choice('n_estimators', range(1, 200)),
)

best_score = 1  # reset the module-level value before each search
# 50 search evaluations, then refit with the best parameters and report test / train metrics.
xgb_params = bayesian(params, X_train, y_train, xgb.XGBClassifier, 50)
pred_xgb_stck = evaluate_model(xgb.XGBClassifier(random_state=22, **xgb_params),
                               X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [21:20<00:00, 25.60s/trial, best loss: -0.6682564734072931]
Hyperopt search took 1280.32 seconds
Best score: 0.6683 
Best space:  {'colsample_bytree': 0.89, 'gamma': 0.68, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 0.55, 'n_estimators': 6}
Accuracy Test : 0.6535 | F1 Test : 0.6787 | Precision Test : 0.6427 | Recall Test : 0.7191 | H Test : 0.6659
Accuracy Test : 0.6535 | F1 Test : 0.6759 | Precision Test : 0.6452 | Recall Test : 0.7098 | H Test : 0.6645


### SVM (Poly)

In [21]:
# Hyperopt search space for the polynomial-kernel SVM meta-model.
params = dict(
    degree=hp.choice('degree', [2, 3, 4]),
    kernel=hp.choice('kernel', ['poly']),
    C=hp.choice('C', [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075,
                      0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75]),
)

best_score = 1  # reset the module-level value before each search
# 10 search evaluations, then refit with the best parameters and report test / train metrics.
svm_params = bayesian(params, X_train, y_train, SVC, 10)
pred_svm_stck = evaluate_model(SVC(random_state=22, **svm_params),
                               X_train, X_test, y_train, y_test)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [05:41<00:00, 34.10s/trial, best loss: -0.696183616500966]
Hyperopt search took 341.05 seconds
Best score: 0.6962 
Best space:  {'C': 0.0005, 'degree': 4, 'kernel': 'poly'}
Accuracy Test : 0.5775 | F1 Test : 0.6988 | Precision Test : 0.5484 | Recall Test : 0.9627 | H Test : 0.6324
Accuracy Test : 0.5741 | F1 Test : 0.6964 | Precision Test : 0.5466 | Recall Test : 0.9592 | H Test : 0.6294


### SVM (RBF)

In [22]:
# Hyperopt search space for the RBF-kernel SVM meta-model.
params = {'C':      hp.choice('C', [1, 2, 5, 10, 15, 20]),
          'gamma':  hp.choice('gamma', [0.0001, 0.001, 0.01, 0.1]),
          'kernel': hp.choice('kernel', ['rbf'])}

best_score = 1  # reset the module-level value before each search
svm_params_2 = bayesian(params, X_train, y_train, SVC, 10)
# Stored under a `_stck` name, like the other meta-model predictions, so the base-model
# `pred_svm_2` (already used as a feature column) is not overwritten.
pred_svm_2_stck = evaluate_model(SVC(**svm_params_2, random_state=22), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [07:52<00:00, 47.28s/trial, best loss: -0.6631869662235699]
Hyperopt search took 472.79 seconds
Best score: 0.6632 
Best space:  {'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}
Accuracy Test : 0.6495 | F1 Test : 0.6670 | Precision Test : 0.6458 | Recall Test : 0.6896 | H Test : 0.6581
Accuracy Test : 0.6560 | F1 Test : 0.6698 | Precision Test : 0.6550 | Recall Test : 0.6852 | H Test : 0.6628


### Logistic Regression

In [23]:
# Hyperopt search space for the logistic-regression meta-model: inverse regularisation
# strength C and stopping tolerance tol.
params = {"C":   hp.choice('C',[0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1]),
          "tol": hp.choice('tol',[0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025,
                                  0.05, 0.1])}

best_score = 1  # reset the module-level value before each search
# 50 search evaluations, then refit with the best parameters and report test / train metrics.
log_params = bayesian(params, X_train, y_train, LogisticRegression, 50)
pred_log_stck = evaluate_model(LogisticRegression(**log_params, random_state=22), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:36<00:00,  1.37trial/s, best loss: -0.6615181023165619]
Hyperopt search took 36.47 seconds
Best score: 0.6615 
Best space:  {'C': 0.01, 'tol': 0.0025}
Accuracy Test : 0.6495 | F1 Test : 0.6628 | Precision Test : 0.6494 | Recall Test : 0.6768 | H Test : 0.6561
Accuracy Test : 0.6556 | F1 Test : 0.6644 | Precision Test : 0.6593 | Recall Test : 0.6695 | H Test : 0.6600


### Random Forest

In [24]:
# Hyperopt search space for the random-forest meta-model.
# max_features is fixed to 'sqrt': for classifiers 'auto' was only an alias of 'sqrt',
# and it is no longer accepted by current scikit-learn releases.
params = {'bootstrap':         hp.choice('bootstrap',[True, False]),
          'max_depth':         hp.choice('max_depth', range(1, 20)),
          'max_features':      hp.choice('max_features',['sqrt']),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)),
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 100, endpoint=True)),
          'n_estimators':      hp.choice('n_estimators',range(1,400))}

best_score = 1  # reset the module-level value before each search
# 50 search evaluations, then refit with the best parameters and report test / train metrics.
rf_params = bayesian(params, X_train, y_train, RandomForestClassifier, 50)
pred_rf_stck = evaluate_model(RandomForestClassifier(**rf_params, random_state=22), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [05:29<00:00,  6.59s/trial, best loss: -0.6747286789792464]
Hyperopt search took 329.53 seconds
Best score: 0.6747 
Best space:  {'bootstrap': True, 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 0.4, 'min_samples_split': 0.8200000000000001, 'n_estimators': 387}
Accuracy Test : 0.5090 | F1 Test : 0.6746 | Precision Test : 0.5090 | Recall Test : 1.0000 | H Test : 0.5802
Accuracy Test : 0.5091 | F1 Test : 0.6747 | Precision Test : 0.5091 | Recall Test : 1.0000 | H Test : 0.5803


### CatBoost

In [25]:
# Column positions 5..45 (inclusive) are passed to CatBoost as categorical features.
# NOTE(review): if X_train has more than 46 columns, the last one(s) are not treated as
# categorical - confirm the upper bound.
cat_features = list(range(5, 46))
params = {'iterations':        hp.choice('iterations', range(100, 4000, 25)),
          'learning_rate':     hp.choice('learning_rate', [0.001, 0.0025, 0.0075, 0.01, 0.025, 0.05, 0.1]),
          'l2_leaf_reg':       hp.choice('l2_leaf_reg', range(1, 10)),
          'cat_features':      hp.choice('cat_features', [cat_features]),
          'verbose':           hp.choice('verbose', [False])}

best_score = 1  # reset the module-level value before each search
cat_params = bayesian(params, X_train, y_train, CatBoostClassifier, 10)
# Stored under a `_stck` name, like the other meta-model predictions, so the base-model
# `pred_cat` (already used as a feature column) is not overwritten.
pred_cat_stck = evaluate_model(CatBoostClassifier(**cat_params, random_state=22), X_train, X_test, y_train, y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [35:39<00:00, 213.98s/trial, best loss: -0.6633100343455033]
Hyperopt search took 2139.89 seconds
Best score: 0.6633 
Best space:  {'cat_features': (5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45), 'iterations': 1050, 'l2_leaf_reg': 2, 'learning_rate': 0.0075, 'verbose': False}
Accuracy Test : 0.6525 | F1 Test : 0.6692 | Precision Test : 0.6491 | Recall Test : 0.6906 | H Test : 0.6607
Accuracy Test : 0.6835 | F1 Test : 0.6941 | Precision Test : 0.6832 | Recall Test : 0.7054 | H Test : 0.6888


### Decision Trees

In [26]:
# Hyperopt search space for the decision-tree meta-model.
params = {"max_depth":        hp.choice('max_depth', range(1, 50)),
          "max_features":     hp.choice('max_features', range(1, X_train.columns.size)),
          "min_samples_leaf": hp.choice('min_samples_leaf', range(1, 200)),
          "criterion":        hp.choice('criterion', ["gini", "entropy"])}

best_score = 1  # reset the module-level value before each search
tree_params = bayesian(params, X_train, y_train, DecisionTreeClassifier, 50)
# Stored under a `_stck` name, like the other meta-model predictions, so the base-model
# `pred_tree` (already used as a feature column) is not overwritten.
pred_tree_stck = evaluate_model(DecisionTreeClassifier(**tree_params, random_state=22), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:14<00:00,  3.34trial/s, best loss: -0.6734143768052325]
Hyperopt search took 15.05 seconds
Best score: 0.6734 
Best space:  {'criterion': 'gini', 'max_depth': 8, 'max_features': 1, 'min_samples_leaf': 131}
Accuracy Test : 0.5430 | F1 Test : 0.4961 | Precision Test : 0.5653 | Recall Test : 0.4420 | H Test : 0.5185
Accuracy Test : 0.5581 | F1 Test : 0.4988 | Precision Test : 0.5903 | Recall Test : 0.4319 | H Test : 0.5268


### Best Model

In [27]:
# Gradient-boosting parameters written out literally (they match the "Best space" printed
# by the gradient-boosting search), so this cell can be run without repeating the search.
gbt_params = dict(
    learning_rate=0.00075,
    max_depth=18,
    max_features='sqrt',
    min_samples_leaf=0.15000000000000002,
    min_samples_split=0.12,
    n_estimators=82,
    subsample=1,
)
pred_gbt_stck = evaluate_model(GradientBoostingClassifier(random_state=22, **gbt_params),
                               X_train, X_test, y_train, y_test)

Accuracy Test : 0.5950 | F1 Test : 0.6980 | Precision Test : 0.5625 | Recall Test : 0.9194 | H Test : 0.6424
Accuracy Test : 0.5964 | F1 Test : 0.6993 | Precision Test : 0.5633 | Recall Test : 0.9217 | H Test : 0.6437


In [28]:
# Confusion table: rows are the gradient-boosting predictions, columns the true test labels.
pd.crosstab(index=pred_gbt_stck, columns=y_test)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,936,728
Good,82,254


### Save Models

In [29]:
# Refit every tuned meta-model on the training split for persistence. Each estimator gets
# the same random_state=22 used in the evaluation cells, so the saved models are
# reproducible. The tuple order is: logistic, SVM (poly), SVM (RBF), decision tree,
# random forest, gradient boosting, XGBoost, CatBoost.
models = (LogisticRegression(**log_params, max_iter = 1000, random_state=22).fit(X_train, y_train),
          SVC(**svm_params, random_state=22).fit(X_train, y_train),
          SVC(**svm_params_2, random_state=22).fit(X_train, y_train),
          DecisionTreeClassifier(**tree_params, random_state=22).fit(X_train, y_train),
          RandomForestClassifier(**rf_params, random_state=22).fit(X_train, y_train),
          GradientBoostingClassifier(**gbt_params, random_state=22).fit(X_train, y_train),
          xgb.XGBClassifier(**xgb_params, random_state=22).fit(X_train, y_train),
          CatBoostClassifier(**cat_params, random_state=22).fit(X_train, y_train)
)

In [30]:
# Write the fitted models; the context manager flushes and closes the file.
with open('./sav/model_f1stack.sav', 'wb') as fh:
    pickle.dump(models, fh)