### Open File

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import pickle
from matplotlib import pyplot as plt

In [2]:
X, y = pickle.load(open('./data/xy_model.sav', 'rb'))

Split into Train and Test

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

In [4]:
X_test.shape, y_test.shape, X_train.shape, y_train.shape

((2000, 38), (2000,), (8000, 38), (8000,))

## MODELS

### Functions

In [5]:
import time
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, precision_score, recall_score, make_scorer
from collections import Counter
from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from hyperopt import hp, fmin, tpe, rand, STATUS_OK, Trials, space_eval
import xgboost as xgb

In [6]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [7]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    clf = model
    clf.fit(X_train, y_train)
    print_result(clf, X_train, X_test, y_train, y_test)
    return(clf.predict(X_test))

In [8]:
def print_result(clf, X_train, X_test, y_train, y_test):
    print('Accuracy Test :', f'{accuracy_score(clf.predict(X_test), y_test):.4f}', 
          '| F1 Test :', f'{f1_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}',
          '| Precision Test :', f'{precision_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}', 
          '| Recall Test :', f'{recall_score(clf.predict(X_test), y_test, pos_label="Bad"):.4f}', 
          '| H Test :', f'{H_score(clf.predict(X_test), y_test):.4f}')
    
    print('Accuracy Train:', f'{accuracy_score(clf.predict(X_train), y_train):.4f}', 
          '| F1 Train:', f'{f1_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}',
          '| Precision Train:', f'{precision_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}', 
          '| Recall Train:', f'{recall_score(clf.predict(X_train), y_train, pos_label="Bad"):.4f}', 
          '| H Train:', f'{H_score(clf.predict(X_train), y_train):.4f}')

In [9]:
def H_score(X_train, y_train):
    acc = accuracy_score(X_train, y_train)
    f1 = f1_score(X_train, y_train, pos_label = "Bad")
    return(2 / ((1/(acc+0.00001))+(1/(f1+0.00001))))

In [32]:
def bayesian(space, X, y, modelo, nevals):
    
    f1 = make_scorer(f1_score, pos_label = "Bad")
    H = make_scorer(H_score, greater_is_better=True) 
    
    def objective(space):        
        global best_score
        model = modelo(**space, random_state = 1)   
        cv =  StratifiedKFold(n_splits = 5, random_state = 1)
        score = -cross_val_score(model, X, y, cv = cv, scoring = H, verbose = False).mean()
        if (score < best_score):
            best_score = score
        return score

    start = time.time()
    rstate = np.random.RandomState(1)
    best = fmin(objective, space = space, algo = tpe.suggest, max_evals = nevals, trials = Trials(), rstate = rstate)

    print("Hyperopt search took %.2f seconds" % ((time.time() - start)))
    print("Best score: %.4f " % (-best_score))
    print("Best space: ", space_eval(params, best))
    return(space_eval(params, best))

### KNN (KDTree Implementation)

In [11]:
tree = KDTree(X_train)

In [12]:
nearest_dist, nearest_ind = tree.query(X_test, k = 114)

In [13]:
pred_knn = [Counter([y_train.iloc[k] for k in x]).most_common(1)[0][0] for x in nearest_ind]

In [14]:
print('Accuracy Test :', f'{accuracy_score(pred_knn, y_test):.4f}', 
      '| F1 Test :', f'{f1_score(pred_knn, y_test, pos_label="Bad"):.4f}',
      '| Precision Test :', f'{precision_score(pred_knn, y_test, pos_label="Bad"):.4f}', 
      '| Recall Test :', f'{recall_score(pred_knn, y_test, pos_label="Bad"):.4f}', 
      '| H Test :', f'{H_score(pred_knn, y_test):.4f}')

Accuracy Test : 0.6715 | F1 Test : 0.6677 | Precision Test : 0.6600 | Recall Test : 0.6755 | H Test : 0.6696


### Gradient Boosted Trees

In [15]:
params = {'learning_rate':     hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 
                                                          0.0075, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.50, 0.75, 1]), 
          'n_estimators':      hp.choice('n_estimators', range(1,400)),
          'max_depth':         hp.choice('max_depth',range(1,20)),
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 10, endpoint=True)),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'subsample':         hp.choice('subsample',[1]), 
          'max_features':      hp.choice('max_features',['sqrt'])}

best_score = 1
gbt_params = bayesian(params, X_train, y_train, GradientBoostingClassifier, 50)
pred_gbt = evaluate_model(GradientBoostingClassifier(**gbt_params), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 50/50 [02:04<00:00,  2.49s/trial, best loss: -0.6627344838217297]
Hyperopt search took 124.56 seconds for 200 candidates
Best score: 0.6627 
Best space:  {'learning_rate': 0.025, 'max_depth': 14, 'max_features': 'sqrt', 'min_samples_leaf': 0.06999999999999999, 'min_samples_split': 0.01, 'n_estimators': 145, 'subsample': 1}
Accuracy Test : 0.6630 | F1 Test : 0.6606 | Precision Test : 0.6560 | Recall Test : 0.6653 | H Test : 0.6618
Accuracy Train: 0.6676 | F1 Train: 0.6701 | Precision Train: 0.6756 | Recall Train: 0.6648 | H Train: 0.6689


### Random Forest

In [16]:
params = {'bootstrap':         hp.choice('bootstrap',[True, False]),
          'max_depth':         hp.choice('max_depth', range(1, 20)),
          'max_features':      hp.choice('max_features',['auto', 'sqrt']),
          'min_samples_leaf':  hp.choice('min_samples_leaf',np.linspace(0.01, 0.5, 50, endpoint=True)), 
          'min_samples_split': hp.choice('min_samples_split',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'n_estimators':      hp.choice('n_estimators',range(1,400))}

best_score = 1
rf_params = bayesian(params, X_train, y_train, RandomForestClassifier, 50)
pred_rf = evaluate_model(RandomForestClassifier(**rf_params), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 50/50 [02:03<00:00,  2.48s/trial, best loss: -0.6590016134314729]
Hyperopt search took 124.13 seconds for 200 candidates
Best score: 0.6590 
Best space:  {'bootstrap': True, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 0.01, 'min_samples_split': 0.12, 'n_estimators': 145}
Accuracy Test : 0.6635 | F1 Test : 0.6706 | Precision Test : 0.6850 | Recall Test : 0.6568 | H Test : 0.6670
Accuracy Train: 0.6607 | F1 Train: 0.6713 | Precision Train: 0.6931 | Recall Train: 0.6508 | H Train: 0.6660


### XGBoosting

In [17]:
params = {'learning_rate':    hp.choice('learning_rate',[0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                                         0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75]), 
          'max_depth':        hp.choice('max_depth',range(1,20)),
          'min_child_weight': hp.choice('min_child_weight',np.linspace(0.01, 1.0, 100, endpoint=True)),
          'gamma':            hp.choice('gamma',np.linspace(0.01, 1.0, 100, endpoint=True)), 
          'colsample_bytree': hp.choice('colsample_bytree',np.linspace(0.0, 1, 101, endpoint=True)), 
          'n_estimators':     hp.choice('n_estimators', range(1,200))}

best_score = 1
xgb_params = bayesian(params, X_train, y_train, xgb.XGBClassifier, 50)
pred_xgb = evaluate_model(xgb.XGBClassifier(**xgb_params), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 50/50 [04:25<00:00,  5.32s/trial, best loss: -0.6654064668150198]
Hyperopt search took 265.96 seconds for 200 candidates
Best score: 0.6654 
Best space:  {'colsample_bytree': 0.25, 'gamma': 0.75, 'learning_rate': 0.5, 'max_depth': 1, 'min_child_weight': 0.3, 'n_estimators': 137}
Accuracy Test : 0.6690 | F1 Test : 0.6697 | Precision Test : 0.6710 | Recall Test : 0.6683 | H Test : 0.6693
Accuracy Train: 0.6686 | F1 Train: 0.6717 | Precision Train: 0.6783 | Recall Train: 0.6652 | H Train: 0.6702


### Logistic Regresion

In [18]:
params = {"C":   hp.choice('C',[0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1]),
          ""
          "tol": hp.choice('tol',[0.00001, 0.000025, 0.00005, 0.0001, 0.00025, 0.0005, 0.001, 0.0025, 0.005, 0.01, 0.025, 
                                  0.05, 0.1])}

best_score = 1
log_params = bayesian(params, X_train, y_train, LogisticRegression, 50)
pred_log = evaluate_model(LogisticRegression(**log_params, max_iter = 1000), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 50/50 [00:16<00:00,  3.12trial/s, best loss: -0.6652393230847832]
Hyperopt search took 16.04 seconds for 200 candidates
Best score: 0.6652 
Best space:  {'C': 0.1, 'tol': 0.1}
Accuracy Test : 0.6660 | F1 Test : 0.6599 | Precision Test : 0.6480 | Recall Test : 0.6722 | H Test : 0.6629
Accuracy Train: 0.6719 | F1 Train: 0.6706 | Precision Train: 0.6683 | Recall Train: 0.6729 | H Train: 0.6712


### Decision Trees

In [19]:
params = {"max_depth":        hp.choice('max_depth', range(1, 50)),
          "max_features":     hp.choice('max_features', range(1, 50)),
          "min_samples_leaf": hp.choice('min_samples_leaf', range(1, 200)),
          "criterion":        hp.choice('criterion', ["gini", "entropy"])}

best_score = 1
tree_params = bayesian(params, X_train, y_train, DecisionTreeClassifier, 50)
pred_tree = evaluate_model(DecisionTreeClassifier(**tree_params), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 50/50 [00:10<00:00,  4.61trial/s, best loss: -0.6572264037714508]
Hyperopt search took 10.91 seconds for 200 candidates
Best score: 0.6572 
Best space:  {'criterion': 'entropy', 'max_depth': 3, 'max_features': 35, 'min_samples_leaf': 135}
Accuracy Test : 0.6505 | F1 Test : 0.6881 | Precision Test : 0.7710 | Recall Test : 0.6213 | H Test : 0.6688
Accuracy Train: 0.6441 | F1 Train: 0.6831 | Precision Train: 0.7674 | Recall Train: 0.6154 | H Train: 0.6630


### SVM (poly)

In [20]:
params = {"degree": hp.choice('degree', [2, 3, 4]),
          "kernel": hp.choice('kernel', ['poly']), 
          "C":      hp.choice('C', [0.0001, 0.00025, 0.0005, 0.00075, 0.001, 0.0025, 0.005, 0.0075, 
                                    0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75])}
best_score = 1
svm_params = bayesian(params, X_train, y_train, SVC, 10)
pred_svm = evaluate_model(SVC(**svm_params), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 10/10 [02:13<00:00, 13.32s/trial, best loss: -0.6700120050386014]
Hyperopt search took 133.24 seconds for 200 candidates
Best score: 0.6700 
Best space:  {'C': 0.1, 'degree': 2, 'kernel': 'poly'}
Accuracy Test : 0.6615 | F1 Test : 0.6850 | Precision Test : 0.7360 | Recall Test : 0.6406 | H Test : 0.6730
Accuracy Train: 0.6650 | F1 Train: 0.6927 | Precision Train: 0.7554 | Recall Train: 0.6396 | H Train: 0.6786


### SVM (rbf)

In [21]:
params = {'C':      hp.choice('C', [1, 2, 5, 10, 15, 20]), 
          'gamma':  hp.choice('gamma', [0.0001, 0.001, 0.01, 0.1]),
          'kernel': hp.choice('kernel', ['rbf'])}

best_score = 1
svm_params_2 = bayesian(params, X_train, y_train, SVC, 10)
pred_svm_2 = evaluate_model(SVC(**svm_params_2), X_train, X_test, y_train, y_test)

100%|███████████████████████████████████████████████| 10/10 [02:38<00:00, 15.83s/trial, best loss: -0.6688779415682038]
Hyperopt search took 158.36 seconds for 200 candidates
Best score: 0.6689 
Best space:  {'C': 15, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy Test : 0.6675 | F1 Test : 0.6677 | Precision Test : 0.6680 | Recall Test : 0.6673 | H Test : 0.6676
Accuracy Train: 0.6711 | F1 Train: 0.6743 | Precision Train: 0.6811 | Recall Train: 0.6676 | H Train: 0.6727


### Best Model (SVM)

In [22]:
svm_params =  {'C': 0.1, 'degree': 2, 'kernel': 'poly'}
pred_svm = evaluate_model(SVC(**svm_params), X_train, X_test, y_train, y_test)

Accuracy Test : 0.6615 | F1 Test : 0.6850 | Precision Test : 0.7360 | Recall Test : 0.6406 | H Test : 0.6730
Accuracy Train: 0.6650 | F1 Train: 0.6927 | Precision Train: 0.7554 | Recall Train: 0.6396 | H Train: 0.6786


In [23]:
pd.crosstab(pred_svm, y_test)

Category,Bad,Good
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Bad,736,413
Good,264,587


### Save Models

In [30]:
models = (tree,
          LogisticRegression(**log_params, max_iter = 1000).fit(X_train, y_train),
          SVC(**svm_params).fit(X_train, y_train),
          DecisionTreeClassifier(**tree_params).fit(X_train, y_train),
          RandomForestClassifier(**rf_params).fit(X_train, y_train),
          GradientBoostingClassifier(**gbt_params).fit(X_train, y_train),
          xgb.XGBClassifier(**xgb_params).fit(X_train, y_train)
)

In [31]:
pickle.dump(models, open('./sav/model_H.sav', 'wb'))