In [19]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn's fit-failure / convergence
# warnings raised during the grid searches below, so failed parameter
# combinations will not be reported.
warnings.filterwarnings("ignore")

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix, precision_score
from data_prep.data_prep import *
from sklearn.model_selection import StratifiedKFold

In [21]:
def _confusion_cell(row, col):
    """Return a scorer that reports one cell of the binary confusion matrix.

    ``labels=[0, 1]`` pins the matrix to 2x2 (row = true class, column =
    predicted class) even when a fold happens to contain a single class;
    without it the matrix collapses to 1x1 and indexing ``[1][0]`` etc. raises
    ``IndexError``. The rest of this notebook already treats the target as
    0 = negative, 1 = positive.
    """
    def cell(y_true, y_pred):
        return confusion_matrix(y_true, y_pred, labels=[0, 1])[row, col]
    return make_scorer(cell)


# Raw confusion-matrix counts exposed as scorers so GridSearchCV records them.
true_neg = _confusion_cell(0, 0)
false_neg = _confusion_cell(1, 0)
true_pos = _confusion_cell(1, 1)
false_pos = _confusion_cell(0, 1)
# zero_division=0: score 0 (instead of warning) when nothing is predicted positive.
precision = make_scorer(precision_score, zero_division=0)

# Metric name -> scorer; the names become the suffixes of the cv_results_ columns.
SCORING = {
    "roc_auc":"roc_auc",
    "accuracy":"accuracy",
    "recall": "recall",
    "precision": precision,
    "true_pos": true_pos,
    "true_neg": true_neg,
    "false_pos": false_pos,
    "false_neg": false_neg
}

# cv_results_ columns kept for reporting. Derived from SCORING so the two can
# never drift apart; per metric the order is
# mean_test, std_test, mean_train, std_train.
SCORE_FEATURES = [
    f"{stat}_{split}_{metric}"
    for metric in SCORING
    for split in ("test", "train")
    for stat in ("mean", "std")
]

# Rows at positions [0, N_ORIG_EXAMPLES) are treated as original examples;
# anything after that is treated as synthetic (see test_model).
N_ORIG_EXAMPLES = 345

def test_model(clf, X, y, param_grid, **kwargs):
    """Grid-search ``clf`` with 3-fold stratified CV and report the best candidate.

    When ``X`` has more than ``N_ORIG_EXAMPLES`` rows, the rows past that
    position are treated as synthetic: they are only ever added to training
    folds, so each validation fold consists of original rows exclusively.

    Parameters
    ----------
    clf : estimator handed to ``GridSearchCV``.
    X : feature table; must support ``.iloc`` when it has more than
        ``N_ORIG_EXAMPLES`` rows.
    y : labels aligned with ``X``.
    param_grid : grid forwarded to ``GridSearchCV``.
    **kwargs : forwarded to ``GridSearchCV.fit`` (e.g. ``sample_weight``).

    Returns
    -------
    tuple
        ``(best_params_, row)`` where ``row`` holds the ``SCORE_FEATURES``
        columns of ``cv_results_`` for the candidate with the best ``roc_auc``.
    """
    splitter = StratifiedKFold(n_splits=3)

    if len(X) > N_ORIG_EXAMPLES:
        # Fold the original and the synthetic rows independently ...
        orig_folds = list(splitter.split(X.iloc[:N_ORIG_EXAMPLES],
                                         y[:N_ORIG_EXAMPLES]))
        synth_folds = list(splitter.split(X.iloc[N_ORIG_EXAMPLES:],
                                          y[N_ORIG_EXAMPLES:]))

        # ... then pair them up: train on original + synthetic, validate on
        # original only. Synthetic indices are relative to their slice, so
        # shift them back to positions in the full X.
        cv_splits = [
            (np.concatenate([orig_train, synth_train + N_ORIG_EXAMPLES]), orig_val)
            for (orig_train, orig_val), (synth_train, _) in zip(orig_folds,
                                                                synth_folds)
        ]
    else:
        cv_splits = list(splitter.split(X, y))

    search = GridSearchCV(clf,
                          param_grid,
                          cv=cv_splits,
                          scoring=SCORING,
                          refit="roc_auc",
                          return_train_score=True)
    search_fit = search.fit(X, y, **kwargs)

    score_table = pd.DataFrame(search_fit.cv_results_)[SCORE_FEATURES]
    best_row = score_table.iloc[search_fit.best_index_]
    return search_fit.best_params_, best_row

In [22]:
from sklearn.neighbors import KNeighborsClassifier

def nns(x, y, balanced=False):
    """Grid-search a k-nearest-neighbours classifier through ``test_model``.

    ``balanced`` is accepted for a uniform signature with the other model
    helpers; it only triggers a notice because no class balancing is applied.
    Returns whatever ``test_model`` returns.
    """
    if balanced:
        print("no available balancing technique for nearest neighbours")

    search_space = {
        'n_neighbors': np.arange(1,10),
        'weights': ['uniform','distance'],
        'metric':['euclidean','manhattan'],
    }
    return test_model(KNeighborsClassifier(n_jobs=-2), x, y, search_space)

In [23]:
from sklearn.svm import SVC

def svm(X, y, balanced=False):
    """Grid-search a support vector classifier through ``test_model``.

    Parameters
    ----------
    X, y : features and labels forwarded to ``test_model``.
    balanced : when True, ``class_weight="balanced"`` is added to the grid.

    Returns whatever ``test_model`` returns.
    """
    clf = SVC()
    # Reduced grid: C in {1e2, 1e4} and a single gamma (1e-4); the wider
    # ranges are kept in the trailing comments.
    param_grid = {'kernel': ['linear','rbf'],
                  'C': np.logspace(2,4,2), # np.logspace(2,5,6)
                  'gamma': np.logspace(-4,0.5,1)} # np.logspace(-4,0.5,10)}
    if balanced:
        # Explicit one-element list, like every other grid entry (previously a
        # tuple created by a stray trailing comma).
        param_grid["class_weight"] = ["balanced"]
    return test_model(clf, X, y, param_grid)

In [24]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

def gp(X, y, balanced=False):
    """Evaluate a Gaussian process classifier (single RBF kernel) via ``test_model``.

    ``balanced`` is accepted for a uniform signature with the other model
    helpers; it only triggers a notice because no class balancing is applied.
    Returns whatever ``test_model`` returns.
    """
    if balanced:
        print("no available balancing technique for Gaussian Process")

    estimator = GaussianProcessClassifier(random_state=0, n_jobs=-2)
    # One-candidate grid: the search is used only for its CV score bookkeeping.
    kernel_grid = {'kernel': [1.0 * RBF(1.0)]}
    return test_model(estimator, X, y, kernel_grid)

In [25]:
from sklearn.ensemble import RandomForestClassifier

def rfc(X, y, balanced=False):
    """Grid-search a random forest classifier through ``test_model``.

    Parameters
    ----------
    X, y : features and labels forwarded to ``test_model``.
    balanced : when True, ``class_weight="balanced"`` is added to the grid.

    Returns whatever ``test_model`` returns.
    """
    # random_state fixed so reruns give the same scores, matching the other
    # stochastic models in this notebook (gp, gbc, ab, lr all use 0).
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    param_grid = {'max_depth': [4, 6],
                  'min_samples_leaf': [3,5,9,17],
                  'max_features': [0.3]}
    if balanced:
        # Explicit one-element list, like every other grid entry (previously a
        # tuple created by a stray trailing comma).
        param_grid["class_weight"] = ["balanced"]
    return test_model(clf, X, y, param_grid)

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_class_weight

def gbc(X, y, balanced=False):
    """Grid-search a gradient boosting classifier through ``test_model``.

    Parameters
    ----------
    X, y : features and labels; ``y`` must offer ``.apply`` and ``.shape``
        (e.g. a pandas Series) with truthy values marking the positive class.
    balanced : when True, each sample is weighted by the "balanced" weight of
        its class; otherwise every sample gets weight 1.

    Returns whatever ``test_model`` returns.
    """
    clf = GradientBoostingClassifier(n_estimators=100,random_state=0)
    param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
                  'max_depth': [3,4,6],
                  'min_samples_leaf': [3,5,9,17],
                  'max_features': list(np.linspace(0.2,0.4,4))}
    if balanced:
        # `classes` must be an ndarray (newer scikit-learn rejects a list);
        # the weights come back in the order given: positive first, then negative.
        pos_weight, neg_weight = compute_class_weight(class_weight="balanced",
                                                      classes=np.array([1, 0]),
                                                      y=y)
        y_weights = y.apply(lambda label: pos_weight if label else neg_weight)
    else:
        y_weights = np.ones(y.shape)

    # GridSearchCV slices sample_weight per fold alongside X and y.
    return test_model(clf, X, y, param_grid, sample_weight=y_weights)

In [27]:
from sklearn.ensemble import AdaBoostClassifier

def ab(X, y, balanced=False):
    """Grid-search an AdaBoost classifier through ``test_model``.

    Parameters
    ----------
    X, y : features and labels; ``y`` must offer ``.apply`` and ``.shape``
        (e.g. a pandas Series) with truthy values marking the positive class.
    balanced : when True, each sample is weighted by the "balanced" weight of
        its class; otherwise every sample gets weight 1.

    Returns whatever ``test_model`` returns.
    """
    clf = AdaBoostClassifier(random_state=0)
    param_grid = {'n_estimators': [100,200],
                  'learning_rate': [0.001,0.01,0.1,0.2,0.5]}
    if balanced:
        # `classes` must be an ndarray (newer scikit-learn rejects a list);
        # the weights come back in the order given: positive first, then negative.
        pos_weight, neg_weight = compute_class_weight(class_weight="balanced",
                                                      classes=np.array([1, 0]),
                                                      y=y)
        y_weights = y.apply(lambda label: pos_weight if label else neg_weight)
    else:
        y_weights = np.ones(y.shape)

    # GridSearchCV slices sample_weight per fold alongside X and y.
    return test_model(clf, X, y, param_grid, sample_weight=y_weights)

In [28]:
from sklearn.naive_bayes import GaussianNB

def nb(X, y, balanced=False):
    """Grid-search a Gaussian naive Bayes classifier through ``test_model``.

    Parameters
    ----------
    X, y : features and labels; ``y`` must offer ``.apply`` and ``.shape``
        (e.g. a pandas Series) with truthy values marking the positive class.
    balanced : when True, each sample is weighted by the "balanced" weight of
        its class; otherwise every sample gets weight 1.

    Returns whatever ``test_model`` returns.
    """
    clf = GaussianNB()
    # Nine smoothing values, one per decade from 1e-11 to 1e-3.
    param_grid = {'var_smoothing':  np.logspace(-11,-3,9,base=10)}
    if balanced:
        # `classes` must be an ndarray (newer scikit-learn rejects a list);
        # the weights come back in the order given: positive first, then negative.
        pos_weight, neg_weight = compute_class_weight(class_weight="balanced",
                                                      classes=np.array([1, 0]),
                                                      y=y)
        y_weights = y.apply(lambda label: pos_weight if label else neg_weight)
    else:
        y_weights = np.ones(y.shape)
    # GridSearchCV slices sample_weight per fold alongside X and y.
    return test_model(clf, X, y, param_grid, sample_weight=y_weights)

In [29]:
from sklearn.linear_model import LogisticRegression

def lr(X, y, balanced=False):
    """Grid-search an L2-penalised logistic regression through ``test_model``.

    Parameters
    ----------
    X, y : features and labels forwarded to ``test_model``.
    balanced : when True, ``class_weight="balanced"`` is added to the grid.

    Returns whatever ``test_model`` returns.
    """
    clf = LogisticRegression(random_state=0, max_iter=10000)
    # 20 values of C spaced logarithmically between 1e-4 and 1e4.
    param_grid = {'penalty' : ['l2'],
                  'solver': ["liblinear"],
                  'C' : np.logspace(-4, 4, 20)}
    if balanced:
        # Explicit one-element list, like every other grid entry (previously a
        # tuple created by a stray trailing comma).
        param_grid["class_weight"] = ["balanced"]
    return test_model(clf, X, y, param_grid)

In [30]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def qda(X, y, balanced=False):
    """Evaluate quadratic discriminant analysis (``reg_param=0.0``) via ``test_model``.

    ``balanced`` is accepted for a uniform signature with the other model
    helpers; it only triggers a notice because no class balancing is applied.
    Returns whatever ``test_model`` returns.
    """
    if balanced:
        print("no available balancing technique for QDA")

    # One-candidate grid: the search is used only for its CV score bookkeeping.
    reg_grid = {'reg_param':  [0.0]}
    return test_model(QuadraticDiscriminantAnalysis(), X, y, reg_grid)

In [31]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def lda(X, y, balanced=False):
    """Grid-search linear discriminant analysis through ``test_model``.

    ``balanced`` is accepted for a uniform signature with the other model
    helpers; it only triggers a notice because no class balancing is applied.
    Returns whatever ``test_model`` returns.
    """
    clf = LinearDiscriminantAnalysis()
    # Shrinkage is only supported by the 'lsqr' and 'eigen' solvers; pairing it
    # with 'svd' makes the fit raise, so 'svd' gets its own shrinkage-free grid
    # instead of producing failed candidates in the search.
    param_grid = [
        {'solver': ["svd"],
         "shrinkage": [None]},
        {'solver': ["lsqr", "eigen"],
         "shrinkage": [None, "auto", 0.1, 0.3, 0.8, 1]},
    ]
    if balanced:
        print("no available balancing technique for LDA")
    return test_model(clf, X, y, param_grid)

In [32]:
# Experiment configurations. Each dict describes one run of the model sweep:
#   name        - unique key under which the run's results are stored
#   balanced    - passed to every model helper as its `balanced` flag
#   cat_encoder - categorical encoding requested from the data loader
#   resampled   - whether the data loader should return resampled data
ohe_test = {
    "name": "ohe",
    "balanced": False,
    "cat_encoder": "one_hot",
    "resampled": False,
}

target_test = {
    "name": "target",
    "balanced": False,
    "cat_encoder": "target",
    "resampled": False,
}

ohe_balanced_test = {
    "name": "ohe_balanced",
    "balanced": True,
    "cat_encoder": "one_hot",
    "resampled": False,
}

# The name was "target_resampled", which collided with target_resampled_test
# and caused this run's results to be overwritten in the results dict.
target_balanced_test = {
    "name": "target_balanced",
    "balanced": True,
    "cat_encoder": "target",
    "resampled": False,
}

ohe_resampled_test = {
    "name": "ohe_resampled",
    "balanced": False,
    "cat_encoder": "one_hot",
    "resampled": True,
}

target_resampled_test = {
    "name": "target_resampled",
    "balanced": False,
    "cat_encoder": "target",
    "resampled": True,
}

tests = [ohe_test, target_test, ohe_balanced_test, target_balanced_test,
         ohe_resampled_test, target_resampled_test]

# Per-experiment summary tables, keyed by the experiment's "name".
test_results = {}

# Display label -> helper that grid-searches that model family.
# NOTE(review): `lda` is not registered here although it appears in the
# scaled-input list below - confirm whether that omission is intentional.
techniques_dict = {
    'K Nearest Neighbours': nns,
    'Support Vector Machines': svm,
    'Gaussian Process': gp,
    'Random Forest Classifier': rfc,
    'Gradient Boosting Classifier': gbc,
    'Ada Boost classifier': ab,
    'Gaussian Naieve Bayes': nb,
    'Logistic Regression': lr,
    'Quadratic Discriminant Analysis': qda,
}

for test in tests:
    banner = 50* '='
    print(banner)
    print(f"{test['name']} Test")
    print(banner)

    data_loc = "../data/ace_data_orig.csv"
    loader_args = dict(loc=data_loc,
                       cat_encoder=test["cat_encoder"],
                       resampled=test["resampled"])
    # NOTE(review): both loads use identical arguments, so nothing visible here
    # makes `X_train_scaled` differ from `X_train`; also `y_train` comes from
    # the second call only. Confirm against return_train_test.
    X_train, _, _, _ = return_train_test(**loader_args)
    X_train_scaled, y_train, _, _ = return_train_test(**loader_args)

    results_list = []
    for model_type, cv_model_func in techniques_dict.items():
        print(f"fitting {model_type}......")
        # These models are fed the "scaled" features; the rest use X_train
        # (don't normalise x).
        wants_scaled = cv_model_func in [nns, svm, gp, lr, lda, qda]
        features = X_train_scaled if wants_scaled else X_train
        _, cv_results = cv_model_func(features, y_train,
                                      balanced=test["balanced"])
        results_list.append(cv_results)
        print("done.")

    # One row per model, labelled with the model's display name.
    test_results[test["name"]] = pd.DataFrame(results_list,
                                              index=techniques_dict.keys())
    


ohe Test
fitting K Nearest Neighbours......
done.
fitting Support Vector Machines......
done.
fitting Gaussian Process......
done.
fitting Random Forest Classifier......
done.
fitting Gradient Boosting Classifier......
done.
fitting Ada Boost classifier......
done.
fitting Gaussian Naieve Bayes......
done.
fitting Logistic Regression......
done.
fitting Quadratic Discriminant Analysis......
done.
target Test
fitting K Nearest Neighbours......
done.
fitting Support Vector Machines......
done.
fitting Gaussian Process......
done.
fitting Random Forest Classifier......
done.
fitting Gradient Boosting Classifier......
done.
fitting Ada Boost classifier......
done.
fitting Gaussian Naieve Bayes......
done.
fitting Logistic Regression......
done.
fitting Quadratic Discriminant Analysis......
done.
ohe_balanced Test
fitting K Nearest Neighbours......
no available balancing technique for nearest neighbours
done.
fitting Support Vector Machines......
done.
fitting Gaussian Process......
no avai