In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [2]:
from sklearn.neighbors import KNeighborsClassifier

def nns(X_train, y_train):
    neigh = KNeighborsClassifier()
    param_grid = {'n_neighbors': np.arange(1,10),
                    'weights': ['uniform','distance'],
                    'metric':['euclidean','manhattan']}
    regr_nns = GridSearchCV(neigh, param_grid).fit(X_train, y_train)
    return regr_nns

In [3]:
from sklearn.svm import SVC

def svm(X_train ,y_train):
    svc = SVC()
    param_grid = {'kernel': ['linear','rbf'],
                  'C': np.logspace(2,4,2), # np.logspace(2,5,6)
                  'gamma': np.logspace(-4,0.5,1)} # np.logspace(-4,0.5,10)}
    regr_svm = GridSearchCV(svc, param_grid).fit(X_train, y_train)
    return regr_svm

In [4]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

def gp(X_train, y_train):
    model = GaussianProcessClassifier(random_state=0)
    param_grid = {'kernel': [1.0 * RBF(1.0)]}
    regr_gp = GridSearchCV(model, param_grid).fit(X_train, y_train)
    return regr_gp

In [5]:
from sklearn.ensemble import RandomForestClassifier

def rfc(X_train, y_train):
    est = RandomForestClassifier(n_estimators=100)
    param_grid = {'max_depth': [4, 6],
                  'min_samples_leaf': [3,5,9,17],
                  'max_features': [0.3]}
    regr_rfc = GridSearchCV(est, param_grid).fit(X_train, y_train)
    return regr_rfc

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

def gbc(X_train, y_train):
    est = GradientBoostingClassifier(n_estimators=100,random_state=0)
    param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
                    'max_depth': [3,4,6],
                    'min_samples_leaf': [3,5,9,17],
                    'max_features': [x for x in np.linspace(0.2,0.4,4)]}
    regr_gbc = GridSearchCV(est, param_grid).fit(X_train, y_train)
    return regr_gbc

In [7]:
from sklearn.ensemble import AdaBoostClassifier

def ab(X_train, y_train):
    clf = AdaBoostClassifier(random_state=0)
    param_grid = {'n_estimators': [100,200],
                  'learning_rate': [0.001,0.01,0.1,0.2,0.5]}
    regr_ab = GridSearchCV(clf, param_grid).fit(X_train,y_train)
    return regr_ab

In [8]:
from sklearn.naive_bayes import GaussianNB

def nb(X_train, y_train):
    clf = GaussianNB()
    param_grid = {'var_smoothing':  np.logspace(-11,-3,9,base=10)}
    regr_nb = GridSearchCV(clf, param_grid).fit(X_train, y_train)
    return regr_nb

In [16]:
from sklearn.linear_model import LogisticRegression

def lr(X_train, y_train):
    clf = LogisticRegression(random_state=0,
                             max_iter=10000)
    param_grid = {'penalty' : ["none", 'l2'],
                  'C' : np.logspace(-4, 4, 20)}
    regr_lr = GridSearchCV(clf, param_grid).fit(X_train, y_train)
    return regr_lr

In [10]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def qda(X_train, y_train):
    clf = QuadraticDiscriminantAnalysis()
    param_grid = {'reg_param':  [0.0]}
    regr_qda = GridSearchCV(clf, param_grid).fit(X_train, y_train)
    return regr_qda

In [18]:
data_dir = "../data/train_test_data/"

X_train = pd.read_pickle(data_dir + "X_train_res_ohe.pkl")
X_train_target = pd.read_pickle(data_dir + "X_train_res_target.pkl")
X_train_scaled = pd.read_pickle(data_dir + "X_train_res_ohe_scaled.pkl")
X_train_target_scaled = pd.read_pickle(data_dir + "X_train_res_target_scaled.pkl")
y_train = pd.read_pickle(data_dir + "y_train_res.pkl")

X_test = pd.read_pickle(data_dir + "X_test_ohe.pkl")
X_test_target = pd.read_pickle(data_dir + "X_test_target.pkl")
X_test_scaled = pd.read_pickle(data_dir + "X_test_ohe_scaled.pkl")
X_test_target_scaled = pd.read_pickle(data_dir + "X_test_target_scaled.pkl")
y_test = pd.read_pickle(data_dir + "y_test.pkl")

In [19]:
from sklearn.metrics import confusion_matrix

techniques_dict = {'K Nearest Neighbours': nns, 'Support Vector Machines': svm, 
                   'Gaussian Process': gp, 'Random Forest Classifier': rfc, 
                   'Gradient Boosting Classifier': gbc,  'Ada Boost classifier': ab,
                   'Gaussian Naieve Bayes': nb, 'Logistic Regression': lr, 
                   'Quadratic Discriminant Analysis': qda}



def eval_model(model, X_train, y_train, X_test, y_test):

    train_pred = (model.predict(X_train)).reshape(-1, 1)
    test_pred = (model.predict(X_test)).reshape(-1, 1)

    train_accuracy = accuracy_score(train_pred, y_train)
    test_accuracy = accuracy_score(test_pred, y_test)

    train_recall = recall_score(y_train, train_pred)
    test_recall = recall_score(y_test, test_pred)

    train_precision = precision_score(y_train, train_pred)
    test_precision = precision_score(y_test, test_pred)

    test_conf_matrix = confusion_matrix(y_test, test_pred)

    results = {
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Train Precision": train_precision,
        "Train Recall": train_recall,
        "Test Precision": test_precision,
        "Test Recall": test_recall,
        "Test True -": test_conf_matrix[0][0],
        "Test False -": test_conf_matrix[1][0],
        "Test True +": test_conf_matrix[1][1],
        "Test False +": test_conf_matrix[0][1],
    }

    return results

model_results_list = []
for model_type in techniques_dict.keys():
    print(f"fitting {model_type}......")
    model_f = techniques_dict[model_type]
    if model_type in ['nns','svm','gp','lr','qda']:
        model = model_f(X_train_scaled, y_train)
        model_results = eval_model(model,
                                   X_train_scaled, y_train,
                                   X_test_scaled, y_test)
        model_results_list.append(model_results)
    else: # don't normalise x
        model = model_f(X_train, y_train)
        model_results = eval_model(model,
                                   X_train, y_train,
                                   X_test, y_test)
        model_results_list.append(model_results)
    print("done.")

model_results_df = pd.dataframe(model_results_list,
                             index=techniques_dict.keys())
model_results_df.to_csv("../data/model_results.csv")


fitting K Nearest Neighbours......
Done.
fitting Support Vector Machines......
Done.
fitting Gaussian Process......
Done.
fitting Random Forest Classifier......
Done.
fitting Gradient Boosting Classifier......
Done.
fitting Ada Boost classifier......
Done.
fitting Gaussian Naieve Bayes......
Done.
fitting Logistic Regression......
Done.
fitting Quadratic Discriminant Analysis......
Done.




In [20]:
model_results_df

Unnamed: 0,Train Accuracy,Test Accuracy,Train Precision,Train Recall,Test Precision,Test Recall,Test True -,Test False -,Test True +,Test False +
K Nearest Neighbours,0.977273,0.725352,1.0,0.954545,0.136364,0.130435,100,20,3,19
Support Vector Machines,0.905594,0.676056,0.884106,0.933566,0.151515,0.217391,91,18,5,28
Gaussian Process,0.987762,0.71831,0.986063,0.98951,0.095238,0.086957,100,21,2,19
Random Forest Classifier,0.879371,0.732394,0.85342,0.916084,0.2,0.217391,99,18,5,20
Gradient Boosting Classifier,1.0,0.788732,1.0,1.0,0.111111,0.043478,111,22,1,8
Ada Boost classifier,0.909091,0.739437,0.911972,0.905594,0.181818,0.173913,101,19,4,18
Gaussian Naieve Bayes,0.758741,0.56338,0.712644,0.867133,0.157895,0.391304,71,14,9,48
Logistic Regression,0.826923,0.669014,0.800643,0.870629,0.184211,0.304348,88,16,7,31
Quadratic Discriminant Analysis,0.578671,0.323944,0.543186,0.98951,0.188034,0.956522,24,1,22,95


In [21]:
target_model_results_list = []
for model_type in techniques_dict.keys():
    print(f"fitting {model_type}......")
    model_f = techniques_dict[model_type]
    if model_type in ['nns','svm','gp','lr','qda']:
        model = model_f(X_train_target_scaled, y_train)
        model_results = eval_model(model,
                                   X_train_target_scaled, y_train,
                                   X_test_target_scaled, y_test)
        target_model_results_list.append(model_results)
    else: # don't normalise x
        model = model_f(X_train_target, y_train)
        model_results = eval_model(model,
                                   X_train_target, y_train,
                                   X_test_target, y_test)
        target_model_results_list.append(model_results)
    print("done.")

target_model_results_df = pd.DataFrame(model_results_list,
                             index=techniques_dict.keys())
target_model_results_df.to_csv("../data/target_model_results.csv")


fitting K Nearest Neighbours......
done.
fitting Support Vector Machines......
done.
fitting Gaussian Process......
done.
fitting Random Forest Classifier......
done.
fitting Gradient Boosting Classifier......
done.
fitting Ada Boost classifier......
done.
fitting Gaussian Naieve Bayes......
done.
fitting Logistic Regression......
done.
fitting Quadratic Discriminant Analysis......
done.




AttributeError: module 'pandas' has no attribute 'dataframe'

In [24]:
target_model_results_df = pd.DataFrame(target_model_results_list,
                             index=techniques_dict.keys())
target_model_results_df.to_csv("../data/target_model_results.csv")

target_model_results_df

Unnamed: 0,Train Accuracy,Test Accuracy,Train Precision,Train Recall,Test Precision,Test Recall,Test True -,Test False -,Test True +,Test False +
K Nearest Neighbours,1.0,0.633803,1.0,1.0,0.108108,0.173913,86,19,4,33
Support Vector Machines,0.76049,0.619718,0.722388,0.846154,0.207547,0.478261,77,12,11,42
Gaussian Process,0.93007,0.711268,0.91,0.954545,0.25,0.391304,92,14,9,27
Random Forest Classifier,0.867133,0.753521,0.85473,0.884615,0.269231,0.304348,100,16,7,19
Gradient Boosting Classifier,0.996503,0.802817,0.993056,1.0,0.272727,0.130435,111,20,3,8
Ada Boost classifier,0.907343,0.732394,0.911661,0.902098,0.142857,0.130435,101,20,3,18
Gaussian Naieve Bayes,0.704545,0.471831,0.642336,0.923077,0.138889,0.434783,57,13,10,62
Logistic Regression,0.765734,0.56338,0.740506,0.818182,0.102041,0.217391,75,18,5,44
Quadratic Discriminant Analysis,0.657343,0.380282,0.595339,0.982517,0.171717,0.73913,37,6,17,82


In [23]:
model_results_df


Unnamed: 0,Train Accuracy,Test Accuracy,Train Precision,Train Recall,Test Precision,Test Recall,Test True -,Test False -,Test True +,Test False +
K Nearest Neighbours,0.977273,0.725352,1.0,0.954545,0.136364,0.130435,100,20,3,19
Support Vector Machines,0.905594,0.676056,0.884106,0.933566,0.151515,0.217391,91,18,5,28
Gaussian Process,0.987762,0.71831,0.986063,0.98951,0.095238,0.086957,100,21,2,19
Random Forest Classifier,0.879371,0.732394,0.85342,0.916084,0.2,0.217391,99,18,5,20
Gradient Boosting Classifier,1.0,0.788732,1.0,1.0,0.111111,0.043478,111,22,1,8
Ada Boost classifier,0.909091,0.739437,0.911972,0.905594,0.181818,0.173913,101,19,4,18
Gaussian Naieve Bayes,0.758741,0.56338,0.712644,0.867133,0.157895,0.391304,71,14,9,48
Logistic Regression,0.826923,0.669014,0.800643,0.870629,0.184211,0.304348,88,16,7,31
Quadratic Discriminant Analysis,0.578671,0.323944,0.543186,0.98951,0.188034,0.956522,24,1,22,95
