In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

In [2]:
from sklearn.neighbors import KNeighborsClassifier, DistanceMetric

def nns(X_train, y_train):
    neigh = KNeighborsClassifier()
    param_grid = {'n_neighbors': np.arange(1,10),
                    'weights': ['uniform','distance'],
                    'metric':['euclidean','manhattan']}
    regr_nns = GridSearchCV(neigh, param_grid).fit(X_train, y_train)
    return regr_nns

In [15]:
from sklearn.svm import SVC

def svm(X_train ,y_train):
    svc = SVC()
    param_grid = {'kernel': ['linear','rbf'],
                  'C': np.logspace(2,4,2), # np.logspace(2,5,6)
                  'gamma': np.logspace(-4,0.5,1)} # np.logspace(-4,0.5,10)}
    regr_svm = GridSearchCV(svc, param_grid).fit(X_train, y_train)
    return regr_svm

In [4]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

def gp(X_train, y_train):
    model = GaussianProcessClassifier(random_state=0)
    param_grid = {'kernel': [1.0 * RBF(1.0)]}
    regr_gp = GridSearchCV(model, param_grid).fit(X_train, y_train)
    return regr_gp

In [5]:
from sklearn.ensemble import RandomForestClassifier

def rfc(X_train, y_train):
    est = RandomForestClassifier(n_estimators=100)
    param_grid = {'max_depth': [4, 6],
                  'min_samples_leaf': [3,5,9,17],
                  'max_features': [0.3]}
    regr_rfc = GridSearchCV(est, param_grid).fit(X_train, y_train)
    return regr_rfc

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

def gbc(X_train, y_train):
    est = GradientBoostingClassifier(n_estimators=100,random_state=0)
    param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
                    'max_depth': [3,4,6],
                    'min_samples_leaf': [3,5,9,17],
                    'max_features': [x for x in np.linspace(0.2,0.4,4)]}
    regr_gbc = GridSearchCV(est, param_grid).fit(X_train, y_train)
    return regr_gbc

In [7]:
from sklearn.ensemble import AdaBoostClassifier

def ab(X_train, y_train):
    clf = AdaBoostClassifier(random_state=0)
    param_grid = {'n_estimators': [100,200],
                  'learning_rate': [0.001,0.01,0.1,0.2,0.5]}
    regr_ab = GridSearchCV(clf, param_grid).fit(X_train,y_train)
    return regr_ab

In [8]:
from sklearn.naive_bayes import GaussianNB

def nb(X_train, y_train):
    clf = GaussianNB()
    param_grid = {'var_smoothing':  np.logspace(-11,-3,9,base=10)}
    regr_nb = GridSearchCV(clf, param_grid).fit(X_train, y_train)
    return regr_nb

In [19]:
from sklearn.linear_model import LogisticRegression

def lr(X_train, y_train):
    clf = LogisticRegression(random_state=0)
    param_grid = {'penalty' : ['l1', 'l2'],
                  'C' : np.logspace(-4, 4, 20)}
    regr_lr = GridSearchCV(clf, param_grid).fit(X_train, y_train)
    return regr_lr

In [10]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

def qda(X_train, y_train):
    clf = QuadraticDiscriminantAnalysis()
    param_grid = {'reg_param':  [0.0]}
    regr_qda = GridSearchCV(clf, param_grid).fit(X_train, y_train)
    return regr_qda

In [11]:
def eval_model(regr, X_train, X_test, y_train, y_test):
    y_train_pred = (regr.predict(X_train)).reshape(-1, 1)
    y_test_pred = (regr.predict(X_test)).reshape(-1, 1)
    accuracy_train = accuracy_score(y_train_pred, y_train)
    accuracy_test = accuracy_score(y_test_pred, y_test)
    recall_train = recall_score(y_train, y_train_pred)
    recall_test = recall_score(y_test, y_test_pred)
    precision_train = precision_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred)
    print('accuracy train:  %.3f, accuracy test: %.3f' %  (accuracy_train, accuracy_test))
    return (accuracy_train, accuracy_test, recall_train,
            recall_test, precision_train, precision_test)



In [12]:
data_dir = "../data/train_test_data/"

X_train = pd.read_pickle(data_dir + "X_train_res_ohe.pkl")
X_train_scaled = pd.read_pickle(data_dir + "X_train_res_ohe_scaled.pkl")
y_train = pd.read_pickle(data_dir + "y_train_res.pkl")

X_test = pd.read_pickle(data_dir + "X_test_ohe.pkl")
X_test_scaled = pd.read_pickle(data_dir + "X_test_ohe_scaled.pkl")
y_test = pd.read_pickle(data_dir + "y_test.pkl")

In [20]:
techniques_dict = {'nns': nns, 'svm': svm, 'gp': gp,
                   'rfc': rfc, 'gbc': gbc, 'ab': ab,
                   'nb': nb, 'lr': lr, 'qda': qda}


accuracy_train_array = np.zeros(len(techniques_dict))
accuracy_test_array = np.zeros(len(techniques_dict))

recall_train_array = np.zeros(len(techniques_dict))
recall_test_array = np.zeros(len(techniques_dict))

precision_train_array  = np.zeros(len(techniques_dict))
precision_test_array = np.zeros(len(techniques_dict))

count = -1
for t in techniques_dict.keys():
    count += 1
    f = techniques_dict[t]
    if t in ['nns','svm','gp','lr','qda']:
        regr = f(X_train_scaled, y_train)
        accuracy_train, accuracy_test, recall_train, recall_test, precision_train,precision_test = eval_model(regr,
                                                                                                              X_train_scaled, X_test_scaled, y_train, y_test)
    else: # don't normalise x
        regr = f(X_train, y_train)
        accuracy_train, accuracy_test, recall_train, recall_test, lprecision_train, precision_test = eval_model(regr,
                                                                                                            X_train,X_test,y_train,y_test)

    accuracy_train_array[count] = accuracy_train
    accuracy_test_array[count] = accuracy_test
    recall_train_array[count], precision_train_array[count] = recall_train, precision_train
    recall_test_array[count],precision_test_array[count] = recall_test, precision_test

m = pd.DataFrame([accuracy_train_array,accuracy_test_array,recall_train_array,recall_test_array,precision_train_array,precision_test_array])
m.columns = np.array(techniques)
m.index = ['train','test','recall_train','recall_test','precision_train','precision_test']
print(m)
m.to_excel('m26.xlsx')


accuracy train:  0.943, accuracy test: 0.745
accuracy train:  0.859, accuracy test: 0.681
accuracy train:  0.990, accuracy test: 0.759
accuracy train:  0.906, accuracy test: 0.709
accuracy train:  1.000, accuracy test: 0.766
accuracy train:  0.922, accuracy test: 0.723
accuracy train:  0.761, accuracy test: 0.546
accuracy train:  0.848, accuracy test: 0.688
accuracy train:  0.739, accuracy test: 0.468


Traceback (most recent call last):
  File "/home/samface/anaconda3/envs/ace_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/samface/anaconda3/envs/ace_env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/samface/anaconda3/envs/ace_env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/samface/anaconda3/envs/ace_env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/samface/anaconda3/envs/ace_env/lib/python3.8/sit

AttributeError: module 'pandas' has no attribute 'dataframe'

In [28]:

m = pd.DataFrame([accuracy_train_array, accuracy_test_array,
                  recall_train_array, recall_test_array,
                  precision_train_array, precision_test_array],
                 columns=techniques_dict.keys())
m.index = ['train','test','recall_train','recall_test','precision_train','precision_test']
print(m)
m.to_excel('m26.xlsx')

                      nns       svm        gp       rfc       gbc        ab  \
train            0.942509  0.858885  0.989547  0.905923  1.000000  0.921603   
test             0.744681  0.680851  0.758865  0.709220  0.765957  0.723404   
recall_train     0.885017  0.878049  0.989547  0.944251  1.000000  0.923345   
recall_test      0.260870  0.260870  0.304348  0.347826  0.130435  0.260870   
precision_train  0.240000  0.176471  0.280000  0.280000  0.280000  0.280000   
precision_test   0.240000  0.176471  0.280000  0.235294  0.187500  0.214286   

                       nb        lr       qda  
train            0.761324  0.848432  0.738676  
test             0.546099  0.687943  0.468085  
recall_train     0.888502  0.867596  0.954704  
recall_test      0.478261  0.260870  0.652174  
precision_train  0.280000  0.181818  0.182927  
precision_test   0.174603  0.181818  0.182927  


In [29]:
m

Unnamed: 0,nns,svm,gp,rfc,gbc,ab,nb,lr,qda
train,0.942509,0.858885,0.989547,0.905923,1.0,0.921603,0.761324,0.848432,0.738676
test,0.744681,0.680851,0.758865,0.70922,0.765957,0.723404,0.546099,0.687943,0.468085
recall_train,0.885017,0.878049,0.989547,0.944251,1.0,0.923345,0.888502,0.867596,0.954704
recall_test,0.26087,0.26087,0.304348,0.347826,0.130435,0.26087,0.478261,0.26087,0.652174
precision_train,0.24,0.176471,0.28,0.28,0.28,0.28,0.28,0.181818,0.182927
precision_test,0.24,0.176471,0.28,0.235294,0.1875,0.214286,0.174603,0.181818,0.182927


In [26]:
techniques_dict.keys()

dict_keys(['nns', 'svm', 'gp', 'rfc', 'gbc', 'ab', 'nb', 'lr', 'qda'])