In [63]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
import graphviz, pydotplus
from sklearn.tree import export_graphviz
from prettytable import PrettyTable

In [32]:
def extractData(csvFile):
    df = pd.read_csv(csvFile)
    y = df['label']
    X = df.drop(columns = ['accountName', 'label'])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42, stratify = y)
    scaler = StandardScaler()
    scaler.fit_transform(X_train)
    scaler.transform(X_test)
    
    return scaler, X_train, X_test, y_train, y_test

In [33]:
scaler, X_train, X_test, y_train, y_test = extractData('All_Accounts.csv')

In [34]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((494, 10), (88, 10), (494,), (88,))

In [35]:
clf_knn = KNeighborsClassifier()
k_list = [1, 5, 9, 13, 17, 21, 25, 31, 41, 51]
parameters = {'n_neighbors': k_list}
gs_cv_knn = GridSearchCV(clf_knn, parameters, cv = 3, scoring = 'f1', verbose = 3)
gs_cv_knn.fit(X_train, y_train)
clf_knn = gs_cv_knn.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_neighbors=1 ...................................................
[CV] ....................... n_neighbors=1, score=0.822, total=   0.0s
[CV] n_neighbors=1 ...................................................
[CV] ....................... n_neighbors=1, score=0.798, total=   0.0s
[CV] n_neighbors=1 ...................................................
[CV] ....................... n_neighbors=1, score=0.818, total=   0.0s
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.830, total=   0.0s
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.838, total=   0.0s
[CV] n_neighbors=5 ...................................................
[CV] ....................... n_neighbors=5, score=0.840, total=   0.0s
[CV] n_neighbors=9 ...................................................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ...................... n_neighbors=21, score=0.827, total=   0.0s
[CV] n_neighbors=25 ..................................................
[CV] ...................... n_neighbors=25, score=0.813, total=   0.0s
[CV] n_neighbors=25 ..................................................
[CV] ...................... n_neighbors=25, score=0.836, total=   0.0s
[CV] n_neighbors=25 ..................................................
[CV] ...................... n_neighbors=25, score=0.822, total=   0.0s
[CV] n_neighbors=31 ..................................................
[CV] ...................... n_neighbors=31, score=0.796, total=   0.0s
[CV] n_neighbors=31 ..................................................
[CV] ...................... n_neighbors=31, score=0.827, total=   0.0s
[CV] n_neighbors=31 ..................................................
[CV] ...................... n_neighbors=31, score=0.818, total=   0.0s
[CV] n_neighbors=41 ..................................................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.4s finished


In [48]:
clf_knn.fit(X_train, y_train)
y_pred_knn = clf_knn.predict(X_test)
print(f1_score(y_test, y_pred_knn), clf_knn)

0.8409090909090909 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')


In [50]:
clf_nb = MultinomialNB()
alpha_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
parameters = {'alpha' : alpha_list}
gs_cv_nb = GridSearchCV(clf_nb, parameters, cv = 3, scoring='f1', verbose=3)
gs_cv_nb.fit(X_train, y_train)
clf_nb = gs_cv_nb.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] alpha=1e-05 .....................................................
[CV] ......................... alpha=1e-05, score=0.670, total=   0.0s
[CV] alpha=1e-05 .....................................................
[CV] ......................... alpha=1e-05, score=0.722, total=   0.0s
[CV] alpha=1e-05 .....................................................
[CV] ......................... alpha=1e-05, score=0.649, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........................ alpha=0.0001, score=0.670, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........................ alpha=0.0001, score=0.722, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........................ alpha=0.0001, score=0.649, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.2s finished


In [51]:
clf_nb.fit(X_train, y_train)
y_pred_nb = clf_nb.predict(X_test)
print(f1_score(y_test, y_pred_nb), clf_nb)

0.7079646017699116 MultinomialNB(alpha=1e-05, class_prior=None, fit_prior=True)


In [52]:
clf_lr = LogisticRegression(penalty= 'l2')
c_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
parameters = {'C' : c_list}
gs_cv_lr = GridSearchCV(clf_lr, parameters, cv = 3, scoring='f1', verbose=3)
gs_cv_lr.fit(X_train, y_train)
clf_lr = gs_cv_lr.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] C=1e-05 .........................................................
[CV] ............................. C=1e-05, score=0.781, total=   0.0s
[CV] C=1e-05 .........................................................
[CV] ............................. C=1e-05, score=0.828, total=   0.0s
[CV] C=1e-05 .........................................................
[CV] ............................. C=1e-05, score=0.816, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ............................ C=0.0001, score=0.802, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ............................ C=0.0001, score=0.839, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ............................ C=0.0001, score=0.800, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.2s finished


In [53]:
clf_lr.fit(X_train, y_train)
y_pred_lr = clf_lr.predict(X_test)
print(f1_score(y_test, y_pred_lr), clf_lr)

0.8351648351648352 LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)




In [54]:
clf_svm = SGDClassifier(loss= 'hinge', penalty='l2')
alpha_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
parameters = {'alpha' : alpha_list}
gs_cv_svm = GridSearchCV(clf_svm, parameters, cv = 3, scoring='f1', verbose=3)
gs_cv_svm.fit(X_train, y_train)
clf_svm = gs_cv_svm.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] alpha=1e-05 .....................................................
[CV] ......................... alpha=1e-05, score=0.411, total=   0.0s
[CV] alpha=1e-05 .....................................................
[CV] ......................... alpha=1e-05, score=0.699, total=   0.0s
[CV] alpha=1e-05 .....................................................
[CV] ......................... alpha=1e-05, score=0.500, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........................ alpha=0.0001, score=0.813, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........................ alpha=0.0001, score=0.828, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........................ alpha=0.0001, score=0.508, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ...........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    0.2s finished


In [55]:
clf_svm.fit(X_train, y_train)
y_pred_svm = clf_svm.predict(X_test)
print(f1_score(y_test, y_pred_svm), clf_svm)

0.6233766233766234 SGDClassifier(alpha=10000, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


In [56]:
clf_dt = DecisionTreeClassifier()
max_depth_list = [1, 5, 10, 50, 100, 500, 1000]
min_samples_split_list = [5, 10, 100, 500]
parameters = {'max_depth': max_depth_list, 'min_samples_split': min_samples_split_list}
gs_cv_dt = GridSearchCV(clf_dt, parameters, cv = 3, scoring='f1', verbose=3)
gs_cv_dt.fit(X_train, y_train)
clf_dt = gs_cv_dt.best_estimator_

Fitting 3 folds for each of 28 candidates, totalling 84 fits
[CV] max_depth=1, min_samples_split=5 ................................
[CV] .... max_depth=1, min_samples_split=5, score=0.841, total=   0.0s
[CV] max_depth=1, min_samples_split=5 ................................
[CV] .... max_depth=1, min_samples_split=5, score=0.847, total=   0.0s
[CV] max_depth=1, min_samples_split=5 ................................
[CV] .... max_depth=1, min_samples_split=5, score=0.865, total=   0.0s
[CV] max_depth=1, min_samples_split=10 ...............................
[CV] ... max_depth=1, min_samples_split=10, score=0.841, total=   0.0s
[CV] max_depth=1, min_samples_split=10 ...............................
[CV] ... max_depth=1, min_samples_split=10, score=0.847, total=   0.0s
[CV] max_depth=1, min_samples_split=10 ...............................
[CV] ... max_depth=1, min_samples_split=10, score=0.865, total=   0.0s
[CV] max_depth=1, min_samples_split=100 ..............................
[CV] .. max_dept

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV] .. max_depth=50, min_samples_split=10, score=0.865, total=   0.0s
[CV] max_depth=50, min_samples_split=100 .............................
[CV] . max_depth=50, min_samples_split=100, score=0.877, total=   0.0s
[CV] max_depth=50, min_samples_split=100 .............................
[CV] . max_depth=50, min_samples_split=100, score=0.885, total=   0.0s
[CV] max_depth=50, min_samples_split=100 .............................
[CV] . max_depth=50, min_samples_split=100, score=0.886, total=   0.0s
[CV] max_depth=50, min_samples_split=500 .............................
[CV] . max_depth=50, min_samples_split=500, score=0.000, total=   0.0s
[CV] max_depth=50, min_samples_split=500 .............................
[CV] . max_depth=50, min_samples_split=500, score=0.000, total=   0.0s
[CV] max_depth=50, min_samples_split=500 .............................
[CV] . max_depth=50, min_samples_split=500, score=0.000, total=   0.0s
[CV] max_depth=100, min_samples_split=5 ..............................
[CV] .

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:    0.4s finished


In [57]:
clf_dt.fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)
print(f1_score(y_test, y_pred_dt), clf_dt)

0.8148148148148149 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=100,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [68]:
df = pd.read_csv('All_Accounts.csv')
df = df.drop(columns = ['accountName', 'label'])
dot_data = export_graphviz(clf_dt, out_file= None, feature_names= list(df.columns), filled= True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('DecisionTreeVisualization.png')

True

In [58]:
conclusion_table = PrettyTable()
conclusion_table.field_names = ['Classifier', 'Best Hyperparameter', 'f1-score on Unseen Data']

conclusion_table.add_row(['kNN', 'k= 9', '0.84'])
conclusion_table.add_row(['Naive Bayes', 'alpha= 0.00001', '0.71'])
conclusion_table.add_row(['Logistic Regression', 'C= 1000, L2 Regularizer', '0.83'])
conclusion_table.add_row(['Linear SVM', 'alpha= 10000, L2 Regularizer', '0.62'])
conclusion_table.add_row(['Decision Tree', 'max_depth= 5, min_samples_split= 100', '0.81'])

print(conclusion_table)

+---------------------+--------------------------------------+-------------------------+
|      Classifier     |         Best Hyperparameter          | f1-score on Unseen Data |
+---------------------+--------------------------------------+-------------------------+
|         kNN         |                 k= 9                 |           0.84          |
|     Naive Bayes     |            alpha= 0.00001            |           0.71          |
| Logistic Regression |       C= 1000, L2 Regularizer        |           0.83          |
|      Linear SVM     |     alpha= 10000, L2 Regularizer     |           0.62          |
|    Decision Tree    | max_depth= 5, min_samples_split= 100 |           0.81          |
+---------------------+--------------------------------------+-------------------------+
