In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score
import graphviz, pydotplus
from sklearn.tree import export_graphviz
from prettytable import PrettyTable

In [2]:
def extractData(csvFile):
    df = pd.read_csv(csvFile)
    y = df['label']
    X = df.drop(columns = ['accountName', 'label', 'Unnamed: 0'])
    print(X.columns)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42, stratify = y)
    scaler = StandardScaler()
    scaler.fit_transform(X_train)
    scaler.transform(X_test)
    
    return scaler, X_train, X_test, y_train, y_test

In [3]:
csv_file_name = 'all_accounts_unfiltered_2.csv'
main_dir_path = os.path.dirname(os.getcwd())
csv_file_path = os.path.join(main_dir_path, 'Data', 'FINAL_DATASETS', csv_file_name)
scaler, X_train, X_test, y_train, y_test = extractData(csv_file_path)

Index(['SubmToCommentRatio', 'daily_downtime'], dtype='object')


In [4]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1977, 2), (349, 2), (1977,), (349,))

In [5]:
clf_knn = KNeighborsClassifier()
k_list = [1, 5, 9, 13, 17, 21, 25, 31, 41, 51]
parameters = {'n_neighbors': k_list}
gs_cv_knn = GridSearchCV(clf_knn, parameters, cv = 3, scoring = 'accuracy')
gs_cv_knn.fit(X_train, y_train)
clf_knn = gs_cv_knn.best_estimator_

In [6]:
clf_knn.fit(X_train, y_train)
y_pred_knn = clf_knn.predict(X_test)
print(accuracy_score(y_test, y_pred_knn), f1_score(y_test, y_pred_knn), clf_knn)

0.9369627507163324 0.7179487179487181 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')


In [7]:
clf_lr = LogisticRegression(penalty= 'l1')
c_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
parameters = {'C' : c_list}
gs_cv_lr = GridSearchCV(clf_lr, parameters, cv = 3, scoring='accuracy')
gs_cv_lr.fit(X_train, y_train)
clf_lr = gs_cv_lr.best_estimator_



In [8]:
clf_lr.fit(X_train, y_train)
y_pred_lr = clf_lr.predict(X_test)
print(accuracy_score(y_test, y_pred_lr), f1_score(y_test, y_pred_lr), clf_lr)

0.8825214899713467 0.0 LogisticRegression(C=1e-05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


  'precision', 'predicted', average, warn_for)


In [9]:
clf_svm = SGDClassifier(loss= 'hinge', penalty='l2')
alpha_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
parameters = {'alpha' : alpha_list}
gs_cv_svm = GridSearchCV(clf_svm, parameters, cv = 3, scoring='accuracy')
gs_cv_svm.fit(X_train, y_train)
clf_svm = gs_cv_svm.best_estimator_



In [10]:
clf_svm.fit(X_train, y_train)
y_pred_svm = clf_svm.predict(X_test)
print(accuracy_score(y_test, y_pred_svm), f1_score(y_test, y_pred_svm), clf_svm)

0.8825214899713467 0.0 SGDClassifier(alpha=1, average=False, class_weight=None, early_stopping=False,
              epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
              learning_rate='optimal', loss='hinge', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


  'precision', 'predicted', average, warn_for)


In [11]:
clf_dt = DecisionTreeClassifier()
max_depth_list = [1, 5, 10, 50, 100, 500, 1000]
min_samples_split_list = [5, 10, 100, 500]
parameters = {'max_depth': max_depth_list, 'min_samples_split': min_samples_split_list}
gs_cv_dt = GridSearchCV(clf_dt, parameters, cv = 3, scoring='accuracy')
gs_cv_dt.fit(X_train, y_train)
clf_dt = gs_cv_dt.best_estimator_

In [12]:
clf_dt.fit(X_train, y_train)
y_pred_dt = clf_dt.predict(X_test)
print(accuracy_score(y_test, y_pred_dt), f1_score(y_test, y_pred_dt), clf_dt)

0.9598853868194842 0.8108108108108109 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [13]:
df = pd.read_csv(csv_file_path)
df = df.drop(columns = ['accountName', 'label', 'Unnamed: 0'])
dot_data = export_graphviz(clf_dt, out_file= None, feature_names= list(df.columns), filled= True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('DT_PLOTS/DecisionTreeVisualization_unfiltered_2.png')

True

In [14]:
from prettytable import PrettyTable

conclusion_table = PrettyTable()

conclusion_table.field_names = ['Classifier', 'f1-score on Unseen Data', 'accuracy on Unseen Data']

conclusion_table.add_row(['kNN', '71.79', '93.69'])
conclusion_table.add_row(['Logistic Regression', '0.0', '88.25'])
conclusion_table.add_row(['Linear SVM', '0.0', '88.25'])
conclusion_table.add_row(['Decision Tree', '81.01', '95.98'])

print(conclusion_table)

+---------------------+-------------------------+-------------------------+
|      Classifier     | f1-score on Unseen Data | accuracy on Unseen Data |
+---------------------+-------------------------+-------------------------+
|         kNN         |          71.79          |          93.69          |
| Logistic Regression |           0.0           |          88.25          |
|      Linear SVM     |           0.0           |          88.25          |
|    Decision Tree    |          81.01          |          95.98          |
+---------------------+-------------------------+-------------------------+


In [15]:
import matplotlib.pyplot as plt

import pickle

pred_train = clf_dt.predict(X_train)

pickle.dump((pred_train, y_train.values, y_pred_dt, y_test.values), open('CONF_MATRICES/unfiltered_2.p', 'wb') )