**Model Develop & Compare**

#### 1.import classifiers

In [0]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC  
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

####2. Load DTM File and split as train & test

In [0]:

from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

drugAbuse_DTM = pd.read_csv('abuse_tfidf_df_30.txt', delimiter = '\t')

X = drugAbuse_DTM.drop('abused', axis = 1)
y = drugAbuse_DTM.abused


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 123)

Saving abuse_tfidf_df_30.txt to abuse_tfidf_df_30 (1).txt
User uploaded file "abuse_tfidf_df_30.txt" with length 1844090 bytes


In [0]:
from sklearn.utils import resample

In [0]:
drugAbuse_DTM.abused.value_counts()

0    2570
1     447
Name: abused, dtype: int64

In [0]:

df_abused = drugAbuse_DTM[drugAbuse_DTM.abused == 1]
df_nonabused = drugAbuse_DTM[drugAbuse_DTM.abused == 0]
"""
df_abuse_upsampled = resample(df_abused, 
                                 replace=True,     
                                 n_samples=2570,    
                                 random_state=123)
                                 """

In [0]:
from imblearn.over_sampling import SMOTE
X_train_SMOTE, y_train_SMOTE = SMOTE().fit_resample(X_train, y_train)

In [0]:
from collections import Counter
print(sorted(Counter(y_train_SMOTE).items()))

[(0, 2064), (1, 2064)]


####3. Predict & Create report of Every Model


In [0]:
def compare_models(classifiers, X, y):

  reports = []
  accuracies = []
  recalls = []
  matrix = []
  
  for _classi in classifiers:
    _predicted = _classi.predict(X = X)
    _report = metrics.classification_report(y, _predicted)
    _accuracy = metrics.accuracy_score(y, _predicted)
    _recall = metrics.recall_score(y, _predicted)
    _matrix = metrics.confusion_matrix(y, _predicted)
    
    reports.append(_report)
    accuracies.append(_accuracy)
    recalls.append(_recall)
    matrix.append(_matrix)
  
  return (reports, accuracies, recalls, matrix)

In [0]:
from sklearn.model_selection import cross_val_score

knn = KNeighborsClassifier()
svm = SVC(kernel = 'linear', random_state = 123)
svm_rbf = SVC(kernel = 'rbf', random_state = 123, gamma = 'scale')
svm_rbf_2 = SVC(kernel = 'rbf', random_state = 123, gamma = 2)
dt = DecisionTreeClassifier(random_state = 123)
nb = GaussianNB()

_models = [knn, svm, svm_rbf, svm_rbf_2, dt, nb]

for classifier in _models:
  classifier.fit(X = X_train_SMOTE, y = y_train_SMOTE)


train_report, train_accuracies, train_recall, train_matrix = compare_models(_models, X_train_SMOTE, y_train_SMOTE)
test_report, test_accuracies, test_recall, test_matrix = compare_models(_models, X_test, y_test)


In [0]:
from sklearn.model_selection import cross_val_score

def cross_validation(_models):
  
  mean_scores = []
  for _model in _models:
    scores = cross_val_score(_model, X , y, scoring = 'accuracy', cv = 10)
    mean_scores.append(scores.mean())
  
  return mean_scores

_models = [knn, svm, svm_rbf, svm_rbf_2, dt, nb]
validations = cross_validation(_models)

In [0]:
test_recall

[0.47959183673469385,
 0.5714285714285714,
 0.12244897959183673,
 0.09183673469387756,
 0.20408163265306123,
 0.6224489795918368]

In [0]:
models = ['KNN', 'SVM_Linear', 'SVM_rbf', 'SVM_rbf2', 'DecisionTree','NaiveBayesian']

pd.DataFrame(list(zip(models, train_accuracies, test_accuracies, train_recall, test_recall, validations)), columns = ['Model', 'Train Accuracy', 'Test Accuracy', 'Train Recall','Test Recall','Validation'])

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train Recall,Test Recall,Validation
0,KNN,0.827035,0.55298,0.97093,0.479592,0.837259
1,SVM_Linear,0.767684,0.682119,0.833333,0.571429,0.851842
2,SVM_rbf,0.959545,0.806291,0.941376,0.122449,0.85151
3,SVM_rbf2,0.96875,0.799669,0.957364,0.091837,0.848199
4,DecisionTree,0.977229,0.759934,0.958818,0.204082,0.782577
5,NaiveBayesian,0.722384,0.574503,0.903585,0.622449,0.516061


In [0]:
test_matrix

[array([[287, 219],
        [ 51,  47]]), array([[356, 150],
        [ 42,  56]]), array([[475,  31],
        [ 86,  12]]), array([[474,  32],
        [ 89,   9]]), array([[439,  67],
        [ 78,  20]]), array([[286, 220],
        [ 37,  61]])]

In [0]:
dt.get_depth()

104

####4. Trace Training & Test Accuracy by changing Max_Depth

In [0]:
# Decision Tree Figure max_depth

def figure_min_sample_split(samples, max_depth):

  train = []
  test = []
  
  for depth in max_depth:
    for sample in samples:
      dt = DecisionTreeClassifier(random_state = 123, min_samples_split = sample)
      dt.fit(X_train_SMOTE, y_train_SMOTE)
      train_predicted = dt.predict(X = X_train_SMOTE)
      train_accuracy = metrics.accuracy_score(y_train_SMOTE, train_predicted)
      train_recall = metrics.recall_score(y_train_SMOTE, train_predicted)

      test_predicted = dt.predict(X = X_test)
      test_accuracy = metrics.accuracy_score(y_test, test_predicted)
      test_recall = metrics.recall_score(y_test, test_predicted)

      train.append(train_recall)
      test.append(test_recall)

  return(train, test)

    
#depths = [i for i in range(5, 101) if i%5 == 0]
#(train_a, test_a) = figure_max_depth(depths)
#pd.DataFrame(list(zip(depths, train_a, test_a)), columns = ['Max_Depth', 'Train Accuracy', 'Test Accuracy'])
samples = [i for i in range(2,25,4)]
max_depths = [i for i in range(5, 101, 5)]

(train_a, test_a) = figure_min_sample_split(samples, max_depths)
pd.DataFrame(list(zip(samples, train_a, test_a)), columns = ['samples', 'Train Recall', 'Test Recall'])

Unnamed: 0,samples,Train Recall,Test Recall
0,2,0.958818,0.204082
1,6,0.945252,0.193878
2,10,0.928779,0.234694
3,14,0.921027,0.244898
4,18,0.917636,0.234694
5,22,0.916182,0.234694


In [0]:
test_a

[0.20408163265306123,
 0.19387755102040816,
 0.23469387755102042,
 0.24489795918367346,
 0.23469387755102042,
 0.23469387755102042,
 0.20408163265306123,
 0.19387755102040816,
 0.23469387755102042,
 0.24489795918367346,
 0.23469387755102042,
 0.23469387755102042,
 0.20408163265306123,
 0.19387755102040816,
 0.23469387755102042,
 0.24489795918367346,
 0.23469387755102042,
 0.23469387755102042,
 0.20408163265306123,
 0.19387755102040816,
 0.23469387755102042,
 0.24489795918367346,
 0.23469387755102042,
 0.23469387755102042,
 0.20408163265306123,
 0.19387755102040816,
 0.23469387755102042,
 0.24489795918367346,
 0.23469387755102042,
 0.23469387755102042,
 0.20408163265306123,
 0.19387755102040816,
 0.23469387755102042,
 0.24489795918367346,
 0.23469387755102042,
 0.23469387755102042,
 0.20408163265306123,
 0.19387755102040816,
 0.23469387755102042,
 0.24489795918367346,
 0.23469387755102042,
 0.23469387755102042,
 0.20408163265306123,
 0.19387755102040816,
 0.23469387755102042,
 0.2448979


#### Decision Tree Optimization

In [0]:
from sklearn.model_selection import GridSearchCV


ssr = list(range(2, 25))
param_grid = dict(min_samples_split=ssr)
grid = GridSearchCV(dt, param_grid, cv=10, scoring='accuracy')
grid.fit(X = X_train, y = y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=123,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                               12, 13, 14

In [0]:
print(grid.best_score_)
print(grid.best_params_)

0.872568093385214
{'min_samples_split': 4}
