In [57]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [24]:
df = pd.read_csv('drug_consumption.csv')
df.head()

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [65]:
# drop rows where Semer (fake drug) is not CL0 or CL1. Assume invalid row.
df_clean = df.loc[(df['Semer'] == 'CL0') | (df['Semer'] == 'CL1')]

# convert 'CLO' and 'CL1' into 0 and 'CL2'-'CL6' into 1
df_clean = df_clean.replace(to_replace=['CL0', 'CL1', 'CL2', 'CL3', 'CL4', 'CL5', 'CL6'], value=[0, 0, 1, 1, 1, 1, 1])
df_clean.head()

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,0,0,0,0,0,0,0,1,0,0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,1,0,1,0,1,1,0,1,0,0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,0,0,0,0,0,0,0,0,0,0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,0,0,1,0,0,0,0,1,0,0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,0,0,0,0,0,0,1,1,0,0


In [66]:
X = df_clean.drop(columns=['Meth', 'ID'])
y = df_clean['Meth']

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [68]:
sm = SMOTE(random_state=1)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

names = ["Nearest Neighbors", "Linear SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]


In [63]:
scores = cross_val_score(classifier, X_train_res, y_train_res, cv=5)
print(f"scores: {scores}\n95% CI: {round(scores.mean(), 2)} ",
     f"(+/- {round(scores.std() * 2, 2)})")

scores: [0.83561644 0.84703196 0.85779817 0.84633028 0.86009174]
95% CI: 0.85  (+/- 0.02)


In [69]:
for name, clf in zip(names, classifiers):
    clf.fit(X_train_res, y_train_res)
    preds = clf.predict(X_test)
    print(f"{name}\naccuracy: {accuracy_score(y_test, preds)}\nF1: ",
        f"{f1_score(y_test, preds)}\nconfusion matrix: \n", 
        f"{confusion_matrix(y_test, preds, labels=[0, 1])}")

Nearest Neighbors
accuracy: 0.7382978723404255
F1:  0.5591397849462366
confusion matrix: 
 [[269 103]
 [ 20  78]]
Linear SVM
accuracy: 0.7914893617021277
F1:  0.6230769230769231
confusion matrix: 
 [[291  81]
 [ 17  81]]
Gaussian Process
accuracy: 0.8063829787234043
F1:  0.5844748858447489
confusion matrix: 
 [[315  57]
 [ 34  64]]
Decision Tree
accuracy: 0.8042553191489362
F1:  0.6290322580645161
confusion matrix: 
 [[300  72]
 [ 20  78]]
Random Forest
accuracy: 0.7957446808510639
F1:  0.616
confusion matrix: 
 [[297  75]
 [ 21  77]]
AdaBoost
accuracy: 0.823404255319149
F1:  0.6244343891402715
confusion matrix: 
 [[318  54]
 [ 29  69]]
Naive Bayes
accuracy: 0.7659574468085106
F1:  0.5955882352941176
confusion matrix: 
 [[279  93]
 [ 17  81]]
QDA
accuracy: 0.7829787234042553
F1:  0.5853658536585366
confusion matrix: 
 [[296  76]
 [ 26  72]]
