In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import jaccard_score, log_loss

import prepare, model

### Construct a model to predict MoA annotation cyclooxygenase inhibitor

In [2]:
# Load the feature table (X) and the scored target table (Y).
# The first CSV column becomes the index (named sig_id, per the output below).

X = pd.read_csv('train_features.csv', index_col=0)
Y = pd.read_csv('train_targets_scored.csv', index_col=0)

# Display their shapes as (rows, columns)
X.shape, Y.shape

((23814, 875), (23814, 206))

In [5]:
# Preview the first five rows of X
X.head()

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [3]:
# Isolate the label column cyclooxygenase_inhibitor from the target table
y = Y.loc[:, 'cyclooxygenase_inhibitor']

# Preview the first five labels
y.head()

sig_id
id_000644bb2    0
id_000779bfc    0
id_000a6266a    0
id_0015fd391    0
id_001626bd3    0
Name: cyclooxygenase_inhibitor, dtype: int64

In [6]:
# Append the label y to X as the last column (pd.concat with axis=1 aligns rows on the index)
df = pd.concat([X, y], axis=1)

# Preview the combined frame
df.head()

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99,cyclooxygenase_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176,0
id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371,0
id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931,0
id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154,0
id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125,0


In [3]:
# Load the combined features + label table from disk.
# NOTE(review): this rebinds `df`, replacing the frame built with pd.concat above.
# The CSV is presumably that same frame saved earlier; the step that writes it
# is not shown in this notebook -- confirm before relying on a fresh run.
df = pd.read_csv('classify_cyclooxygenase_inhibitor.csv', index_col=0)
df.head()

Unnamed: 0_level_0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99,cyclooxygenase_inhibitor
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,-1.022,...,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176,0
id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,0.2341,...,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371,0
id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,0.1715,...,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931,0
id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,-1.959,...,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154,0
id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,-0.28,...,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125,0


In [4]:
# Data Preparation
# The project helper returns a scaler plus the train / validate / test frames.
# Presumably it one-hot encodes the cp_* columns and min-max scales the g-/c-
# columns (judging by the column names shown below) -- see prepare.py to confirm.

scaler, train_scaled, validate_scaled, test_scaled = prepare.classifiy_cyclooxygenase_inhibitor(df)

# Display the scaler and the shape of each split
scaler, train_scaled.shape, validate_scaled.shape, test_scaled.shape

(MinMaxScaler(copy=True, feature_range=(0, 1)),
 (17145, 877),
 (4287, 877),
 (2382, 877))

In [5]:
# Preview the first five rows of the scaled training split
train_scaled.head()

Unnamed: 0_level_0,cyclooxygenase_inhibitor,cp_type_trt_cp,cp_time_48,cp_time_72,cp_dose_D2,g-0_scaled,g-1_scaled,g-2_scaled,g-3_scaled,g-4_scaled,...,c-90_scaled,c-91_scaled,c-92_scaled,c-93_scaled,c-94_scaled,c-95_scaled,c-96_scaled,c-97_scaled,c-98_scaled,c-99_scaled
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_848dcc064,0,1,1,0,0,0.278145,0.438289,0.514682,0.429166,0.409585,...,0.752996,0.778936,0.71431,0.794292,0.723889,0.851225,0.74973,0.849285,0.781899,0.784009
id_b9ce40651,0,1,0,0,0,0.301602,0.46096,0.464432,0.374016,0.35635,...,0.727436,0.733129,0.694349,0.74858,0.7644,0.74371,0.77672,0.833604,0.806951,0.80564
id_a23c124dd,0,1,0,0,1,0.351926,0.524016,0.515898,0.40987,0.399615,...,0.752918,0.791095,0.706096,0.776449,0.661817,0.778625,0.821844,0.809407,0.714703,0.618445
id_632f29352,0,1,0,0,0,0.331621,0.518773,0.543719,0.407282,0.320056,...,0.738325,0.794703,0.587779,0.80865,0.703848,0.811472,0.791514,0.774037,0.719991,0.771593
id_db60af3c3,0,1,1,0,1,0.309449,0.436247,0.496584,0.438492,0.354072,...,0.823442,0.733865,0.788246,0.732076,0.73691,0.825784,0.760141,0.882725,0.826118,0.813803


In [6]:
# Split each prepared frame into features (X_*) and label (y_*).
# The label is selected -- and removed from the features -- by name, so the
# split does not rely on the label being the first column and the label
# cannot leak into the feature matrix if the column order ever changes.

target = 'cyclooxygenase_inhibitor'

X_train = train_scaled.drop(columns=target)
X_validate = validate_scaled.drop(columns=target)
X_test = test_scaled.drop(columns=target)

y_train = train_scaled.loc[:, target]
y_validate = validate_scaled.loc[:, target]
y_test = test_scaled.loc[:, target]

In [7]:
# Preview the first five rows of the training features
X_train.head()

Unnamed: 0_level_0,cp_type_trt_cp,cp_time_48,cp_time_72,cp_dose_D2,g-0_scaled,g-1_scaled,g-2_scaled,g-3_scaled,g-4_scaled,g-5_scaled,...,c-90_scaled,c-91_scaled,c-92_scaled,c-93_scaled,c-94_scaled,c-95_scaled,c-96_scaled,c-97_scaled,c-98_scaled,c-99_scaled
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id_848dcc064,1,1,0,0,0.278145,0.438289,0.514682,0.429166,0.409585,0.56983,...,0.752996,0.778936,0.71431,0.794292,0.723889,0.851225,0.74973,0.849285,0.781899,0.784009
id_b9ce40651,1,0,0,0,0.301602,0.46096,0.464432,0.374016,0.35635,0.544769,...,0.727436,0.733129,0.694349,0.74858,0.7644,0.74371,0.77672,0.833604,0.806951,0.80564
id_a23c124dd,1,0,0,1,0.351926,0.524016,0.515898,0.40987,0.399615,0.623608,...,0.752918,0.791095,0.706096,0.776449,0.661817,0.778625,0.821844,0.809407,0.714703,0.618445
id_632f29352,1,0,0,0,0.331621,0.518773,0.543719,0.407282,0.320056,0.651487,...,0.738325,0.794703,0.587779,0.80865,0.703848,0.811472,0.791514,0.774037,0.719991,0.771593
id_db60af3c3,1,1,0,1,0.309449,0.436247,0.496584,0.438492,0.354072,0.583075,...,0.823442,0.733865,0.788246,0.732076,0.73691,0.825784,0.760141,0.882725,0.826118,0.813803


In [8]:
# Preview the first five training labels (y_train)
y_train.head()

sig_id
id_848dcc064    0
id_b9ce40651    0
id_a23c124dd    0
id_632f29352    0
id_db60af3c3    0
Name: cyclooxygenase_inhibitor, dtype: int64

In [9]:
# Class balance of the training labels.
# Always predicting the majority class (0) already scores ~0.982 accuracy, so
# accuracy alone says little here; precision and recall on class 1 matter more.
y_train.value_counts(normalize=True)

0    0.981744
1    0.018256
Name: cyclooxygenase_inhibitor, dtype: float64

In [10]:
# Empty results table; the cells below add one row per (model, data split) evaluation
eval_df = pd.DataFrame(columns = ['model_type', 'data_type', 'accuracy', 'precision', 'recall'])
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall


In [13]:
# Logistic Regression: fit on the training split and score its training predictions

lr = LogisticRegression(C=1, random_state=123)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('Logistic Regression', 'train', y_train, y_pred)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0


In [16]:
# Decision Tree: fit on the training split and score its training predictions

clf = DecisionTreeClassifier(max_depth=5, random_state=123)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('Decision Tree', 'train', y_train, y_pred)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974


In [20]:
# Random Forest: fit on the training split and score its training predictions

rf = RandomForestClassifier(n_estimators=100,
                            max_depth=3,
                            min_samples_split=2,
                            min_samples_leaf=5,
                            random_state=123)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_train)

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('Random Forest', 'train', y_train, y_pred)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0


In [21]:
# Linear SVM: fit on the training split and score its training predictions

clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('Linear SVM', 'train', y_train, y_pred)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0


In [22]:
# RBF SVM: fit on the training split and score its training predictions.
# NOTE: `clf` and `y_pred` from this cell are read again by later cells
# (the confusion matrix and the validate/test evaluation), so no cell in
# between may rebind them.

clf = SVC(gamma=2, C=1, random_state=123)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train) # Took about 22 mins.

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('RBF SVM', 'train', y_train, y_pred)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415


In [23]:
# Confusion matrix for RBF SVM on train (rows: true class, columns: predicted class)
confusion_matrix(y_train, y_pred)

array([[16832,     0],
       [    3,   310]])

In [24]:
# AdaBoostClassifier: fit on the training split and score its training predictions

abc = AdaBoostClassifier(random_state=123)
abc.fit(X_train, y_train)
y_pred_abc = abc.predict(X_train) # Took 1.5 mins

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('AdaBoost', 'train', y_train, y_pred_abc)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195


In [25]:
# Confusion matrix for AdaBoost on train (rows: true class, columns: predicted class)
confusion_matrix(y_train, y_pred_abc)

array([[16824,     8],
       [  312,     1]])

In [27]:
# Gaussian Naive Bayes: fit on the training split and score its training predictions

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_train)

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('Naive Bayes', 'train', y_train, y_pred_nb)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.02422,0.977636


In [28]:
# Confusion matrix for Naive Bayes on train (rows: true class, columns: predicted class)
confusion_matrix(y_train, y_pred_nb)

array([[ 4504, 12328],
       [    7,   306]])

In [29]:
# Quadratic Discriminant Analysis: fit on the training split and score its training predictions

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
y_pred_qda = qda.predict(X_train)

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('QDA', 'train', y_train, y_pred_qda)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.02422,0.977636
7,QDA,train,0.981744,,0.0


In [30]:
# Confusion matrix for QDA on train (rows: true class, columns: predicted class)
confusion_matrix(y_train, y_pred_qda)

array([[16832,     0],
       [  313,     0]])

In [31]:
# K-Nearest Neighbors (k=3): fit on the training split and score its training predictions

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_train) # Took 8 mins.

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('KNN', 'train', y_train, y_pred_knn)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.02422,0.977636
7,QDA,train,0.981744,,0.0
8,KNN,train,0.98291,0.8125,0.083067


In [32]:
# Confusion matrix for KNN on train (rows: true class, columns: predicted class)
confusion_matrix(y_train, y_pred_knn)

array([[16826,     6],
       [  287,    26]])

In [33]:
# Neural Net (MLP, default settings): fit on the training split and score its training predictions

mlp = MLPClassifier(random_state=123)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_train)

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('Neural Net', 'train', y_train, y_pred_mlp)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.02422,0.977636
7,QDA,train,0.981744,,0.0
8,KNN,train,0.98291,0.8125,0.083067
9,Neural Net,train,0.981744,,0.0


In [34]:
# Confusion matrix for Neural Net on train (rows: true class, columns: predicted class)
confusion_matrix(y_train, y_pred_mlp)

array([[16832,     0],
       [  313,     0]])

In [35]:
# Neural Net-2 (MLP with alpha=1): fit on the training split and score its training predictions

mlp2 = MLPClassifier(alpha=1, random_state=123)
mlp2.fit(X_train, y_train)
y_pred_mlp2 = mlp2.predict(X_train)

# d is expected to be a dict (or Series) keyed by eval_df's column names
d = model.estimator_report('Neural Net-2', 'train', y_train, y_pred_mlp2)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.02422,0.977636
7,QDA,train,0.981744,,0.0
8,KNN,train,0.98291,0.8125,0.083067
9,Neural Net,train,0.981744,,0.0


In [36]:
# Confusion matrix for Neural Net-2 on train (rows: true class, columns: predicted class)
confusion_matrix(y_train, y_pred_mlp2)

array([[16832,     0],
       [  313,     0]])

In [40]:
# Add a majority-class baseline to the evaluation table.
# The baseline always predicts class 0 for every training sample.

y_pred_b = [0]*y_train.shape[0]

# Build the baseline row from the data instead of hard-coding the numbers:
#   data_type - set here, so the row does not need to be patched afterwards
#   accuracy  - share of training labels that match the constant prediction
#   precision - undefined (no positive predictions); stored as a float NaN, not
#               the string 'nan', so this row does not force the column to object dtype
#   recall    - 0.0 because no positive sample is ever predicted
d = {'model_type': 'baseline',
     'data_type': 'train',
     'accuracy': float(np.mean(np.asarray(y_pred_b) == y_train.values)),
     'precision': np.nan,
     'recall': 0.0}

# Add the baseline row to eval_df
# (DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement)
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)

# Display eval_df
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.0242204,0.977636
7,QDA,train,0.981744,,0.0
8,KNN,train,0.98291,0.8125,0.083067
9,Neural Net,train,0.981744,,0.0


In [44]:
# Make sure the baseline row carries data type 'train'.
# The row is selected by model name rather than by a hard-coded position
# (previously iloc[11, 1]), so this keeps working when rows are added or reordered.
eval_df.loc[eval_df['model_type'] == 'baseline', 'data_type'] = 'train'
eval_df

Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.0242204,0.977636
7,QDA,train,0.981744,,0.0
8,KNN,train,0.98291,0.8125,0.083067
9,Neural Net,train,0.981744,,0.0


In [46]:
# Rank the models by accuracy, best first (displays a sorted copy; eval_df itself is unchanged)
eval_df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,model_type,data_type,accuracy,precision,recall
4,RBF SVM,train,0.999825,1.0,0.990415
8,KNN,train,0.98291,0.8125,0.083067
1,Decision Tree,train,0.982036,1.0,0.015974
0,Logistic Regression,train,0.981744,,0.0
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
7,QDA,train,0.981744,,0.0
9,Neural Net,train,0.981744,,0.0
10,Neural Net-2,train,0.981744,,0.0
11,baseline,train,0.981744,,0.0


### Evaluate the top model (RBF SVM) on validate and test

In [48]:
# RBF SVM on validate (took 1.5 min)

# Confirm that `clf` is the RBF SVM (the name is reused by several earlier cells)
print(clf)

# Predict on the validation split and record the metrics
y_pred_v = clf.predict(X_validate)
d = model.estimator_report('RBF SVM', 'validate', y_validate, y_pred_v)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2, kernel='rbf', max_iter=-1,
    probability=False, random_state=123, shrinking=True, tol=0.001,
    verbose=False)


Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.0242204,0.977636
7,QDA,train,0.981744,,0.0
8,KNN,train,0.98291,0.8125,0.083067
9,Neural Net,train,0.981744,,0.0


In [49]:
# Confusion matrix for RBF SVM on validate (rows: true class, columns: predicted class)
confusion_matrix(y_validate, y_pred_v)

array([[4209,    0],
       [  78,    0]])

In [50]:
# RBF SVM on test

# Confirm that `clf` is the RBF SVM (the name is reused by several earlier cells)
print(clf)

# Predict on the test split and record the metrics
y_pred_t = clf.predict(X_test)
d = model.estimator_report('RBF SVM', 'test', y_test, y_pred_t)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
eval_df = pd.concat([eval_df, pd.DataFrame([d])], ignore_index=True)
eval_df

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2, kernel='rbf', max_iter=-1,
    probability=False, random_state=123, shrinking=True, tol=0.001,
    verbose=False)


Unnamed: 0,model_type,data_type,accuracy,precision,recall
0,Logistic Regression,train,0.981744,,0.0
1,Decision Tree,train,0.982036,1.0,0.015974
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
4,RBF SVM,train,0.999825,1.0,0.990415
5,AdaBoost,train,0.981336,0.111111,0.003195
6,Naive Bayes,train,0.280548,0.0242204,0.977636
7,QDA,train,0.981744,,0.0
8,KNN,train,0.98291,0.8125,0.083067
9,Neural Net,train,0.981744,,0.0


In [51]:
# Confusion matrix for RBF SVM on test (rows: true class, columns: predicted class)
confusion_matrix(y_test, y_pred_t)

array([[2338,    0],
       [  44,    0]])

In [52]:
# Rank all recorded evaluations (train, validate and test rows) by accuracy, best first
eval_df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,model_type,data_type,accuracy,precision,recall
4,RBF SVM,train,0.999825,1.0,0.990415
8,KNN,train,0.98291,0.8125,0.083067
1,Decision Tree,train,0.982036,1.0,0.015974
12,RBF SVM,validate,0.981805,,0.0
0,Logistic Regression,train,0.981744,,0.0
2,Random Forest,train,0.981744,,0.0
3,Linear SVM,train,0.981744,,0.0
7,QDA,train,0.981744,,0.0
9,Neural Net,train,0.981744,,0.0
10,Neural Net-2,train,0.981744,,0.0


In [53]:
# Optional: uncomment the next line to save the evaluation table to disk
# eval_df.to_csv('evalute_model_cyclooxygenase_inhibitor.csv')

**Results and Conclusions**
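1. 11 models were trained to predict the cyclooxygenase inhibitor annotation. Accuracy, precision and recall were recorded for each model; because only about 1.8% of the training samples are positive, precision and recall are the informative metrics, even though the tables above are ranked by accuracy.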
    - 6 of them perform the same as the baseline:
        - LR
        - Random Forest
        - Linear SVM
        - QDA
        - Neural Net
        - Neural Net-2
    - 2 of them perform worse than the baseline:
        - AdaBoost
        - Naive Bayes
    - 2 of them perform slightly better than the baseline:
        - KNN
        - Decision Tree
    - The best model on train is RBF SVM.
2. However, the RBF SVM overfits the training data: on validate and test it predicts no positive samples at all (recall 0), which is no better than the baseline. The same pattern was observed when it was used to predict nfkb inhibitor.