In [77]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from collections import Counter
import pandas as pd
import numpy as np
import xgboost as xgb

In [43]:
molecule_matrix = pd.read_csv('./Dataset/Molecule_matrix_mold2.csv',dtype={'CID':'str'})

In [44]:
molecule_matrix.head()

Unnamed: 0,CID,D001,D002,D003,D004,D005,D006,D007,D008,D009,...,D769,D770,D771,D772,D773,D774,D775,D776,D777,outcome
0,6603008,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,4.7,-0.627,0.273,1.574,inactive
1,6602571,1,0,0,1,2,0,0,0,0,...,0,0,0,0,0,4.907,-0.729,0.24,1.78,inactive
2,6602616,1,0,0,1,2,0,0,0,0,...,0,0,0,0,0,5.087,-0.196,0.207,2.447,inactive
3,644371,1,0,0,0,1,1,0,0,0,...,0,0,0,0,0,4.644,-0.787,0.286,2.756,inactive
4,6603132,2,0,0,0,3,0,0,0,0,...,0,0,0,0,0,5.0,-0.815,0.48,2.421,inactive


In [51]:
CID = np.array(molecule_matrix['CID'])
label = np.array(molecule_matrix['outcome'])
molecule_matrix = np.array(molecule_matrix.iloc[:,1:molecule_matrix.shape[1]-1])

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [46]:
print(molecule_matrix.shape)
print(Counter(label))

(302630, 648)
Counter({'inactive': 301747, 'active': 883})


#### Dimension Reduction

In [47]:
pca = PCA(n_components=10)
molecule_matrix_pca = pca.fit_transform(molecule_matrix)

In [48]:
molecule_matrix_pca.shape

(302630, 10)

#### Train and test split

In [52]:
label[label=='active']=1
label[label=='inactive']=0
label = label.astype('int')

In [58]:
ratio = {0:6400,1:880}
rus = RandomUnderSampler(random_state=8584096,ratio=ratio)b
x_resample,y_resample = rus.fit_sample(molecule_matrix_pca,label)

In [60]:
x_train,x_test,y_train,y_test = train_test_split(x_resample,y_resample,test_size = 0.2)

In [61]:
print(Counter(y_train))
print(Counter(y_test))

Counter({0: 5107, 1: 717})
Counter({0: 1293, 1: 163})


#### XGBoost

In [62]:
train = xgb.DMatrix(x_train,label=y_train)
test = xgb.DMatrix(x_test,label=y_test)

In [65]:
param = {'lambda':0.6,'max_depth':4,'scale_pos_weight':8,'objective':'binary:logistic','eval_metric':'auc','eta':0.8}
num_round = 20
# watchlist = [(test,'eval'),(train,'train')]

In [66]:
bst = xgb.train(dtrain=train,params=param,num_boost_round=num_round)

In [67]:
preds = bst.predict(test)

In [68]:
label = test.get_label()

In [69]:
preds_ = [1 if i>0.6 else 0 for i in preds]

In [70]:
np.mean(preds_!=label)

0.23008241758241757

In [73]:
confusion_matrix(y_true=label,y_pred=preds_)

array([[1035,  258],
       [  77,   86]])

In [75]:
tn,fp,fn,tp = confusion_matrix(y_true=label,y_pred=preds_).ravel()

In [76]:
print('Precision=', tp/(tp+fp))
print('Recall:',tp/(tp+fn))
print('False negative:',fn/(tp+fn))

Precision= 0.25
Recall: 0.527607361963
False negative: 0.472392638037


In [78]:
accuracy_score(y_pred=preds_,y_true=label)

0.76991758241758246