##### Source:
imblearn Documentation
- https://github.com/scikit-learn-contrib/imbalanced-learn 
- http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html

In [1]:
import zipfile
import pandas as pd
import numpy as np
import glob
from scipy import sparse
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
molecule_descriptor_files = []
for file in glob.glob('./Dataset/PubChem_Molecule_Descriptor/PubChem/AID_2289_descriptors_part[0-9]*.csv'):
    molecule_descriptor_files.append(file)

### Drug Bank

In [3]:
drug_link = pd.read_csv('./Dataset/drug_links.csv',dtype={'PubChem Compound ID':'str'})

In [4]:
drug_link = drug_link.dropna(subset=['PubChem Compound ID'])

In [5]:
drug_map = dict(zip(drug_link['PubChem Compound ID'],drug_link['Name']))

### Prepare training data

#### Use counter-screen (AID: 588342) to filter out false positive

Counter-screen: A screen performed in parallel with or after the primary screen. The assay used in the counter-screen is developed to identify compounds that have the potential to interfere with the assay used in the primary screen (the primary assay). Counter-screens can also be used to eliminate compounds that possess undesirable properties, for example, a counter-screen for cytotoxicity.

In [116]:
fp = pd.read_csv('./Dataset/AID_588342_descriptors_part1.csv',dtype={'Row':'str'})

In [117]:
fp_id = fp.iloc[:,0]
fp_id = fp_id[pd.notna(fp_id)]

In [118]:
# only first file contains active molecule, filter out false positive first
molecule_matrix = pd.read_csv(molecule_descriptor_files[0],dtype={'Row':'str'})
features = molecule_matrix.columns[1:180] # feature names except the first column (molecule ID)

In [119]:
# find five false positive molecules
drop_idx = molecule_matrix[(molecule_matrix['outcome']=='active') & (molecule_matrix['Row'].isin(fp_id))].index
molecule_matrix.drop(drop_idx,inplace=True)
molecule_matrix = molecule_matrix.iloc[:,:181]
molecule_matrix = np.array(molecule_matrix.dropna(axis=0))

In [120]:
%%time
for file in molecule_descriptor_files[1:]:
    f = pd.read_csv(file,dtype={'Row':'str'})
    f = f.iloc[:,:181] # the first column is molecule ID column
    f = np.array(f.dropna(axis=0)) # drop rows with NA 
    molecule_matrix = np.vstack((molecule_matrix,f))

CPU times: user 9.29 s, sys: 2.05 s, total: 11.3 s
Wall time: 11.3 s


In [123]:
label = molecule_matrix[:,molecule_matrix.shape[1]-1]

In [124]:
moleculeID = molecule_matrix[:,0]

In [129]:
molecule_matrix = molecule_matrix[:,1:180]

In [131]:
print('Number of molecule ID:',len(moleculeID))
print('Number of label:',len(label))
print('Shape:',molecule_matrix.shape)

Number of molecule ID: 307783
Number of label: 307783
Shape: (307783, 179)


In [136]:
Counter(label)

Counter({'active': 3282, 'inactive': 304501})

### Handling the imbalance

##### 1). Under-sampling: Random majority under-sampling with replacement

In [62]:
%%time
from imblearn.under_sampling import RandomUnderSampler

molecule_matrix = sparse.csr_matrix(molecule_matrix)

ratio = {0:3287, 1:3287}
undersam = RandomUnderSampler(ratio=ratio,return_indices=True)
X_resample,y_resample, idx_resample = undersam.fit_sample(molecule_matrix,ylabel)

# # under-sampling
# # make use of k-means
# # source: http://contrib.scikit-learn.org/imbalanced-learn/stable/under_sampling.html
# from imblearn.under_sampling import ClusterCentroids
# cc = ClusterCentroids(random_state=0)
# X_resample, y_resample = cc.fit_sample(xtrain,y_train)
# print(sorted(Counter(y_resample).items()))

CPU times: user 127 ms, sys: 12 ms, total: 139 ms
Wall time: 137 ms


### Split the train and test

In [63]:
X_resample.shape

(6574, 179)

In [64]:
x_train,x_test,y_train,y_test = train_test_split(X_resample,y_resample)

In [65]:
# active class ratio
y_tr = dict(Counter(y_train))
print(y_tr[1]/(y_tr[0]+y_tr[1]))
y_te = dict(Counter(y_test))
print(y_te[1]/(y_te[0]+y_te[1]))

0.5010141987829615
0.49695863746958635


### Random Forest Classifier

In [66]:
highest_cross_val_accu = -1
best_t = None
indices = range(x_train.shape[0])
tree = np.arange(10,500,50)
kf = KFold(n_splits=5,shuffle=True,random_state=28584096)

for n_tree in tree:
    accuracies = []
    for train_indices, val_indices in kf.split(indices):
        x_tr = x_train[train_indices]
        y_tr = y_train[train_indices]
        cf = RandomForestClassifier(n_estimators=n_tree)
        cf.fit(x_tr,y_tr)
        pred = cf.predict(x_train[val_indices])
        acc = accuracy_score(y_true=y_train[val_indices],y_pred=pred)
        accuracies.append(acc)
    cross_val_acc = np.mean(accuracies)
    print('N_tree: ', n_tree, 'cross validation accuracy:', cross_val_acc)
    if cross_val_acc > highest_cross_val_accu:
        highest_cross_val_accu = cross_val_acc
        best_t = n_tree
print('Best N_tree: ',best_t, '; cross-validation accuracy: ',highest_cross_val_accu)

N_tree:  10 cross validation accuracy: 0.52860040568
N_tree:  60 cross validation accuracy: 0.522718052738
N_tree:  110 cross validation accuracy: 0.530020283976
N_tree:  160 cross validation accuracy: 0.533671399594
N_tree:  210 cross validation accuracy: 0.530425963489
N_tree:  260 cross validation accuracy: 0.529006085193
N_tree:  310 cross validation accuracy: 0.534888438134
N_tree:  360 cross validation accuracy: 0.534482758621
N_tree:  410 cross validation accuracy: 0.528194726166
N_tree:  460 cross validation accuracy: 0.525963488844
Best N_tree:  310 ; cross-validation accuracy:  0.534888438134


In [67]:
rf_cv = RandomForestClassifier(n_estimators=best_t)
rf_cv.fit(x_train,y_train)
pred_cv_train = rf_cv.predict(x_train)
y_pred = rf_cv.predict(x_test)
print('Train accuracy:', accuracy_score(y_true=y_train,y_pred=pred_cv_train))
print('Test accuracy:', accuracy_score(y_true=y_test,y_pred=y_pred))

Train accuracy: 0.999188640974
Test accuracy: 0.53102189781


In [72]:
tn,fp,fn,tp = confusion_matrix(y_pred=y_pred,y_true=y_test).ravel()

In [73]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[458, 369],
       [402, 415]])

In [74]:
print('Recall:', tp/(tp+fn))
print('Precision:', tp/(tp+fp))

Recall: 0.507955936353
Precision: 0.529336734694


In [75]:
features[np.argsort(rf_cv.feature_importances_)[::-1][:5]]

Index(['XLogP', 'MW', 'WBN_LP_H_0.25', 'PSA', 'WBN_GC_H_0.25'], dtype='object')

In [31]:
# %%bash 
# pwd # print working directory 

/home/yating


### SVM

In [46]:
highest_cross_val_accu = -1
best_c = None
indices = range(x_train.shape[0])
c = [1e-7,1e-5,1e-3,1,10,100]
kf = KFold(n_splits=5, shuffle=True, random_state=28584096)

for c_ in c:
    accuracies = []
    for train_indices, val_indices in kf.split(indices):
        x_tr = x_train[train_indices]
        y_tr = y_train[train_indices]
        svm = SVC(kernel='rbf',C=c_)
        svm.fit(x_tr,y_tr)
        pred = svm.predict(x_train[val_indices])
        acc = accuracy_score(y_pred=pred,y_true=y_train[val_indices])
        accuracies.append(acc)
    cross_val_acc = np.mean(accuracies)
    print('C:', c_, ' cross validation accuracy:', cross_val_acc)
    if cross_val_acc > highest_cross_val_accu:
        highest_cross_val_accu = cross_val_acc
        best_c = c_
print('Best c:', best_c, '; cross-validation accuracy:',highest_cross_val_accu)

C: 1e-07  cross validation accuracy: 0.67362962963
C: 1e-05  cross validation accuracy: 0.67362962963
C: 0.001  cross validation accuracy: 0.67362962963
C: 1  cross validation accuracy: 0.666518518519
C: 10  cross validation accuracy: 0.618074074074
C: 100  cross validation accuracy: 0.566074074074
Best c: 1e-07 ; cross-validation accuracy: 0.67362962963


### RF:  Drug Bank

In [47]:
drugbank = pd.read_csv('DrugBank_MV.csv',dtype={'Row':'str'})
drugid = np.array(drugbank.iloc[:,0].dropna(axis=0))
drugbank = drugbank.iloc[:,1:180]
drugbank = np.array(drugbank.dropna(axis=0))
print(drugbank.shape)

(8722, 179)


In [48]:
drug_pred = rf_cv.predict(drugbank)
print(Counter(drug_pred))
# predicted active molucule
drugid_rf = drugid[np.where(drug_pred==1)]

Counter({0: 8104, 1: 618})


### SVM:  Drug Bank

In [49]:
drug_pred_svm = svm.predict(drugbank)
print(Counter(drug_pred_svm))
drugid_svm = drugid[np.where(drug_pred_svm==1)]

Counter({0: 6264, 1: 2458})


In [57]:
drug_name_rf = [drug_map[idx] for idx in drugid_rf]

In [58]:
drug_name_svm = [drug_map[idx] for idx in drugid_svm]

In [59]:
name = [set(drug_name_svm) & set(drug_name_rf)]