##### Source:
imblearn Documentation
- https://github.com/scikit-learn-contrib/imbalanced-learn 
- http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html

In [1]:
import zipfile
import pandas as pd
import numpy as np
import glob
from scipy import sparse
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from tempfile import TemporaryFile
%matplotlib inline

In [2]:
molecule_descriptor_files = []
for file in glob.glob('./Dataset/AID_2289_descriptors_part[0-9]*.csv'):
    molecule_descriptor_files.append(file)

### Drug Bank

In [3]:
# drug_link = pd.read_csv('./Dataset/drug_links.csv',dtype={'PubChem Compound ID':'str'})

In [4]:
# drug_link = drug_link.dropna(subset=['PubChem Compound ID'])

In [5]:
# drug_map = dict(zip(drug_link['PubChem Compound ID'],drug_link['Name']))

### Prepare training data

#### Use counter-screen (AID: 588342) to filter out false positive

Counter-screen: A screen performed in parallel with or after the primary screen. The assay used in the counter-screen is developed to identify compounds that have the potential to interfere with the assay used in the primary screen (the primary assay).

- Identify the overlapping CID between 588342 active and 2289 active: 2,399 molecules

In [6]:
active_588342 = pd.read_csv('./Dataset/active_588342.csv',dtype={'CID':'str'})
active_2289 = pd.read_csv('./Dataset/active_2289.csv',dtype={'CID':'str'})

In [7]:
false_positive_cid = list(set(active_2289['CID']) & set(active_588342['CID']))

In [8]:
len(false_positive_cid)

2399

In [9]:
# false_positive_cid_2 = list(set(active_2289['CID']) & set(inactive_588342['CID']))
# len(false_positive_cid_2)

In [10]:
# only first file contains active cases
molecule_matrix = pd.read_csv(molecule_descriptor_files[9],dtype={'CID':'str','SID':'str'})
features = molecule_matrix.columns[2:181] # feature names except the first column (molecule ID)
# molecule_matrix = np.array(molecule_matrix.dropna(axis=0))

In [11]:
filterout_index = []
for idx in range(molecule_matrix.shape[0]):
    cid = molecule_matrix.iloc[idx,:]['CID']
    if cid in false_positive_cid:
        filterout_index.append(idx)

In [12]:
ind = np.ones(molecule_matrix.shape[0]).astype('bool')
ind[filterout_index] = False
molecule_matrix = molecule_matrix.iloc[ind,:]
molecule_matrix = np.array(molecule_matrix)

In [13]:
for file in molecule_descriptor_files[:9]:
    f = pd.read_csv(file,dtype={'CID':'str','SID':'str'})
    f = np.array(f.dropna(axis=0)) # drop rows with NA 
    molecule_matrix = np.vstack((molecule_matrix,f))

In [14]:
label = molecule_matrix[:,molecule_matrix.shape[1]-1]
label[label=='active']=1
label[label=='inactive']=0
label = label.astype(int)

In [15]:
print(Counter(label))

Counter({0: 304501, 1: 884})


In [16]:
moleculeCID = molecule_matrix[:,0]

#### Construct the whole matrix

In [17]:
molecule_matrix

array([['329167', '843278', 0, ..., 0, 1, 1],
       ['421162', '844850', 0, ..., 1, 1, 1],
       ['647744', '845402', 0, ..., 0, 0, 1],
       ..., 
       ['2912871', '7978128', 0, ..., 0, 2, 0],
       ['740861', '7978129', 0, ..., 0, 1, 0],
       ['5402140', '7978130', 0, ..., 0, 3, 0]], dtype=object)

In [18]:
molecule_matrix = molecule_matrix[:,2:181]

In [19]:
molecule_matrix.shape

(305385, 179)

In [20]:
molecule_matrix = molecule_matrix.astype('float')

In [21]:
print('Number of molecule ID:',len(moleculeCID))
print('Number of label:',len(label))
print('Number of features:',len(features))
print('Shape:',molecule_matrix.shape)

Number of molecule ID: 305385
Number of label: 305385
Number of features: 179
Shape: (305385, 179)


#### Remove attributes of no variation

In [22]:
remove_col = []
for col in range(molecule_matrix.shape[1]):
    temp_sum = np.sum(molecule_matrix[:,col])
    if temp_sum == 0 or temp_sum == molecule_matrix.shape[0]:
        remove_col.append(col)

In [23]:
remove = np.ones(molecule_matrix.shape[1]).astype('bool')
remove[np.array(remove_col)] = False
molecule_matrix = molecule_matrix[:,remove]

In [24]:
print('After removing attributes without any variation:', molecule_matrix.shape)

After removing attributes without any variation: (305385, 158)


### Split the train and test data

In [25]:
# x_train,x_test,y_train,y_test = train_test_split(molecule_matrix,label,test_size=0.2)

In [26]:
# print(x_train.shape)
# print(y_train.shape)
# print(x_test.shape)
# print(y_test.shape)

In [27]:
# ytrain = dict(Counter(y_train))
# ytest = dict(Counter(y_test))

In [28]:
# print('active ratio in training:', ytrain[1]/(ytrain[0]+ytrain[1]))
# print('active ratio in test:', ytest[1]/(ytest[0]+ytest[1]))

In [29]:
# test_idx = np.random.permutation(molecule_matrix.shape[0])[:int(molecule_matrix.shape[0]*0.2)]
# mask = np.ones(molecule_matrix.shape[0]).astype('bool')
# mask[test_idx] = False
# train = molecule_matrix[mask,:]
# test = molecule_matrix[test_idx,:] 

In [30]:
# train_pd = pd.DataFrame(train)
# test_pd = pd.DataFrame(test)

In [31]:
# train_pd.to_csv('training.csv',index=False,index_label=False)
# test_pd.to_csv('test.csv',index=False,index_label=False)

#### Export files for cost sensitive classifier

In [32]:
# import csv2arff

In [33]:
# %%bash
# csv2arff training.csv training.arff
# csv2arff test.csv test.arff
# np.save('molecule_matrix.npy',molecule_matrix)
# np.save('label.npy',label)
# np.save('molecule_CID',moleculeCID)
# np.save('feature',features)

### Handling the imbalance

##### 1). Under-sampling: Random majority under-sampling with replacement

In [34]:
%%time
from imblearn.under_sampling import RandomUnderSampler

ratio = {0:1000,1:884}
molecule_matrix = sparse.csr_matrix(molecule_matrix)
undersam = RandomUnderSampler(ratio=ratio,return_indices=True)
X_resample,y_resample, idx_resample = undersam.fit_sample(molecule_matrix,label)

CPU times: user 1.49 s, sys: 151 ms, total: 1.64 s
Wall time: 2.71 s


In [41]:
x_train,x_test,y_train,y_test = train_test_split(X_resample,y_resample)

In [42]:
print(Counter(y_train))
print(Counter(ytest))

Counter({0: 753, 1: 660})
Counter({0: 241, 1: 230})


### Random Forest Classifier

In [43]:
highest_cross_val_accu = -1
best_t = None
indices = range(x_train.shape[0])
tree = np.arange(10,500,50)
kf = KFold(n_splits=5,shuffle=True,random_state=28584096)

for n_tree in tree:
    accuracies = []
    for train_indices, val_indices in kf.split(indices):
        x_tr = x_train[train_indices]
        y_tr = y_train[train_indices]
        cf = RandomForestClassifier(n_estimators=n_tree)
        cf.fit(x_tr,y_tr)
        pred = cf.predict(x_train[val_indices])
        acc = accuracy_score(y_true=y_train[val_indices],y_pred=pred)
        accuracies.append(acc)
    cross_val_acc = np.mean(accuracies)
    print('N_tree: ', n_tree, 'cross validation accuracy:', cross_val_acc)
    if cross_val_acc > highest_cross_val_accu:
        highest_cross_val_accu = cross_val_acc
        best_t = n_tree
print('Best N_tree: ',best_t, '; cross-validation accuracy: ',highest_cross_val_accu)

N_tree:  10 cross validation accuracy: 0.513146881187
N_tree:  60 cross validation accuracy: 0.522329148184
N_tree:  110 cross validation accuracy: 0.533639074756
N_tree:  160 cross validation accuracy: 0.523735057514
N_tree:  210 cross validation accuracy: 0.532939879207
N_tree:  260 cross validation accuracy: 0.518775530662
N_tree:  310 cross validation accuracy: 0.531513921259
N_tree:  360 cross validation accuracy: 0.52869207829
N_tree:  410 cross validation accuracy: 0.5209182267
N_tree:  460 cross validation accuracy: 0.534335764228
Best N_tree:  460 ; cross-validation accuracy:  0.534335764228


In [44]:
rf_cv = RandomForestClassifier(n_estimators=best_t)
rf_cv.fit(x_train,y_train)
# pred_cv_train = rf_cv.predict(x_train)
y_pred = rf_cv.predict(x_test)
# print('Train accuracy:', accuracy_score(y_true=y_train,y_pred=pred_cv_train))
print('Test accuracy:', accuracy_score(y_true=y_test,y_pred=y_pred))

Test accuracy: 0.47983014862


In [45]:
tn,fp,fn,tp = confusion_matrix(y_pred=y_pred,y_true=y_test).ravel()

In [46]:
confusion_matrix(y_pred=y_pred,y_true=y_test)

array([[135, 112],
       [133,  91]])

In [47]:
print('Recall:', tp/(tp+fn))
print('Precision:', tp/(tp+fp))

Recall: 0.40625
Precision: 0.448275862069


In [48]:
print('False Positive Rate:',fp/(fp+tn))
# print('True Positive Rate:',tp/(tp+fn))
print('False Negative Rate:',fn/(tp+fn))

False Positive Rate: 0.453441295547
False Negative Rate: 0.59375


In [76]:
features[np.argsort(rf_cv.feature_importances_)[::-1][:5]]

Index(['HYP_07_HYP', 'WBN_GC_L_0.75', 'ARC_07_HYP', 'ARC_01_ARC',
       'ARC_03_ARC'],
      dtype='object')

### RF:  Drug Bank

In [47]:
drugbank = pd.read_csv('DrugBank_MV.csv',dtype={'Row':'str'})
drugid = np.array(drugbank.iloc[:,0].dropna(axis=0))
drugbank = drugbank.iloc[:,1:180]
drugbank = np.array(drugbank.dropna(axis=0))
print(drugbank.shape)

(8722, 179)


In [48]:
drug_pred = rf_cv.predict(drugbank)
print(Counter(drug_pred))
# predicted active molucule
drugid_rf = drugid[np.where(drug_pred==1)]

Counter({0: 8104, 1: 618})
