##### Source:
imblearn Documentation
- http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.ensemble.BalancedBaggingClassifier.html

In [1]:
import zipfile
import pandas as pd
import numpy as np
import glob
import scipy.sparse
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from tempfile import TemporaryFile
%matplotlib inline

In [2]:
molecule_descriptor_files = []
for file in glob.glob('./Dataset/AID_2289_descriptors_part[0-9]*.csv'):
    molecule_descriptor_files.append(file)

#### Use counter-screen (AID: 588342) to filter out false positive

Counter-screen: A screen performed in parallel with or after the primary screen. The assay used in the counter-screen is developed to identify compounds that have the potential to interfere with the assay used in the primary screen (the primary assay).

Identify the overlapping CID between 588342 active and 2289 active: 2,399 molecules

In [3]:
active_588342 = pd.read_csv('./Dataset/active_588342.csv',dtype={'CID':'str'})
active_2289 = pd.read_csv('./Dataset/active_2289.csv',dtype={'CID':'str'})

In [4]:
false_positive_cid = list(set(active_2289['CID']) & set(active_588342['CID']))

In [5]:
len(false_positive_cid)

2399

In [6]:
# only one file contains active cases
molecule_matrix = pd.read_csv(molecule_descriptor_files[9],dtype={'CID':'str','SID':'str'})
features = molecule_matrix.columns[2:181] # feature names except the molecule ID

In [7]:
filterout_index = []
for idx in range(molecule_matrix.shape[0]):
    cid = molecule_matrix.iloc[idx,:]['CID']
    if cid in false_positive_cid:
        filterout_index.append(idx)

In [8]:
ind = np.ones(molecule_matrix.shape[0]).astype('bool')
ind[filterout_index] = False
molecule_matrix = molecule_matrix.iloc[ind,:]
molecule_matrix = np.array(molecule_matrix)

In [9]:
for file in molecule_descriptor_files[:9]:
    f = pd.read_csv(file,dtype={'CID':'str','SID':'str'})
    f = np.array(f.dropna(axis=0)) # drop rows with NA 
    molecule_matrix = np.vstack((molecule_matrix,f))

In [10]:
label = molecule_matrix[:,molecule_matrix.shape[1]-1]
label[label=='active']=1
label[label=='inactive']=0
label = label.astype(int)

In [11]:
print(Counter(label))

Counter({0: 304501, 1: 884})


In [12]:
moleculeCID = molecule_matrix[:,0]

#### Construct the whole matrix

In [13]:
molecule_matrix = molecule_matrix[:,2:181]

In [14]:
molecule_matrix.shape

(305385, 179)

In [15]:
molecule_matrix = molecule_matrix.astype('float')

In [16]:
print('Number of molecule ID:',len(moleculeCID))
print('Number of label:',len(label))
print('Number of features:',len(features))
print('Shape:',molecule_matrix.shape)

Number of molecule ID: 305385
Number of label: 305385
Number of features: 179
Shape: (305385, 179)


#### Remove attributes of no variation

In [17]:
remove_col = []
for col in range(molecule_matrix.shape[1]):
    temp_sum = np.sum(molecule_matrix[:,col])
    if temp_sum == 0 or temp_sum == molecule_matrix.shape[0]:
        remove_col.append(col)

In [18]:
remove = np.ones(molecule_matrix.shape[1]).astype('bool')
remove[np.array(remove_col)] = False
molecule_matrix = molecule_matrix[:,remove]

In [19]:
print('After removing attributes without any variation:', molecule_matrix.shape)

After removing attributes without any variation: (305385, 158)


### Split the train and test data
train = 0.8; test = 0.2

In [20]:
x_train,x_test,y_train,y_test = train_test_split(molecule_matrix,label,test_size=0.2)

In [21]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(244308, 158)
(244308,)
(61077, 158)
(61077,)


In [22]:
ytrain = dict(Counter(y_train))
ytest = dict(Counter(y_test))

Ratio of active to inactive in training/test:

In [23]:
print('active ratio in training:', ytrain[1]/(ytrain[0]+ytrain[1]))
print('active ratio in test:', ytest[1]/(ytest[0]+ytest[1]))

active ratio in training: 0.0029634723381960475
active ratio in test: 0.0026196440558639094


### Handling the imbalance

##### 1). ensemble of samplers

In [24]:
from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(ratio='auto',replacement=False, random_state=0,base_estimator=RandomForestClassifier())
bbc.fit(x_train,y_train)

BalancedBaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
             bootstrap=True, bootstrap_features=False, max_features=1.0,
             max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
             random_state=0, ratio='auto', replacement=False, verbose=0,
             warm_start=False)

In [25]:
y_pred = bbc.predict(x_test)

In [26]:
confusion_matrix(y_true=y_test,y_pred=y_pred)

array([[50591, 10326],
       [  133,    27]])

In [27]:
print(accuracy_score(y_pred=y_pred,y_true=y_test))

0.828757142623


### Balanced Bagging Classifier - CV

In [28]:
from imblearn.ensemble import BalancedBaggingClassifier
highest_cross_val_accu = -1
best_t = None
indices = range(x_train.shape[0])
tree = np.arange(10,500,50)
kf = KFold(n_splits=5,shuffle=True,random_state=28584096)

for n_tree in tree:
    accuracies = []
    for train_indices, val_indices in kf.split(indices):
        x_tr = x_train[train_indices]
        y_tr = y_train[train_indices]
        bbc = BalancedBaggingClassifier(ratio='auto',
                                        replacement=False, 
                                        base_estimator=RandomForestClassifier(n_estimators=n_tree))
        bbc.fit(x_tr,y_tr)
        pred = bbc.predict(x_train[val_indices])
        acc = accuracy_score(y_true=y_train[val_indices],y_pred=pred)
        accuracies.append(acc)
    cross_val_acc = np.mean(accuracies)
    print('N_tree: ', n_tree, 'cross validation accuracy:', cross_val_acc)
    if cross_val_acc > highest_cross_val_accu:
        highest_cross_val_accu = cross_val_acc
        best_t = n_tree
print('Best N_tree: ',best_t, '; cross-validation accuracy: ',highest_cross_val_accu)

N_tree:  10 cross validation accuracy: 0.816457055114
N_tree:  60 cross validation accuracy: 0.853042030331
N_tree:  110 cross validation accuracy: 0.854327331095
N_tree:  160 cross validation accuracy: 0.856054641403
N_tree:  210 cross validation accuracy: 0.860610427532
N_tree:  260 cross validation accuracy: 0.861752309439
N_tree:  310 cross validation accuracy: 0.860024972324
N_tree:  360 cross validation accuracy: 0.855309588378
N_tree:  410 cross validation accuracy: 0.861154792305
N_tree:  460 cross validation accuracy: 0.860565338934
Best N_tree:  260 ; cross-validation accuracy:  0.861752309439


In [31]:
bbc = BalancedBaggingClassifier(ratio='auto',replacement=False, base_estimator=RandomForestClassifier(n_estimators=best_t))
bbc.fit(x_train,y_train)
pred = bbc.predict(x_test)

In [37]:
confusion_matrix(y_pred=pred,y_true=y_test)

array([[53405,  7512],
       [  137,    23]])

In [32]:
print(accuracy_score(y_pred=pred,y_true=y_test))

0.874764641354


In [33]:
tn,fp,fn,tp = confusion_matrix(y_pred=pred,y_true=y_test).ravel()

In [36]:
# print('Recall:', tp/(tp+fn))
# print('Precision:', tp/(tp+fp))

In [35]:
print('False Positive Rate:',fp/(fp+tn))
# print('True Positive Rate:',tp/(tp+fn))
print('False Negative Rate:',fn/(tp+fn))

False Positive Rate: 0.123315330696
False Negative Rate: 0.85625
