In [1]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.metrics import roc_curve,roc_auc_score
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
# Load the molecule matrix CSV, forcing the CID column to be parsed as strings.
molecule_matrix = pd.read_csv(
    './Dataset/Molecule_matrix_mold2.csv',
    dtype={'CID': 'str'},
)

In [3]:
# Pull the identifier and label columns out as standalone NumPy arrays.
CID, label = (np.array(molecule_matrix[col]) for col in ('CID', 'outcome'))

# The feature block is every column except the first and the last one.
molecule_matrix_ = np.array(molecule_matrix.iloc[:, 1:-1])

In [None]:
class baggingXgboost:
    """Bagging ensemble of XGBoost binary classifiers for imbalanced data.

    Each ensemble member is trained on a random under-sample of the majority
    class (label 0) combined with all minority rows (label 1). Predictions
    are combined by majority vote over the members' thresholded outputs.

    Parameters
    ----------
    x : array-like
        Training feature matrix.
    y : array-like
        Training labels; the code treats 1 as the minority class and 0 as
        the majority class.
    boost_size : int, default 10
        Maximum number of trained boosters used when predicting.
    """

    def __init__(self, x, y, boost_size=10):
        self.x = x
        self.y = y
        self.boost_size = boost_size
        # Trained boosters, appended one per call to `train`.
        self.boost = []

    def underSampleMajority(self, majority_multiplier=10, random_state=None):
        """Draw a random under-sample of the stored training data.

        All label-1 rows are kept and at most ``majority_multiplier`` times
        as many label-0 rows are drawn without replacement.

        Parameters
        ----------
        majority_multiplier : int or float, default 10
            Desired ratio of label-0 rows to label-1 rows in the sample.
        random_state : int, RandomState or None, default None
            Forwarded to ``RandomUnderSampler``. Leave as ``None`` (or vary
            it per call) when bagging: a fixed value makes every call return
            the same sample.

        Returns
        -------
        (x_resample, y_resample)
            The resampled features and labels.

        Raises
        ------
        KeyError
            If ``self.y`` contains no label-1 rows.
        """
        counts = dict(Counter(self.y))
        n_minority = counts[1]
        # Never ask for more majority rows than actually exist; an
        # under-sampler cannot grow a class beyond its original size.
        n_majority = min(int(majority_multiplier * n_minority), counts.get(0, 0))
        ratio = {1: n_minority, 0: n_majority}
        # NOTE(review): `ratio=` and `fit_sample` are the names used by older
        # imbalanced-learn releases; newer releases call them
        # `sampling_strategy=` and `fit_resample`. Kept as-is to match the
        # environment this notebook was written for.
        rus = RandomUnderSampler(ratio=ratio, replacement=False,
                                 random_state=random_state)
        x_resample, y_resample = rus.fit_sample(self.x, self.y)
        return x_resample, y_resample

    def train(self, x, y):
        """Train one booster on ``(x, y)`` and append it to ``self.boost``.

        ``scale_pos_weight`` is set to the negative/positive count ratio of
        ``y`` so the positive class is up-weighted accordingly.
        """
        y = np.asarray(y)
        dtrain = xgb.DMatrix(x, y)
        scale = float(np.sum(y == 0) / np.sum(y == 1))
        param = {'gamma': 0.8, 'max_depth': 6, 'objective': 'binary:logistic',
                 'eval_metric': 'auc', 'scale_pos_weight': scale}
        num_round = 25
        bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)
        self.boost.append(bst)

    def predict(self, test, threshold=0.4):
        """Majority-vote prediction over the trained boosters.

        Parameters
        ----------
        test : xgb.DMatrix
            Data to score; must provide ``get_label()``.
        threshold : float, default 0.4
            A booster votes "positive" for a row when its predicted
            probability is strictly greater than this value.

        Returns
        -------
        (true, pred_result)
            ``true`` is ``test.get_label()``; ``pred_result`` is a float
            array of 0/1 where 1 means at least half of the boosters voted
            positive.

        Raises
        ------
        ValueError
            If no booster has been trained yet.
        """
        # Use at most `boost_size` boosters, but tolerate fewer being trained.
        models = self.boost[:self.boost_size]
        if not models:
            raise ValueError('no trained boosters available; call train() first')

        true = test.get_label()

        # One row of 0/1 votes per booster. The output size is taken from the
        # predictions themselves, not from any variable outside this method.
        votes = np.array([np.asarray(model.predict(test)) > threshold
                          for model in models], dtype=float)

        col_sum = np.sum(votes, axis=0)
        pred_result = (col_sum >= len(models) / 2.0).astype(float)

        return true, pred_result

In [None]:
# Shuffle the rows once so the held-out split below is a random sample.
# A fixed seed makes the shuffle (and therefore the train/test split)
# identical on every Restart & Run All.
SPLIT_SEED = 42
rand_indx = np.random.RandomState(SPLIT_SEED).permutation(molecule_matrix_.shape[0])
molecule_matrix_rand = molecule_matrix_[rand_indx]
label_rand = label[rand_indx]

In [None]:
# Hold out the first 150 positive rows (in shuffled order) ...
post_indx = np.where(label_rand == 1)[0][:150]
# ... and the first 1500 negative rows; kept as a list because it is
# extended in a later cell.
neg_indx = list(
    np.where(label_rand == 0)[0][:1500]
)

In [None]:
# Add the held-out positive rows to the index list. Despite its name,
# `neg_indx` holds the full test-set index (negatives + positives) after
# this cell. Indices already present are skipped so that re-running the
# cell does not duplicate test rows.
already_selected = set(neg_indx)
neg_indx.extend(p for p in post_indx if p not in already_selected)

In [None]:
# Materialise the held-out features and labels from the shuffled arrays.
x_test, y_test = molecule_matrix_rand[neg_indx], label_rand[neg_indx]

In [None]:
# Boolean mask over all shuffled rows: True = training row.
train_indx = np.ones(molecule_matrix_rand.shape[0], dtype=bool)
# Everything selected for the test set is excluded from training.
train_indx[neg_indx] = False
x_train, y_train = molecule_matrix_rand[train_indx], label_rand[train_indx]

In [None]:
# Wrap the held-out set in the DMatrix container that the boosters consume.
test = xgb.DMatrix(data=x_test, label=y_test)

In [None]:
# Number of boosters in the bagging ensemble.
bsize = 20
bx = baggingXgboost(
    x=x_train,
    y=y_train,
    boost_size=bsize,
)

In [None]:
# Train one booster per round, each on a fresh under-sample of the
# training data.
for bag_round in range(bsize):
    x_resample, y_resample = bx.underSampleMajority()
    bx.train(x=x_resample, y=y_resample)

In [None]:
# Score the held-out set: `true` are its labels, `pred` the ensemble's
# majority-vote 0/1 predictions.
true, pred = bx.predict(test=test)

In [None]:
# Pin the label order so the matrix is always 2x2 (tn, fp, fn, tp), even if
# only one class happens to appear in `true` and `pred`.
tn, fp, fn, tp = confusion_matrix(y_true=true, y_pred=pred, labels=[0, 1]).ravel()

# Both rates are fractions of the actual positives; report NaN rather than
# dividing by zero when the evaluated set contains no positives.
n_positive = tp + fn
tpr = tp / n_positive if n_positive else float('nan')
fnr = fn / n_positive if n_positive else float('nan')
print('True positive rate:', tpr)
print('False negative rate:', fnr)