In [28]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from collections import Counter
import pandas as pd
import numpy as np
import xgboost as xgb

In [29]:
molecule_matrix = pd.read_csv('./Dataset/Molecule_matrix_mold2.csv',dtype={'CID':'str'})

In [30]:
CID = np.array(molecule_matrix['CID'])
label = np.array(molecule_matrix['outcome'])
molecule_matrix_ = np.array(molecule_matrix.iloc[:,1:molecule_matrix.shape[1]-1])

In [31]:
label[label=='active']=1
label[label=='inactive']=0
label = label.astype('int')

In [32]:
print(molecule_matrix_.shape)
print(Counter(label))

(302630, 648)
Counter({0: 301747, 1: 883})


#### Bagging

In [33]:
x_train,x_test,y_train,y_test = train_test_split(molecule_matrix_,label,test_size = 0.2)

In [34]:
def xgboost_bagging(matrix,label,n_estimator,neg):
    boost = []
    for i in range(n_estimator):
        pos = np.sum(label==1)
        ratio = {0:neg,1:pos}
        rus = RandomUnderSampler(ratio=ratio)
        x_resample,y_resample = rus.fit_sample(matrix,label)
        
        train = xgb.DMatrix(x_resample,y_resample)
        
        scale = float(np.sum(y_resample==1)/np.sum(y_resample==0))
        param = {'gamma':0.8,'max_depth':6,'objective':'binary:logistic','eval_metric':'auc','eta':0.8,'scale_pos_weight':scale}
        num_round = 20
        
        bst = xgb.train(params=param, dtrain=train,num_boost_round=num_round)
        boost.append(bst)  
    return boost

In [46]:
def predict(estimators, x_test,y_test):
    final_pred = np.zeros((10,len(y_test)))
    x_test = xgb.DMatrix(x_test,y_test)
    bagging_pred = []
    
    for i, est in enumerate(estimators):
        pred = est.predict(x_test)
        pred_ = np.array([1 if i>0.4 else 0 for i in pred])
        final_pred[i] = pred_
    
    for test in range(final_pred.shape[1]):
        s = np.sum(final_pred[:,test])
        if s > 6: 
            s = 1
        else:
            s = 0
        bagging_pred.append(s)
    return bagging_pred

#### XGBoost

In [36]:
t = xgboost_bagging(x_train,y_train,10,7000)

In [47]:
t_pred = predict(t,x_test,y_test)

In [49]:
len(t_pred)

60526

In [51]:
np.mean(t_pred!=y_test)

0.0035026269702276708

In [53]:
confusion_matrix(y_true=y_test,y_pred=t_pred)

array([[60310,    26],
       [  186,     4]])

In [113]:
# tn,fp,fn,tp = confusion_matrix(y_true=label,y_pred=preds_).ravel()

In [26]:
# print('Precision=', tp/(tp+fp))
# print('Recall:',tp/(tp+fn))
# print('False negative:',fn/(tp+fn))