In [6]:
from sklearn.ensemble import VotingClassifier, StackingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score

# Read the raw table from the CSV file.
data = pd.read_csv('Cancer_Data.csv')

# Binary target: 1 where the diagnosis column equals 'M', 0 everywhere else.
y = data['diagnosis'].eq('M').astype(int).to_numpy()

# Predictors: every column except the label, 'id' and 'Unnamed: 32'.
X = data.drop(columns=['diagnosis', 'id', 'Unnamed: 32'])
features = X.columns

# Seeded 80/20 split, stratified on y so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [7]:
# Base learners that the ensemble cells below combine. Each one is fitted on
# the training split here so it can also be inspected on its own.

# AdaBoost over decision stumps (depth-1 trees). random_state is set so the
# fit is reproducible, matching the seeded XGBoost and bagging models.
ada = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    learning_rate=1,
    n_estimators=190,
    random_state=1,
).fit(X_train, y_train)

# Gradient-boosted stumps; scale_pos_weight > 1 up-weights the positive class.
xg = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=1,
    gamma=0.25,
    learning_rate=0.1,
    max_depth=1,
    n_estimators=400,
    reg_lambda=0.01,
    scale_pos_weight=1.5,
).fit(X_train, y_train)

# Bagged full-depth trees on bootstrap samples and bootstrap feature subsets.
# The tree is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` (deprecated in scikit-learn 1.2, removed in
# 1.4), and the positional form works on both sides of that rename.
bag = BaggingClassifier(
    DecisionTreeClassifier(random_state=1),
    n_estimators=200,
    random_state=1,
    max_features=0.5,
    max_samples=1.0,
    n_jobs=-1,
    bootstrap=True,
    bootstrap_features=True,
).fit(X_train, y_train)





In [9]:
# Voting ensemble: soft voting averages the predicted class probabilities of all models
en = VotingClassifier(estimators=[('xgb', xg), ('ada', ada), ('bag', bag)], voting='soft')
en.fit(X_train, y_train)

# Probability of the positive class (column 1) on the training split.
prob = en.predict_proba(X_train)[:, 1]

# Select an optimal training threshold: score every cut-off from 0.01 to 1.00.
# NOTE(review): the sweep is scored on the same data the ensemble was fitted
# on, so these numbers are optimistic; consider a validation split or
# cross-validated probabilities for threshold selection.
# Rows are collected in a list and turned into a frame once, instead of
# concatenating onto a DataFrame on every iteration.
rows = []
for t in np.arange(0.01, 1.01, 0.01):
    pred = np.where(prob > t, 1, 0)
    accuracy = accuracy_score(y_train, pred)
    recall = recall_score(y_train, pred)
    rows.append({'Threshold': t, 'Accuracy': accuracy, 'Recall': recall})

results = pd.DataFrame(rows, columns=['Threshold', 'Accuracy', 'Recall'])

# Best recall first, ties broken by accuracy.
results.sort_values(['Recall', 'Accuracy'], ascending=False)





Unnamed: 0,Threshold,Accuracy,Recall
30,0.31,1.000000,1.0
31,0.32,1.000000,1.0
32,0.33,1.000000,1.0
33,0.34,1.000000,1.0
34,0.35,1.000000,1.0
...,...,...,...
95,0.96,0.626374,0.0
96,0.97,0.626374,0.0
97,0.98,0.626374,0.0
98,0.99,0.626374,0.0


In [11]:
# Decision threshold picked from the threshold sweep on the training predictions.
thr = 0.31
# Probability of the positive class (column 1) on the held-out test split.
y_pred_prob = en.predict_proba(X_test)[:,1]

# Turn probabilities into 0/1 labels using the chosen threshold.
y_pred = y_pred_prob > thr
y_pred = y_pred.astype(int)

# scikit-learn metrics take (y_true, y_pred); both calls use that order.
# Accuracy is printed as a percentage, recall as a fraction.
print("Accuracy: ",accuracy_score(y_test, y_pred)*100)
print("Recall: ", recall_score(y_test, y_pred))

Accuracy:  96.49122807017544
Recall:  0.9523809523809523


In [12]:
# Stacking ensemble: a final estimator is trained on the base models'
# predictions (no final_estimator is passed, so scikit-learn's default is used)
en = StackingClassifier(estimators=[('xgb', xg), ('ada', ada), ('bag', bag)])
en.fit(X_train, y_train)

# Probability of the positive class (column 1) on the training split.
prob = en.predict_proba(X_train)[:, 1]

# Select an optimal training threshold: score every cut-off from 0.01 to 1.00.
# NOTE(review): the sweep is scored on the same data the ensemble was fitted
# on, so these numbers are optimistic; consider a validation split or
# cross-validated probabilities for threshold selection.
# Rows are collected in a list and turned into a frame once, instead of
# concatenating onto a DataFrame on every iteration.
rows = []
for t in np.arange(0.01, 1.01, 0.01):
    pred = np.where(prob > t, 1, 0)
    accuracy = accuracy_score(y_train, pred)
    recall = recall_score(y_train, pred)
    rows.append({'Threshold': t, 'Accuracy': accuracy, 'Recall': recall})

results = pd.DataFrame(rows, columns=['Threshold', 'Accuracy', 'Recall'])

# Best recall first, ties broken by accuracy.
results.sort_values(['Recall', 'Accuracy'], ascending=False)

















Unnamed: 0,Threshold,Accuracy,Recall
12,0.13,1.000000,1.000000
13,0.14,1.000000,1.000000
14,0.15,1.000000,1.000000
15,0.16,1.000000,1.000000
16,0.17,1.000000,1.000000
...,...,...,...
95,0.96,0.958242,0.888235
96,0.97,0.914286,0.770588
97,0.98,0.626374,0.000000
98,0.99,0.626374,0.000000


In [21]:
# Decision threshold picked from the threshold sweep on the training predictions.
thr = 0.13
# Probability of the positive class (column 1) on the held-out test split.
y_pred_prob = en.predict_proba(X_test)[:,1]

# Turn probabilities into 0/1 labels using the chosen threshold.
y_pred = y_pred_prob > thr
y_pred = y_pred.astype(int)

# scikit-learn metrics take (y_true, y_pred); both calls use that order.
# Accuracy is printed as a percentage, recall as a fraction.
print("Accuracy: ",accuracy_score(y_test, y_pred)*100)
print("Recall: ", recall_score(y_test, y_pred))

Accuracy:  96.49122807017544
Recall:  0.9523809523809523
