In [3]:
#import necessary modules 
import pandas as pd 
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV 

In [5]:
#load dataset
cancer_df = pd.read_csv('datasets/datasets_180_408_data.csv', keep_default_na=False)
print(cancer_df.head())

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area_worst  smoothness

In [6]:
#create arrays for features and response variable
del cancer_df['Unnamed: 32']
y = cancer_df['diagnosis'].values
X = cancer_df.drop('diagnosis', axis=1).values

#Random seed for reproducibility
Seed = 1

In [7]:
# instantiate KNN classiffier with 6 neighbors, Logistic Regression and Decision Tree classifier
knn = KNN(n_neighbors=6)
log_reg = LR(random_state=Seed)
dt = DT(max_depth=6, random_state=Seed)

In [8]:
#split the data into 70% train and 30% test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = Seed)

In [9]:
# list of classifiers
classifiers = [('logistic regression', log_reg), ('nearest neighbor', knn),('decison tree', dt)]

In [10]:
# iterate over the predefined classifiers
for clf_name, clf in classifiers:
    
    #fit to the training data
    clf.fit(X_train, y_train)
    
    #predict the test set
    y_pred = clf.predict(X_test)
    
    #calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    #evaluate the classifiers accuracy on test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

logistic regression : 0.632
nearest neighbor : 0.772
decison tree : 0.930


In [11]:
# estimating better performance by Ensembling
vc = VotingClassifier(estimators=classifiers)

In [12]:
# fit vc to the training data
vc.fit(X_train, y_train)

VotingClassifier(estimators=[('logistic regression',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=1, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('nearest neighbor',
                              KNeighborsClassifier(algorithm='auto',
                                                   leaf_siz...
                              DecisionTreeClassifier(ccp_alpha=0.0,
                          

In [13]:
# predict the response variable usig vc 
y_pred = vc.predict(X_test)

In [14]:
# calculaete the accuracy 
vc_accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))

Voting Classifier: 0.930


In [15]:
# bootstrap as an ensemble model 
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=Seed, oob_score=True)

In [16]:
# fit the bagging classifier to training data
bc.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=6,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=1,
      

In [17]:
# predict the response variable
y_pred = bc.predict(X_test)

In [18]:
# test the accuracy
bc_accuracy = accuracy_score(y_test, y_pred)
print('Bagging Classifier: {:.3f}'.format(bc_accuracy))

Bagging Classifier: 0.953


In [19]:
#test the Out of the bag accuracy
oob_accuracy = bc.oob_score_
print('OOB Accuracy: {:.3f}'.format(oob_accuracy))

OOB Accuracy: 0.947


In [20]:
# Adaptive Boosting classifier
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=50, random_state=Seed)

In [21]:
# fit the ada classifier to training data
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=6,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [22]:
y_pred_prob = ada.predict_proba(X_test)[:,1]

In [23]:
ada_auc_roc = roc_auc_score(y_test, y_pred_prob)
print('ROC AUC Score: {:.3f}'.format(ada_auc_roc))


ROC AUC Score: 0.924
