# Bagging + Boosting + Stacking Models


In [None]:
# By Alex Dance

https://www.linkedin.com/in/alex-dance/

# Step 2: Find the Data
### Blood Transfusion Service Center DataSet
- **Citation Request**

    This breast cancer databases was obtained from the **University of Wisconsin Hospitals**, **Madison** from **Dr. William H. Wolberg**. If you publish results when using this database, then please include this information in your acknowledgements.

- **Title**

    Wisconsin Breast Cancer Database (January 8, 1991)

- **Sources**
    - **Creator**
            Dr. WIlliam H. Wolberg (physician)
            University of Wisconsin Hospitals
            Madison, Wisconsin
            USA
    - **Donor**
            Olvi Mangasarian (mangasarian@cs.wisc.edu)
            Received by David W. Aha (aha@cs.jhu.edu)
    - **Date**
            15 July 1992
        
### UCI - Machine Learning Repository
- Center for Machine Learning and Intelligent Systems

The [**UCI Machine Learning Repository**](http://archive.ics.uci.edu/ml/about.html) is a collection of databases, domain theories, and data generators that are used by the machine learning community for the empirical analysis of machine learning algorithms.

In [1]:
# Names from  https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names

In [2]:
import pandas as pd
import numpy as np

# Get Data 

In [3]:
#The belwo data is from a different data set used previously which was https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [4]:
#Class 2 for benign, 4 for malignant

In [5]:
names = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"]

In [6]:
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data", header=None, names=names)

In [7]:
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [8]:
mapping = {2:0, 4:1}
df['Class'] = df['Class'].map(mapping)

In [9]:
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [10]:
df.isnull().sum()

Sample code number             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [11]:
y = df[['Class']]
X = df[['Clump Thickness', 'Uniformity of Cell Size' , 'Uniformity of Cell Shape']] # ‘Marginal Adhesion’, ‘Single Epithelial Cell Size’, ‘Bare Nuclei’, ‘Bland Chromatin’, ‘Normal Nucleoli’]]

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) # For uncleaned data

# Model for All

In [14]:
from sklearn import svm, metrics
from sklearn.metrics import confusion_matrix

In [15]:
class color:  # Testing to make the heading look a liitle more impressive
   BOLD = '\033[1m'

In [16]:
def find_all(y_test_f,X_test_f,model_f,X_train_f, y_train_f):
    model_f.fit(X_train_f, y_train_f)
    preds = model_f.predict_proba(X_test_f)[:,1]
    fpr, tpr, thresholds  = metrics.roc_curve(y_test_f, preds)
    roc_auc = metrics.auc(fpr, tpr)
    y_pred_f = model_f.predict(X_test_f)
    cf = confusion_matrix(y_test_f, y_pred_f)
    accuracy_score_train =  model_f.score(X_train_f, y_train_f) 
    accuracy_score_test =  model_f.score(X_test_f, y_test_f)    
    return{'auc': roc_auc, 'cfm':cf ,'accuracy_score_train':accuracy_score_train , 'accuracy_score_test':accuracy_score_test}

# Logistic Regresison Model to Kick off

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
LogisticRegressionModel = LogisticRegression()

In [19]:
LogisticRegressionModel.fit(X_train, y_train)
accuracy_score_train =  LogisticRegressionModel.score(X_train, y_train) 
accuracy_score_test =  LogisticRegressionModel.score(X_test, y_test)    

  y = column_or_1d(y, warn=True)


In [20]:
print(accuracy_score_train)

0.9499105545617174


In [21]:
print(accuracy_score_test)

0.9642857142857143


In [22]:
LR_Results= find_all(y_test, X_test ,LogisticRegression(), X_train, y_train)
print(color.BOLD +"Logistic Regression Results ")
print ('\033[0m')

print("Logistic Regression AUC Test  %.2f%%" % (LR_Results['auc']* 100.0))
print(LR_Results['cfm'])
print("Logistic Regression accuracy_score_train  %.2f%%" % (LR_Results ['accuracy_score_train'] * 100.0))
print("Logistic Regression accuracy_score_test  %.2f%%" % (LR_Results ['accuracy_score_test']* 100.0))

[1mLogistic Regression Results 
[0m
Logistic Regression AUC Test  99.12%
[[90  2]
 [ 3 45]]
Logistic Regression accuracy_score_train  94.99%
Logistic Regression accuracy_score_test  96.43%


  y = column_or_1d(y, warn=True)


# Bagging 

In [23]:
from sklearn.ensemble import BaggingClassifier

In [24]:
# Could do it through the below lines of code
#Bag_Model = BaggingClassifier()

In [25]:
#Bag_Model.fit(X_train,y_train)

In [26]:
#print("Bag train score ", Bag_Model.score(X_train, y_train))
#print("Bag test score ", Bag_Model.score(X_test, y_test))

In [27]:
Bag_Results= find_all(y_test, X_test ,BaggingClassifier(), X_train, y_train)
print(color.BOLD +"Bag_Resultsn Results ")
print ('\033[0m')

print("Bag_ResultsAUC Test  %.2f%%" % (Bag_Results['auc']* 100.0))
print(Bag_Results['cfm'])
print("Bag_Results accuracy_score_train  %.2f%%" % (Bag_Results ['accuracy_score_train'] * 100.0))
print("Bag_Resultsaccuracy_score_test  %.2f%%" % (Bag_Results ['accuracy_score_test']* 100.0))

  y = column_or_1d(y, warn=True)


[1mBag_Resultsn Results 
[0m
Bag_ResultsAUC Test  97.84%
[[90  2]
 [ 4 44]]
Bag_Results accuracy_score_train  98.39%
Bag_Resultsaccuracy_score_test  95.71%


# Boosting

In [28]:
from sklearn.ensemble import GradientBoostingClassifier

In [29]:
# Could have done it through the below lines of code
#Boost_Model = GradientBoostingClassifier()

In [30]:
#Boost_Model.fit(X_train,y_train)

In [31]:
#print("Boost train score ", Boost_Model.score(X_train, y_train))
#print("Boost test score ", Boost_Model.score(X_test, y_test))

In [32]:
Boost_Results= find_all(y_test, X_test ,GradientBoostingClassifier(), X_train, y_train)
print(color.BOLD +"Boost_Results Results ")
print ('\033[0m')

print("Boost_Results AUC Test  %.2f%%" % (Boost_Results['auc']* 100.0))
print(Boost_Results['cfm'])
print("Boost_Results accuracy_score_train  %.2f%%" % (Boost_Results ['accuracy_score_train'] * 100.0))
print("Boost_Results accuracy_score_test  %.2f%%" % (Boost_Results ['accuracy_score_test']* 100.0))

[1mBoost_Results Results 
[0m
Boost_Results AUC Test  99.00%
[[90  2]
 [ 3 45]]
Boost_Results accuracy_score_train  96.60%
Boost_Results accuracy_score_test  96.43%


  y = column_or_1d(y, warn=True)


# Stacking

In [33]:
from sklearn.ensemble import StackingClassifier

In [34]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingClassifier.html

In [35]:
#estimators = [('Boo_S', GradientBoostingClassifier()), ('Bag_S', BaggingClassifier(random_state=42))]
estimators = [('Boo_S', GradientBoostingClassifier()), ('Log', LogisticRegression(random_state=42))]

In [36]:
Stacking_Model = StackingClassifier(estimators=estimators)#, final_estimator=LogisticRegression)

In [37]:
Stacking_Model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


StackingClassifier(cv=None,
                   estimators=[('Boo_S',
                                GradientBoostingClassifier(ccp_alpha=0.0,
                                                           criterion='friedman_mse',
                                                           init=None,
                                                           learning_rate=0.1,
                                                           loss='deviance',
                                                           max_depth=3,
                                                           max_features=None,
                                                           max_leaf_nodes=None,
                                                           min_impurity_decrease=0.0,
                                                           min_impurity_split=None,
                                                           min_samples_leaf=1,
                                                           min_sample

In [38]:
print("Stack train score ", Stacking_Model.score(X_train, y_train))
print("Stack test score ", Stacking_Model.score(X_test, y_test))

Stack train score  0.9570661896243292
Stack test score  0.9642857142857143


In [39]:
#preds = Stacking_Model.predict_proba(X_test)
preds = Stacking_Model.predict(X_test)

In [40]:
fpr, tpr, thresholds  = metrics.roc_curve(y_test, preds)

In [41]:
roc_auc_stack = metrics.auc(fpr, tpr)

In [42]:
print("Stack AUC Test  %.2f%%" % (roc_auc_stack* 100.0))

Stack AUC Test  95.79%


# Now All together

In [43]:
print(color.BOLD +"AUC results ")
print ('\033[0m')
print("Logistic Regression AUC Test  %.2f%%" % (LR_Results['auc']* 100.0))
print("Bag_ResultsAUC Test  %.2f%%" % (Bag_Results['auc']* 100.0))
print("Boost_Results AUC Test  %.2f%%" % (Boost_Results['auc']* 100.0))
print("Stack AUC Test  %.2f%%" % (roc_auc_stack* 100.0))
print(" ")
print(" ")
print(color.BOLD +"Rest of results ")
print ('\033[0m')

print("Logistic Regression accuracy_score_train  %.2f%%" % (LR_Results ['accuracy_score_train'] * 100.0))
print("Logistic Regression accuracy_score_test  %.2f%%" % (LR_Results ['accuracy_score_test']* 100.0))
print(" ")
print("Bag_Results accuracy_score_train  %.2f%%" % (Bag_Results ['accuracy_score_train'] * 100.0))
print("Bag_Resultsaccuracy_score_test  %.2f%%" % (Bag_Results ['accuracy_score_test']* 100.0))
print(" ")
print("Boost_Results accuracy_score_train  %.2f%%" % (Boost_Results ['accuracy_score_train'] * 100.0))
print("Boost_Results accuracy_score_test  %.2f%%" % (Boost_Results ['accuracy_score_test']* 100.0))
print(" ")
print("Stack train score ", Stacking_Model.score(X_train, y_train))
print("Stack test score ", Stacking_Model.score(X_test, y_test))




[1mAUC results 
[0m
Logistic Regression AUC Test  99.12%
Bag_ResultsAUC Test  97.84%
Boost_Results AUC Test  99.00%
Stack AUC Test  95.79%
 
 
[1mRest of results 
[0m
Logistic Regression accuracy_score_train  94.99%
Logistic Regression accuracy_score_test  96.43%
 
Bag_Results accuracy_score_train  98.39%
Bag_Resultsaccuracy_score_test  95.71%
 
Boost_Results accuracy_score_train  96.60%
Boost_Results accuracy_score_test  96.43%
 
Stack train score  0.9570661896243292
Stack test score  0.9642857142857143


In [None]:
# by Alex Dance