In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")


  return f(*args, **kwds)


In [2]:
brc = datasets.load_breast_cancer()

In [3]:
y = brc.target
x = brc.data

In [4]:
x.shape, y.shape

((569, 30), (569,))

In [5]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y, random_state=9)

In [6]:
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((426, 30), (143, 30), (426,), (143,))

In [7]:
dt = DecisionTreeClassifier()

# Let's check the accuracy of individual classifier.

In [8]:
dt.fit(xtrain,ytrain)
ypred = dt.predict(xtest)
accuracy_score(ytest, ypred)


0.951048951048951

# Bagging Classifier

In [9]:
bgc = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    oob_score=False,
    warm_start=False,
    n_jobs=-1,
    random_state=9,
    verbose=0,
)

# Let's check the accuracy of Individual and the Bagging classifier

In [10]:
for clf in (dt, bgc):
    clf.fit(xtrain, ytrain)
    ypred = clf. predict(xtest)
    print(clf.__class__.__name__, accuracy_score(ytest, ypred))

DecisionTreeClassifier 0.965034965034965
BaggingClassifier 0.972027972027972


We have achived 97.2% accuracy using the Bagging Ensemble Learning

# Let's use RandomForest Classification

In [12]:
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=9,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None,
)

In [14]:
for clf in (dt, bgc, rf):
    clf.fit(xtrain, ytrain)
    ypred = clf. predict(xtest)
    print(clf.__class__.__name__, accuracy_score(ytest, ypred))

DecisionTreeClassifier 0.951048951048951
BaggingClassifier 0.972027972027972
RandomForestClassifier 0.972027972027972


As we can see Accuracy is same for both.

# Feature Importance

In [17]:
rf.feature_importances_

array([0.05642484, 0.01607375, 0.04593491, 0.03754688, 0.00706111,
       0.01281772, 0.03679751, 0.09600027, 0.00433564, 0.00385631,
       0.01251585, 0.00501863, 0.01699092, 0.04841115, 0.00309128,
       0.0045569 , 0.00630307, 0.00431927, 0.00337058, 0.0058418 ,
       0.12050279, 0.02078284, 0.1222515 , 0.11410986, 0.01446155,
       0.01472394, 0.02855266, 0.12047159, 0.00917359, 0.0077013 ])

Let's add the Feature names as well.

In [16]:
for feature, score in zip(brc.feature_names, rf.feature_importances_):
    print(feature, " : ", score)

mean radius  :  0.05642483751167683
mean texture  :  0.016073746147144585
mean perimeter  :  0.04593491236166825
mean area  :  0.03754688396700512
mean smoothness  :  0.007061114582785731
mean compactness  :  0.012817721633748912
mean concavity  :  0.036797507035716284
mean concave points  :  0.09600026896195112
mean symmetry  :  0.004335636349633174
mean fractal dimension  :  0.003856308585639196
radius error  :  0.012515849121101444
texture error  :  0.005018628821654433
perimeter error  :  0.01699091756683065
area error  :  0.04841114752296611
smoothness error  :  0.003091276093299652
compactness error  :  0.004556897864539464
concavity error  :  0.006303066837725432
concave points error  :  0.004319271297257645
symmetry error  :  0.0033705768092159123
fractal dimension error  :  0.005841798276295813
worst radius  :  0.12050279352150113
worst texture  :  0.020782842654276625
worst perimeter  :  0.1222514997563545
worst area  :  0.11410986133246931
worst smoothness  :  0.014461545177

In [18]:
import pandas as pd

In [20]:
df = pd.DataFrame(data=x, columns= brc.feature_names)

In [21]:
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [24]:
newx = df.drop(["smoothness error", "symmetry error", "mean fractal dimension", "concave points error", 
                "mean symmetry", "compactness error", "texture error", "fractal dimension error", 
                "concavity error", "mean smoothness", "worst fractal dimension", "worst symmetry"], axis=1)

In [26]:
newx

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean compactness,mean concavity,mean concave points,radius error,perimeter error,area error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points
0,17.99,10.38,122.80,1001.0,0.27760,0.30010,0.14710,1.0950,8.589,153.40,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654
1,20.57,17.77,132.90,1326.0,0.07864,0.08690,0.07017,0.5435,3.398,74.08,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860
2,19.69,21.25,130.00,1203.0,0.15990,0.19740,0.12790,0.7456,4.585,94.03,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430
3,11.42,20.38,77.58,386.1,0.28390,0.24140,0.10520,0.4956,3.445,27.23,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575
4,20.29,14.34,135.10,1297.0,0.13280,0.19800,0.10430,0.7572,5.438,94.44,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11590,0.24390,0.13890,1.1760,7.673,158.70,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216
565,20.13,28.25,131.20,1261.0,0.10340,0.14400,0.09791,0.7655,5.203,99.04,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628
566,16.60,28.08,108.30,858.1,0.10230,0.09251,0.05302,0.4564,3.425,48.55,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418
567,20.60,29.33,140.10,1265.0,0.27700,0.35140,0.15200,0.7260,5.772,86.22,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650


In [30]:
xtrain, xtest, ytrain, ytest = train_test_split(newx,y, random_state=9)

In [31]:
for clf in (dt, bgc, rf):
    clf.fit(xtrain, ytrain)
    ypred = clf. predict(xtest)
    print(clf.__class__.__name__, accuracy_score(ytest, ypred))

DecisionTreeClassifier 0.951048951048951
BaggingClassifier 0.972027972027972
RandomForestClassifier 0.9790209790209791


Previous Score on the full dataset were:

DecisionTreeClassifier 0.951048951048951

BaggingClassifier 0.972027972027972

RandomForestClassifier 0.972027972027972

# You can see accuracy of RandomForest has been increased.