In [1]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import xgboost
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = datasets.load_breast_cancer()

In [3]:
df.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [4]:
df.target_names

array(['malignant', 'benign'], dtype='<U9')

In [5]:
X = pd.DataFrame(columns = df.feature_names, data = df.data)

In [42]:
X.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal
1,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0
2,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1
3,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2
4,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1
5,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1


In [6]:
y = df.target

In [41]:
y

1      0
2      1
3      1
4      0
5      0
      ..
298    1
299    1
300    1
301    1
302    1
Name: target, Length: 297, dtype: int64

In [7]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
X.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64

No null values.

In [9]:
df.target.shape

(569,)

In [10]:
target = {'target' : df.target}

In [11]:
y = pd.DataFrame(data = target)

In [12]:
y.value_counts()

target
1         357
0         212
dtype: int64

The data looks balanced, so we will choose accuracy as our metric.

Here,
*   1 - Benign
*   0 - Malignant




In [13]:
y = y['target']

In [14]:
X.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


Takeaways : 


1. The data doesnot require any preprocessing.
2. EDA is not as such required so we move ahead to the modeling part.




**K fold cross Validation**

In [15]:
dtc =  DecisionTreeClassifier()
rfc = RandomForestClassifier()
knn =  KNeighborsClassifier()
gbm = GradientBoostingClassifier()
lr = LogisticRegression()

In [16]:
clf = [dtc,rfc,knn, gbm, lr]
for algo in clf:
    score = cross_val_score( algo,X,y,cv = 5,scoring = 'accuracy')
    print("The accuracy score of {} is:".format(algo),score.mean())


The accuracy score of DecisionTreeClassifier() is: 0.9173730787144854
The accuracy score of RandomForestClassifier() is: 0.9596025461884802
The accuracy score of KNeighborsClassifier() is: 0.9279459711224964
The accuracy score of GradientBoostingClassifier() is: 0.9613724576929048
The accuracy score of LogisticRegression() is: 0.9385188635305075


# Stacking

In [17]:

dtc =  DecisionTreeClassifier()
rfc = RandomForestClassifier()
knn =  KNeighborsClassifier()
xgb =  xgboost.XGBClassifier()
gbm = GradientBoostingClassifier()
lr = LogisticRegression()




In [18]:
clf = [('rfc',rfc),('knn',knn), ('xgb', xgb),('gbm', gbm)] #list of (str, estimator)
lr = LogisticRegression()
stack_model = StackingClassifier( estimators = clf,final_estimator = lr)
score = cross_val_score(stack_model,X,y,cv = 5,scoring = 'accuracy')
print("The accuracy score of is:",score.mean())

The accuracy score of is: 0.9718987734823784


In [19]:
clf = [('dtc', dtc),('rfc',rfc),('xgb', xgb), ('knn',knn), ('gbm', gbm)] #list of (str, estimator)
gbm = GradientBoostingClassifier()
stack_model = StackingClassifier( estimators = clf,final_estimator = gbm)
score = cross_val_score(stack_model,X,y,cv = 5,scoring = 'accuracy')
print("The accuracy score of is:",score.mean())

The accuracy score of is: 0.9701288619779538


In [20]:
clf = [('rfc',rfc),('knn',knn), ('xgb', xgb),('gbm', gbm), ('dtc', dtc)] #list of (str, estimator)

lr = LogisticRegression()
stack_model = StackingClassifier( estimators = clf,final_estimator = lr)
score = cross_val_score(stack_model,X,y,cv = 5,scoring = 'accuracy')
print("The accuracy score of is:",score.mean())

The accuracy score of is: 0.968390001552554


**The ensemble stacking model achives the highest accuracy than any other model taken alone.**

# Another datset, few other models (heart dataset)

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [22]:
df = pd.read_csv('heart.csv',index_col=0 ) # loading the dataset
df.head()					 # viewing top 5 rows of dataset


Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes
3,67,1,asymptomatic,120,229,0,2,129,1,2.6,2,2.0,reversable,Yes
4,37,1,nonanginal,130,250,0,0,187,0,3.5,3,0.0,normal,No
5,41,0,nontypical,130,204,0,2,172,0,1.4,1,0.0,normal,No


In [23]:
df.isna().sum()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
Ca           4
Thal         2
AHD          0
dtype: int64

In [24]:
df = df.dropna()

In [25]:
df.isna().sum()

Age          0
Sex          0
ChestPain    0
RestBP       0
Chol         0
Fbs          0
RestECG      0
MaxHR        0
ExAng        0
Oldpeak      0
Slope        0
Ca           0
Thal         0
AHD          0
dtype: int64

In [26]:
le = preprocessing.LabelEncoder()
for i in ['ChestPain','Thal', 'AHD' ]:
    
    df[i] = le.fit_transform(df[i])

df.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
1,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0,0
2,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1,1
3,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2,1
4,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1,0
5,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1,0


In [27]:
df['target'] = df['AHD']

In [28]:
# Creating X and y for training
X = df.drop(['target', 'AHD'], axis = 1)
y = df['target']


In [29]:
X.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal
1,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0
2,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1
3,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2
4,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1
5,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1


In [30]:
# 20 % training dataset is considered for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [32]:

sc = StandardScaler()
# variables that needed to be transformed
var_transform = ['Age', 'RestBP', 'Chol', 'MaxHR','Oldpeak']
X_train[var_transform] = sc.fit_transform(X_train[var_transform]) # standardizing training data
X_test[var_transform] = sc.transform(X_test[var_transform])		 # standardizing test data
print(X_train.head())


          Age  Sex  ChestPain    RestBP      Chol  Fbs  RestECG     MaxHR  \
278 -1.741679    0          1  0.319284 -0.544213    0        0  0.151798   
263  0.601114    0          3  0.985981 -0.161771    0        0  0.971399   
31   1.605169    0          3  0.430400 -0.180893    0        0  0.108661   
23   0.377991    1          2 -0.680763  0.679601    0        2  0.496893   
282 -0.849186    1          1 -0.125182  0.086816    0        0  1.316494   

     ExAng   Oldpeak  Slope   Ca  Thal  
278      0 -0.915041      2  0.0     1  
263      0 -0.121634      1  0.0     1  
31       0  0.671774      1  2.0     1  
23       0  0.671774      2  0.0     1  
282      0 -0.915041      1  0.0     1  


In [33]:
 # initialising  Classifier
SV = SVC(probability=True)
LR = LogisticRegression()

In [34]:
model_SV = SV.fit(X_train, y_train) # fitting Training Set
pred_sv = model_SV.predict(X_test) # Predicting on test dataset


In [35]:
acc_sv = accuracy_score(y_test, pred_sv) # evaluating accuracy score
print('accuracy score of KNeighbors Classifier is:', acc_sv * 100)


accuracy score of KNeighbors Classifier is: 83.33333333333334


In [36]:
model_LogisticReg = LR.fit(X_train, y_train) # fitting Training Set
pred_lr = model_LogisticReg.predict(X_test) # Predicting on test dataset


In [37]:
acc_lr = accuracy_score(y_test, pred_lr) # evaluating accuracy score
print('accuracy score of LR Classifier is:', acc_lr * 100)


accuracy score of LR Classifier is: 85.0


In [38]:
KNC = KNeighborsClassifier()
clf_stack = StackingClassifier(classifiers =[SV, LR], meta_classifier = KNC, use_probas = True, use_features_in_secondary = True)


In [39]:
model_stack = clf_stack.fit(X_train, y_train) # training of stacked model

pred_stack = model_stack.predict(X_test)       # predictions on test data using stacked model

In [40]:
acc_stack = accuracy_score(y_test, pred_stack) # evaluating accuracy
print('accuracy score of Stacked model:', acc_stack * 100)


accuracy score of Stacked model: 86.66666666666667


Notebook credits https://www.analyticsvidhya.com/blog/2021/08/ensemble-stacking-for-machine-learning-and-deep-learning/