# Ensemble methods (Stacking)

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

In [2]:
data_1 = pd.read_csv('Modified_inputs_targets_1.csv')
data_2 = pd.read_csv('Modified_inputs_targets_2.csv')

In [3]:
inputs_1 = data_1.drop('Heart_Disease',axis=1)
targets_1 = data_1['Heart_Disease']

In [4]:
inputs_2 = data_2.drop('Heart_Disease',axis=1)
targets_2 = data_2['Heart_Disease']

In [5]:
targets_1.shape, targets_2.shape

((308854,), (308854,))

## Applying the algorithms to the first dataset with proper feature engineering.

Let's split the dataset into train and test sets.

In [6]:
xtrain, xtest, ytrain, ytest = train_test_split(inputs_1, targets_1, test_size=0.2, random_state=42)

In [7]:
ytest

302051    0
59950     0
203639    0
78768     0
216156    0
         ..
306409    0
226428    0
250297    1
259456    0
266115    0
Name: Heart_Disease, Length: 61771, dtype: int64

In [8]:
scaler = StandardScaler()
scaler.fit(X=xtrain)

xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

In [9]:
def model_predictions(model, train_x, train_y, test_x):
    
    #train the model
    model.fit(train_x,train_y)
    
    #score on the training data
    score=model.score(train_x,train_y)
    print('Training Score:', score)
    
    #storing predictions for train and test
    train_pred=model.predict(train_x)
    test_pred=model.predict(test_x)
    test_score = model.score(xtest,ytest)
    print('Test score:', test_score)
    return train_pred, test_pred

## Logistic Regression

In [10]:
LR = LogisticRegression()

In [11]:
logistic_train_pred,logistic_test_pred = model_predictions(LR,xtrain,ytrain,xtest)

Training Score: 0.9192336178531101
Test score: 0.9197196095255055


## Decision Trees

In [12]:
DT = DecisionTreeClassifier()

In [13]:
dt_train_pred, dt_test_pred = model_predictions(DT,xtrain,ytrain,xtest)

Training Score: 0.998449913591789
Test score: 0.8637872140648525


## K Nearest Neighbours

In [14]:
knn=KNeighborsClassifier()
knn_train_pred, knn_test_pred = model_predictions(knn,xtrain,ytrain,xtest)

Training Score: 0.9270933249151095
Test score: 0.9103786566511793


Let's create data frames for train and test predictions

In [15]:
train_predictions = {
              'LR': logistic_train_pred,
              'DT': dt_train_pred,
              'knn': knn_train_pred
              }
train_predictions = pd.DataFrame(train_predictions)
train_predictions

Unnamed: 0,LR,DT,knn
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
247078,0,0,0
247079,0,0,0
247080,0,0,0
247081,0,0,0


In [16]:
test_predictions = {
              'LR': logistic_test_pred,
              'DT': dt_test_pred,
              'knn': knn_test_pred
              }
test_predictions = pd.DataFrame(test_predictions)
test_predictions

Unnamed: 0,LR,DT,knn
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,1,0
...,...,...,...
61766,0,0,0
61767,0,0,0
61768,0,0,0
61769,0,0,0


In [17]:
score1 = []
score2 = []

## Stacked Model using Decision Trees

In [18]:
model = LogisticRegression()
model.fit(train_predictions, ytrain)
score1.append(model.score(test_predictions,ytest))
score1[0]

0.8637872140648525

In [19]:
# Our stacked model is giving a score of 0.67

## Trying the XGboost algorithm

In [20]:
xgc = XGBClassifier(random_state=96)

In [21]:
xgc.fit(xtrain,ytrain)

In [22]:
xgc.score(xtrain, ytrain)

0.9222649878785671

In [23]:
score1.append(xgc.score(xtest, ytest))
score1[1]

0.9187806575901313

In [24]:
# XG boost is giving us better accuracy.

## Random Forests classifier

In [25]:
forests = RandomForestClassifier(random_state=42)

In [26]:
forests.fit(xtrain,ytrain)

In [27]:
forests.score(xtrain, ytrain)

0.9984256302538014

In [28]:
score1.append(forests.score(xtest, ytest))
score1[2]

0.9134545336808535

We can see that RandomForests and XGboost models are giving a significantly better test score than the stacked model.

In [29]:
forests.feature_importances_

array([0.07471133, 0.07407704, 0.24910914, 0.0007336 , 0.00521029,
       0.00303895, 0.00854576, 0.02209113, 0.01657365, 0.0187513 ,
       0.0222541 , 0.0073835 , 0.02389726, 0.00141107, 0.02161542,
       0.01669475, 0.01530475, 0.0163467 , 0.00581485, 0.01532301,
       0.01767439, 0.08594434, 0.20550162, 0.07199205])

In [30]:
Features = pd.DataFrame({'Features':inputs_1.columns, 'Importance':forests.feature_importances_})
Features.sort_values('Importance', ascending=False)

Unnamed: 0,Features,Importance
2,Weight_(kg),0.249109
22,Fruit-Green ratio,0.205502
21,Alcohol-Fruit ratio,0.085944
0,General_Health,0.074711
1,Age_Category,0.074077
23,Alcohol-Potato product,0.071992
12,Diabetes_Yes,0.023897
10,Depression_Yes,0.022254
7,Exercise_Yes,0.022091
14,Arthritis_Yes,0.021615


It can be seen that BMI, Age category, General Health, Green_vegetable_consumption, Fruit_consumption, Fried_potato_consumption are the highest contributing factors to the outputs.

## Now let's apply the algorithms to the second dataset with improper feature engineering.

In [31]:
# Split into train and test

In [32]:
xtrain, xtest, ytrain, ytest = train_test_split(inputs_2, targets_2, test_size=0.2, random_state=42)

In [33]:
scaler = StandardScaler()
scaler.fit(X=xtrain)

xtrain = scaler.transform(xtrain)
xtest = scaler.transform(xtest)

In [34]:
def model_predictions(model, train_x, train_y, test_x):
    
    #train the model
    model.fit(train_x,train_y)
    
    #score on the training data
    score=model.score(train_x,train_y)
    print('Training Score:', score)
    
    #storing predictions for train and test
    train_pred=model.predict(train_x)
    test_pred=model.predict(test_x)
    test_score = model.score(xtest,ytest)
    print('Test score:', test_score)
    return train_pred, test_pred

## Logistic Regression

In [35]:
LR = LogisticRegression()

In [36]:
logistic_train_pred,logistic_test_pred = model_predictions(LR,xtrain,ytrain,xtest)

Training Score: 0.9192133817381204
Test score: 0.9192663223842904


## Decision Trees

In [37]:
DT = DecisionTreeClassifier()

In [38]:
dt_train_pred, dt_test_pred = model_predictions(DT,xtrain,ytrain,xtest)

Training Score: 0.9999757166620123
Test score: 0.8606142040763465


## K Nearest Neighbours

In [39]:
knn=KNeighborsClassifier()
knn_train_pred, knn_test_pred = model_predictions(knn,xtrain,ytrain,xtest)

Training Score: 0.9269597665561775
Test score: 0.9088245293098703


Let's create data frames for train and test predictions

In [40]:
train_predictions = {
              'LR': logistic_train_pred,
              'DT': dt_train_pred,
              'knn': knn_train_pred
              }
train_predictions = pd.DataFrame(train_predictions)
train_predictions

Unnamed: 0,LR,DT,knn
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
247078,0,0,0
247079,0,0,0
247080,0,0,0
247081,0,0,0


In [41]:
test_predictions = {
              'LR': logistic_test_pred,
              'DT': dt_test_pred,
              'knn': knn_test_pred
              }
test_predictions = pd.DataFrame(test_predictions)
test_predictions

Unnamed: 0,LR,DT,knn
0,0,1,0
1,0,0,0
2,0,1,0
3,0,0,0
4,0,0,0
...,...,...,...
61766,0,0,0
61767,0,0,0
61768,0,0,0
61769,0,0,0


## Stacked Model using Decision Trees

In [42]:
model = LogisticRegression()
model.fit(train_predictions, ytrain)
score2.append(model.score(test_predictions,ytest))
score2[0]

0.8606142040763465

In [43]:
# Our stacked model is giving a score of 0.66

## Trying the XGboost algorithm

In [44]:
xgc = XGBClassifier(random_state=96)

In [45]:
xgc.fit(xtrain,ytrain)

In [46]:
xgc.score(xtrain, ytrain)

0.9239486326457101

In [47]:
score2.append(xgc.score(xtest, ytest))
score2[1]

0.9184730698871639

In [48]:
# XG boost is giving us better accuracy.

## Random Forests classifier

In [49]:
forests = RandomForestClassifier(random_state=42)

In [50]:
forests.fit(xtrain,ytrain)

In [51]:
forests.score(xtrain, ytrain)

0.9999514333240247

In [52]:
score2.append(forests.score(xtest, ytest))
score2[2]

0.9182949927959723

We can see that RandomForests and XGboost models are giving a significantly better test score than the stacked model.

In [53]:
forests.feature_importances_

array([0.09711237, 0.12979622, 0.14843122, 0.06200807, 0.09540178,
       0.09820092, 0.09220168, 0.00870342, 0.00999534, 0.01707922,
       0.02161074, 0.00064654, 0.00523478, 0.00282392, 0.00841775,
       0.01910409, 0.01472904, 0.01642491, 0.01855294, 0.00621781,
       0.02331308, 0.00118822, 0.01948706, 0.01539357, 0.00448814,
       0.00629678, 0.00918022, 0.01494391, 0.01672354, 0.01629274])

In [54]:
Features = pd.DataFrame({'Features':inputs_2.columns, 'Importance':forests.feature_importances_})
Features.sort_values('Importance', ascending=False)

Unnamed: 0,Features,Importance
2,BMI,0.148431
1,Weight_(kg),0.129796
5,Green_Vegetables_Consumption,0.098201
0,Height_(cm),0.097112
4,Fruit_Consumption,0.095402
6,FriedPotato_Consumption,0.092202
3,Alcohol_Consumption,0.062008
20,Diabetes_Yes,0.023313
10,General_Health_E,0.021611
22,Arthritis_Yes,0.019487


## Comparing the accuracies

In [55]:
accuracies = pd.DataFrame({'Dataset 1': score1, 'Dataset 2': score2}, index=['Stacked', 'XG Boost', 'Random Forests'])
accuracies

Unnamed: 0,Dataset 1,Dataset 2
Stacked,0.863787,0.860614
XG Boost,0.918781,0.918473
Random Forests,0.913455,0.918295
