In [54]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from scipy import stats
import warnings 
from sklearn.svm import SVC
warnings.filterwarnings('ignore')
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load data
heart_data = pd.read_csv('heart.csv')
# Print the data
heart_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [3]:
# Check descriptive statistic of dataset
heart_data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [4]:
# Check missing values
heart_data.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [18]:
# Calculate z-scores for numerical columns
numerical_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
z_scores = stats.zscore(heart_data[numerical_columns])

# Find outliers based on threshold value of 3
outlier = heart_data[(abs(z_scores) > 3).any(axis=1)]

# remove outliers
new_heart_data = heart_data[~heart_data.index.isin(outlier.index)]

# Check data after outlier removal
new_heart_data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [21]:
# Check descriptive statistic of dataset after outlier removal
new_heart_data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,899.0,899.0,899.0,899.0,899.0,899.0,899.0
mean,53.497219,132.027809,198.005562,0.232481,136.917686,0.861513,0.547275
std,9.456073,17.120895,107.157779,0.422649,25.35674,1.007626,0.498037
min,28.0,80.0,0.0,0.0,63.0,-2.0,0.0
25%,47.0,120.0,174.5,0.0,120.0,0.0,0.0
50%,54.0,130.0,222.0,0.0,138.0,0.5,1.0
75%,60.0,140.0,266.0,0.0,156.0,1.5,1.0
max,77.0,185.0,518.0,1.0,202.0,4.0,1.0


In [27]:
# Convert text columns to numbers using one hot encoding
new_heart_data = pd.get_dummies(new_heart_data, drop_first=True)

# Check few rows of data after hot encoding
new_heart_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1,1
914,68,1,0,144,193,1,1,141,0,3.4,1,1
915,57,1,0,130,131,0,1,115,1,1.2,1,1
916,57,0,1,130,236,0,0,174,0,0.0,1,1


In [29]:
# Check the value count for class
new_heart_data['HeartDisease'].value_counts()

HeartDisease
1    492
0    407
Name: count, dtype: int64

In [30]:
# Seperate data and labels
X = new_heart_data.drop(columns='HeartDisease', axis=1)
y = new_heart_data['HeartDisease']

In [31]:
# Standardize the data
heart_data_scaler = StandardScaler()
heart_data_scaler.fit(X)

In [32]:
# Transform the data
standardize_heart_data = heart_data_scaler.transform(X)
# Print transformed data in similar range
print(standardize_heart_data)

[[-1.42815446  0.515943    0.2245723  ... -0.8229452  -0.85546862
   1.04249607]
 [-0.47585532 -1.93819859  1.27063705 ... -0.8229452   0.13751561
  -0.62216462]
 [-1.7455875   0.515943    0.2245723  ... -0.8229452  -0.85546862
   1.04249607]
 ...
 [ 0.3706328   0.515943   -0.82149245 ...  1.21514774  0.33611246
  -0.62216462]
 [ 0.3706328  -1.93819859  0.2245723  ... -0.8229452  -0.85546862
  -0.62216462]
 [-1.63977649  0.515943    1.27063705 ... -0.8229452  -0.85546862
   1.04249607]]


In [33]:
# Define label and target for model prediction
X = standardize_heart_data
y = new_heart_data['HeartDisease']

In [34]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state = 40)

In [35]:
# Check the shape of X
X.shape

(899, 11)

In [36]:
# Check the shape of X train
X_train.shape

(719, 11)

In [37]:
# Check the shape of X test
X_test.shape

(180, 11)

In [57]:
# Create standalone svm model
classifier = SVC(kernel='linear')
# Train the classifier
classifier.fit(X_train, y_train)
# Prdict the data
X_train_prediction = classifier.predict(X_train)
# Check the accuracy of data for SVM standalone model
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [58]:
# Bagging model with SVM base estimator
bagging_svm_model = BaggingClassifier(base_estimator=SVC(kernel='linear', random_state=42), n_estimators=10, random_state=42)
# Fit the model
bagging_svm_model.fit(X_train, y_train)
# predict the data
y_prediction_bagging_svm = bagging_svm_model.predict(X_test)
# Check the accuracy of predicted data for Bagging model
accuracy_bagging_svm = accuracy_score(y_test, y_prediction_bagging_svm)

In [63]:
# Print accuracy of data for both model
print('Accuracy score of the data for Standalone SVM model is ', training_data_accuracy)
print('Accuracy score of the data for Bagging SVM model is ', accuracy_bagging_svm)

Accuracy score of the data for Standalone SVM model is  0.866481223922114
Accuracy score of the data for Bagging SVM model is  0.8555555555555555


In [60]:
# Standalone Decision Tree model
DT_model = DecisionTreeClassifier(random_state=42)
# Fit the model
DT_model.fit(X_train, y_train)
# Predict the data
y_prediction_DT = DT_model.predict(X_test)
# Check the accuracy
accuracy_DT = accuracy_score(y_test, y_prediction_DT)

In [61]:
# Bagging model with Decision Tree base estimator
bagging_DT_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42), n_estimators=10, random_state=42)
# Fit the model
bagging_DT_model.fit(X_train, y_train)
# Predict the data
y_prediction_bagging_DT = bagging_DT_model.predict(X_test)
# Check the accuracy
accuracy_bagging_DT = accuracy_score(y_test, y_prediction_bagging_DT)

In [62]:
print("Accuracy score of the data for Standalone Decision Tree is", accuracy_DT)
print("Accuracy score of the data for Bagging Decision Tree is", accuracy_bagging_DT)

Accuracy score of the data for Standalone Decision Tree is 0.8
Accuracy score of the data for Bagging Decision Tree is 0.8611111111111112


#### In general bagging bagging tends to improve performance of the model. In decision tree model, we can clearly see that accuracy is improved compared to standalone however, it is very little improvement. But in the case of svm, bagging model has less accuracy compared to standalone model

#### Bagging often performs better when the basic model is unstable, or has large variation (for example, decision trees). By averaging the results of several different models, bagging helps to minimise variation and makes a model less prone to overfitting.