In [1]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.metrics import classification_report

In [2]:
heart_df = pd.read_csv(r"C:\Users\ekyus\Downloads\heart.csv")
heart_df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [3]:
# Loop to display unique number of classes for each categorical variable
for col in heart_df.select_dtypes(include=['object']).columns:
    print(f"Column: {col}")
    print("Unique Values and Counts:")
    print(heart_df[col].value_counts())


Column: Sex
Unique Values and Counts:
Sex
M    725
F    193
Name: count, dtype: int64
Column: ChestPainType
Unique Values and Counts:
ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64
Column: RestingECG
Unique Values and Counts:
RestingECG
Normal    552
LVH       188
ST        178
Name: count, dtype: int64
Column: ExerciseAngina
Unique Values and Counts:
ExerciseAngina
N    547
Y    371
Name: count, dtype: int64
Column: ST_Slope
Unique Values and Counts:
ST_Slope
Flat    460
Up      395
Down     63
Name: count, dtype: int64


In [15]:
## RestingECG and Sex have only two classes

In [4]:
## The output variable class balance and the number of null values in the dataset
heart_df['HeartDisease'].value_counts() ,"\n",heart_df.isnull().sum()

(HeartDisease
 1    508
 0    410
 Name: count, dtype: int64,
 '\n',
 Age               0
 Sex               0
 ChestPainType     0
 RestingBP         0
 Cholesterol       0
 FastingBS         0
 RestingECG        0
 MaxHR             0
 ExerciseAngina    0
 Oldpeak           0
 ST_Slope          0
 HeartDisease      0
 dtype: int64)

Yay, no outliers

In [16]:
# Removing outliers using Z-score
numeric_cols = heart_df.select_dtypes(include=[np.number]).columns.tolist()
z_scores = np.abs(zscore(heart_df[numeric_cols]))
outliers = (z_scores > 3)
heart_df_no_outliers = heart_df[~outliers.any(axis=1)]
heart_df_no_outliers.shape

(899, 12)

In [18]:
## number of values removed
heart_df.shape[1] - heart_df_no_outliers.shape[1]

0

We'll use label encoding for columns with two categories and one-hot encoding for columns with more than two categories.

In [6]:
# Label Encoding
label_encode_cols = ['Sex', 'ExerciseAngina']
le = LabelEncoder()
for col in label_encode_cols:
    heart_df_no_outliers[col] = le.fit_transform(heart_df_no_outliers[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_df_no_outliers[col] = le.fit_transform(heart_df_no_outliers[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_df_no_outliers[col] = le.fit_transform(heart_df_no_outliers[col])


In [7]:
# One-Hot Encoding
one_hot_encode_cols = ['ChestPainType', 'RestingECG', 'ST_Slope']
heart_df_encoded = pd.get_dummies(heart_df_no_outliers, columns=one_hot_encode_cols)

In [8]:
# Splitting and Scaling the data
X = heart_df_encoded.drop("HeartDisease", axis=1)
y = heart_df_encoded["HeartDisease"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# SVM Models
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train_scaled, y_train)
svm_predictions = svm.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_accuracy_report = classification_report(y_test, svm_predictions)
print(svm_accuracy)
print(svm_accuracy_report)


0.8481481481481481
              precision    recall  f1-score   support

           0       0.86      0.78      0.82       119
           1       0.84      0.90      0.87       151

    accuracy                           0.85       270
   macro avg       0.85      0.84      0.84       270
weighted avg       0.85      0.85      0.85       270



In [10]:
bagging_svm = BaggingClassifier(base_estimator=SVC(kernel='linear', random_state=42), n_estimators=10, random_state=42)
bagging_svm.fit(X_train_scaled, y_train)
bagging_svm_predictions = bagging_svm.predict(X_test_scaled)
bagging_svm_accuracy = accuracy_score(y_test, bagging_svm_predictions)
bagging_svm_accuracy_report = classification_report(y_test, bagging_svm_predictions)
print(bagging_svm_accuracy)
print(bagging_svm_accuracy_report)

0.8444444444444444
              precision    recall  f1-score   support

           0       0.85      0.79      0.82       119
           1       0.84      0.89      0.86       151

    accuracy                           0.84       270
   macro avg       0.84      0.84      0.84       270
weighted avg       0.84      0.84      0.84       270





#### SVM: SVM seeks the best hyperplane to classify data. Bagging may not benefit SVM much since data subsets can have similar optimal hyperplanes.

In [11]:
# Decision Tree Models
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
dt_predictions = dt.predict(X_test_scaled)
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_accuracy_report = classification_report(y_test, dt_predictions)
print(dt_accuracy)
print(dt_accuracy_report)

0.7703703703703704
              precision    recall  f1-score   support

           0       0.72      0.78      0.75       119
           1       0.82      0.76      0.79       151

    accuracy                           0.77       270
   macro avg       0.77      0.77      0.77       270
weighted avg       0.77      0.77      0.77       270



In [12]:
bagging_dt = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42), n_estimators=10, random_state=42)
bagging_dt.fit(X_train_scaled, y_train)
bagging_dt_predictions = bagging_dt.predict(X_test_scaled)
bagging_dt_accuracy = accuracy_score(y_test, bagging_dt_predictions)
bagging_dt_accuracy_report = classification_report(y_test, bagging_dt_predictions)
print(bagging_dt_accuracy)
print(bagging_dt_accuracy_report)



0.8444444444444444
              precision    recall  f1-score   support

           0       0.80      0.86      0.83       119
           1       0.88      0.83      0.86       151

    accuracy                           0.84       270
   macro avg       0.84      0.85      0.84       270
weighted avg       0.85      0.84      0.84       270



#### Decision Tree: Trees, especially deep ones, can overfit. Bagging reduces overfitting by averaging multiple tree predictions, leading to better performance.

In [14]:
### Bagging suits high variance, low bias models, like Decision Trees, that overfit. It reduces variance by averaging predictions, enhancing model generalization.