# Parveen Kumar
# 8822123

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

# Step 1: Load the heart disease dataset into a pandas DataFrame

In [12]:
df = pd.read_csv('C:\\Users\\HP\\Downloads\\archive (3)\\heart.csv')

In [13]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [14]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [53]:
df.shape

(918, 16)

# Step 2: Remove outliers using mean, median, and Z-score

# Oulier by Mean

In [42]:
mean = np.mean(df)
std_dev = np.std(df)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [68]:
threshold = 1 
outliers = df[np.abs(df - mean) > threshold * std_dev]
df_1 = df[np.abs(df - mean) <= threshold * std_dev]


In [69]:
df_1.shape

(918, 16)

# Oulier removal by Median

In [57]:
median = np.median(df)
mad = np.median(np.abs(df - median))

In [70]:
threshold = 1.2 # number of MADs away from the median
outliers = df[np.abs(df - median) > threshold * mad]
df_2 = df[np.abs(df - median) <= threshold * mad]

In [71]:
df_2.shape

(918, 16)

# remove outliers using the Z-score

In [72]:
from scipy import stats
z_scores = stats.zscore(df)
threshold = 0.8

In [73]:
# standard deviations away from the mean
outliers = df[np.abs(z_scores) > threshold]
df_3 = df[np.abs(z_scores) <= threshold]

In [74]:
df_3.shape

(918, 16)

# Step 3: Convert text columns to numbers using label encoding and one hot encoding

In [18]:
# Identify the text columns and use label encoding for them
label_encoder = LabelEncoder()

df['ChestPainType'] = label_encoder.fit_transform(df['ChestPainType'])
df['RestingECG'] = label_encoder.fit_transform(df['RestingECG'])

In [19]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,1,140,289,0,1,172,N,0.0,Up,0
1,49,F,2,160,180,0,1,156,N,1.0,Flat,1
2,37,M,1,130,283,0,2,98,N,0.0,Up,0
3,48,F,0,138,214,0,1,108,Y,1.5,Flat,1
4,54,M,2,150,195,0,1,122,N,0.0,Up,0


In [20]:
df = pd.get_dummies(df, columns=['Sex','ExerciseAngina','ST_Slope'])

In [21]:
df.head()

Unnamed: 0,Age,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,1,172,0.0,0,0,1,1,0,0,0,1
1,49,2,160,180,0,1,156,1.0,1,1,0,1,0,0,1,0
2,37,1,130,283,0,2,98,0.0,0,0,1,1,0,0,0,1
3,48,0,138,214,0,1,108,1.5,1,1,0,0,1,0,1,0
4,54,2,150,195,0,1,122,0.0,0,0,1,1,0,0,0,1


In [25]:
col=df.columns

In [26]:
print(col)

Index(['Age', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'Oldpeak', 'HeartDisease', 'Sex_F', 'Sex_M',
       'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down',
       'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')


# Step-4 Scaling

In [27]:
from sklearn.preprocessing import MinMaxScaler

In [28]:
# Assuming 'X' is your feature matrix
scaler = MinMaxScaler()
scaled_data= scaler.fit_transform(df)


In [29]:
# The scaled_data will be a numpy array, you can convert it back to a DataFrame if needed
df = pd.DataFrame(scaled_data, columns=col)

# Step 5: Build a machine learning classification model using support vector machine


In [30]:
# Split the data into features (X) and target (y)
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

In [31]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
# Build the standalone SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

In [34]:
# Build the Bagging SVM model
bagging_svm_model = BaggingClassifier(estimator=SVC(), n_estimators=10)
bagging_svm_model.fit(X_train, y_train)
bagging_svm_predictions = bagging_svm_model.predict(X_test)

In [35]:
# Calculate accuracy scores for both models
svm_accuracy = accuracy_score(y_test, svm_predictions)
bagging_svm_accuracy = accuracy_score(y_test, bagging_svm_predictions)

In [36]:
print("SVM Standalone Accuracy:", svm_accuracy)
print("SVM Bagging Accuracy:", bagging_svm_accuracy)


SVM Standalone Accuracy: 0.8369565217391305
SVM Bagging Accuracy: 0.8478260869565217


# Step 6: Use decision tree classifier

In [37]:
# Build the standalone Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

In [39]:
# Build the Bagging Decision Tree model
bagging_dt_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)
bagging_dt_model.fit(X_train, y_train)
bagging_dt_predictions = bagging_dt_model.predict(X_test)

In [40]:
# Calculate accuracy scores for both models
dt_accuracy = accuracy_score(y_test, dt_predictions)
bagging_dt_accuracy = accuracy_score(y_test, bagging_dt_predictions)

In [41]:
print("Decision Tree Standalone Accuracy:", dt_accuracy)
print("Decision Tree Bagging Accuracy:", bagging_dt_accuracy)


Decision Tree Standalone Accuracy: 0.7989130434782609
Decision Tree Bagging Accuracy: 0.8369565217391305


# Step-7 Comparison:
Based on the provided accuracy scores for SVM and Decision Tree classifiers, it seems that the SVM classifier performs slightly better than the Decision Tree classifier in this specific scenario. The SVM bagging accuracy is 0.8478, while the Decision Tree bagging accuracy is 0.8369.

Bagging is an ensemble learning technique that aims to improve the performance and robustness of machine learning models by combining multiple base models (learners) and aggregating their predictions. It does this by training each base model on a random subset of the original data, with replacement. The final prediction is obtained by averaging or voting on the predictions of all the base models.

Bagging is especially useful in situations where:


1. High Variance: There is a danger of overfitting when utilising complicated models, such as SVM with high-dimensional feature spaces, resulting in high variance. Bagging reduces variance by integrating different models, perhaps resulting in a more stable and trustworthy forecast.

2. Low Bias: SVM has a low bias, which means it can capture complicated correlations in data. It may, however, be vulnerable to outliers or noise in the data. By averaging out the impact of outliers over different models, bagging can help reduce this sensitivity.

3. huge and Diverse Datasets: When there is a huge and diverse dataset, bagging typically works well. Bagging helps each basic model to learn from somewhat varied viewpoints by training it on a random fraction of the data, resulting in more diversified predictions.

4. Bagging is frequently more appropriate for classification jobs than regression ones, however it may be modified for regression as well.

5. Bagging can be computationally costly since it needs the training of many models. Bagging has grown more realistic, thanks to developments in parallel computing and the availability of powerful hardware.