In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [12]:
# Load the dataset
df = pd.read_csv('data/heart.csv')
df.head(5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [13]:
# a. Fill missing values in cholesterol, restecg, and thall columns
df['chol'].fillna(df['chol'].mean(), inplace=True)
df['restecg'].fillna(df['restecg'].mode()[0], inplace=True)
df['thall'].fillna(df['thall'].mode()[0], inplace=True)

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['chol'].fillna(df['chol'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['restecg'].fillna(df['restecg'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

In [14]:
# b. One-Hot Encoding for categorical columns (sex, cp, exng)
df = pd.get_dummies(df, columns=['sex', 'cp', 'exng'], drop_first=True)

df.head()

Unnamed: 0,age,trtbps,chol,fbs,restecg,thalachh,oldpeak,slp,caa,thall,output,sex_1,cp_1,cp_2,cp_3,exng_1
0,63,145,233,1,0,150,2.3,0,0,1,1,True,False,False,True,False
1,37,130,250,0,1,187,3.5,0,0,2,1,True,False,True,False,False
2,41,130,204,0,0,172,1.4,2,0,2,1,False,True,False,False,False
3,56,120,236,0,1,178,0.8,2,0,2,1,True,True,False,False,False
4,57,120,354,0,1,163,0.6,2,0,2,1,False,False,False,False,True


In [15]:

# c. Create an AgeGroup column based on age
bins = [0, 30, 50, 100]
labels = ['young', 'middle-aged', 'elderly']
df['AgeGroup'] = pd.cut(df['age'], bins=bins, labels=labels)

df[['age', 'AgeGroup']].head()  # Output AgeGroup column

Unnamed: 0,age,AgeGroup
0,63,elderly
1,37,middle-aged
2,41,middle-aged
3,56,elderly
4,57,elderly


In [16]:

# d. Normalize features like chol, thalachh, and oldpeak
scaler = StandardScaler()
df[['chol', 'thalachh', 'oldpeak']] = scaler.fit_transform(df[['chol', 'thalachh', 'oldpeak']])

df[['chol', 'thalachh', 'oldpeak']].head()  # Output normalized features

Unnamed: 0,chol,thalachh,oldpeak
0,-0.256334,0.015443,1.087338
1,0.072199,1.633471,2.122573
2,-0.816773,0.977514,0.310912
3,-0.198357,1.239897,-0.206705
4,2.08205,0.583939,-0.379244


In [17]:
# e. Build and evaluate a classification model to predict the presence of heart disease
X = df.drop(['output', 'age', 'AgeGroup'], axis=1)  # Features
y = df['output']  # Target

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [19]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
classification_report_output = classification_report(y_test, y_pred)  # Get classification report

accuracy, classification_report_output

(0.8360655737704918,
 '              precision    recall  f1-score   support\n\n           0       0.83      0.83      0.83        29\n           1       0.84      0.84      0.84        32\n\n    accuracy                           0.84        61\n   macro avg       0.84      0.84      0.84        61\nweighted avg       0.84      0.84      0.84        61\n')