# Step 1: Load Dependencies & Set Up
In this step, we import the necessary libraries and set a fixed random seed for reproducibility.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# One seed shared by every seeded step in this notebook (the train/val/test
# splits below pass it as random_state).
RANDOM_SEED = 42
# Seed NumPy's global random generator as well.
np.random.seed(seed=RANDOM_SEED)

# Step 2: Load the Dataset
We load the Heart Failure Prediction dataset from a CSV file and display the first few rows to understand its structure.

In [2]:
# Read the raw CSV into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
heart_csv_path = "heart.csv"
df = pd.read_csv(heart_csv_path)
# Preview the first five rows.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


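# Step 3: Inspect Column Types (Categorical vs. Numerical)
We inspect the column dtypes with `df.info()`. The dataset already contains five categorical (`object`) columns — `Sex`, `ChestPainType`, `RestingECG`, `ExerciseAngina`, and `ST_Slope` — so there is no need to add a hypothetical categorical feature (such as "Smoking_Status") to demonstrate one-hot encoding.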

In [3]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


# Step 4: Separate Features and Labels
We split the dataset into:
- `X`: The feature matrix (excluding the target variable)
- `y`: The target variable (`HeartDisease`)

In [4]:
# Pull out the binary label first, then keep every other column as a feature.
y = df["HeartDisease"]  # Binary target
X = df.drop("HeartDisease", axis="columns")  # Features

# Step 5: Identify Numerical & Categorical Features
Here, we classify features into numerical and categorical types for appropriate preprocessing.


In [5]:
# Numerical columns are discovered from their dtypes; categorical columns are
# listed explicitly by name.
numeric_dtypes = ["int64", "float64"]
num_features = X.select_dtypes(include=numeric_dtypes).columns
cat_features = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]

# Report both groups so the classification can be checked by eye.
print("Numerical Features:", list(num_features))
print("Categorical Features:", cat_features)


Numerical Features: ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
Categorical Features: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


# Step 6: One-Hot Encode Categorical Features
Most machine learning models require numerical inputs. We use **One-Hot Encoding** to convert categorical features into numerical format.


In [6]:
# One-hot encode categorical features.
# drop="first" omits the first category of every feature, and
# sparse_output=False makes the encoder return a dense array.
encoder = OneHotEncoder(drop="first", sparse_output=False)

# Rebuild the feature frame from `df` rather than reading it from `X`: this cell
# reassigns `X` without the categorical columns, so reading them from `X` would
# raise a KeyError if the cell were run a second time.
X_raw = df.drop(columns=["HeartDisease"])

# Convert the encoded array to a DataFrame with descriptive column names
X_encoded = pd.DataFrame(
    encoder.fit_transform(X_raw[cat_features]),
    columns=encoder.get_feature_names_out(cat_features),
)

# Drop original categorical features and merge with encoded ones.
# The index is reset so both frames share a 0..n-1 index and concat lines up
# the rows positionally.
X = pd.concat(
    [X_raw.drop(columns=cat_features).reset_index(drop=True), X_encoded],
    axis=1,
)

# Display updated dataset
X.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,160,180,0,156,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,130,283,0,98,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,48,138,214,0,108,1.5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,54,150,195,0,122,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


# Step 7: Stratified Train/Validation/Test Split
We split the dataset into:
- **70% Training Set**: Used for model training
- **10% Validation Set**: Used for hyperparameter tuning
- **20% Test Set**: Used only for final model evaluation

We use **stratification** to maintain the original class distribution in all subsets.


In [7]:
# Stage 1: hold out 30% of the rows, stratified on the label; the remaining
# 70% becomes the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_SEED,
)

# Stage 2: divide the held-out 30% into validation (1/3 of it -> 10% overall)
# and test (2/3 of it -> 20% overall), again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=2/3,
    stratify=y_temp,
    random_state=RANDOM_SEED,
)

# Report the shape of each split
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

Training set: (642, 15), Validation set: (92, 15), Test set: (184, 15)


# Step 8: Standardize Numerical Features
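Standardization rescales each numerical feature to zero mean and unit variance so that all numerical features share a comparable scale. The scaler is fit on the training set only and then applied unchanged to the validation and test sets, so no statistics from held-out data leak into preprocessing.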


In [None]:
# Initialize scaler
scaler = StandardScaler()

# The split frames are row subsets of X produced by train_test_split. Take
# explicit copies before assigning columns so the writes go to independent
# frames and pandas has no reason to emit a SettingWithCopyWarning.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Standardize only numerical features.
# The scaler learns its statistics from the training split alone; validation
# and test are transformed with those same statistics.
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_val[num_features] = scaler.transform(X_val[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

# Display standardized data
X_train.head()

# Step 9: Train a Decision Tree Model
We train a Decision Tree Classifier on the training data.

In [9]:
# NOTE(review): `DecisionTree` is imported as a top-level module, not from
# sklearn.tree — presumably a project-local implementation; confirm it is on
# the import path before running this cell.
from DecisionTree import DecisionTreeClassifier
# Build the classifier with its default settings.
model = DecisionTreeClassifier()
# Fit on the training split only; validation and test rows are not passed in.
model.fit(X_train, y_train)

# Step 10: Make Predictions
We use the trained Decision Tree model to predict the class labels on:
- **Training Data** (to check for overfitting)
- **Validation Data** (for hyperparameter tuning)
- **Test Data** (for final evaluation)


In [10]:
# Predict labels for the training, validation and test splits, in that order.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_val, X_test)
)

# Step 11: Evaluate Model Performance
We compute key classification metrics:
- **Accuracy**: Overall correctness
- **Precision & Recall**: Measures balance between false positives and false negatives
- **F1 Score**: Harmonic mean of precision & recall
- **Confusion Matrix**: Breakdown of predictions per class


In [11]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def evaluate_model(y_true, y_pred, dataset_name="Dataset"):
    """Print accuracy, the classification report and the confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.
        dataset_name: Label used in the printed header for this evaluation.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\nEvaluation on {dataset_name}:")

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    report = classification_report(y_true, y_pred)
    print(report)

    print("Confusion Matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)


# Evaluate on all sets, in train -> validation -> test order
for split_name, split_true, split_pred in (
    ("Training Set", y_train, y_train_pred),
    ("Validation Set", y_val, y_val_pred),
    ("Test Set", y_test, y_test_pred),
):
    evaluate_model(split_true, split_pred, split_name)


Evaluation on Training Set:
Accuracy: 0.9860

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       287
           1       0.98      0.99      0.99       355

    accuracy                           0.99       642
   macro avg       0.99      0.99      0.99       642
weighted avg       0.99      0.99      0.99       642

Confusion Matrix:
[[281   6]
 [  3 352]]

Evaluation on Validation Set:
Accuracy: 0.8478

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.78      0.82        41
           1       0.84      0.90      0.87        51

    accuracy                           0.85        92
   macro avg       0.85      0.84      0.84        92
weighted avg       0.85      0.85      0.85        92

Confusion Matrix:
[[32  9]
 [ 5 46]]

Evaluation on Test Set:
Accuracy: 0.8370

Classification Report:
              precision    recall  f1-score   support

     