<a href="https://colab.research.google.com/github/adewale-codes/Banking/blob/main/feature_engineering_heart_disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Feature Engineering on Heart Disease Dataset
This notebook focuses on transforming and engineering features from the cleaned heart disease dataset to improve model performance.

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid')

##Load the Dataset

In [4]:
df = pd.read_csv('heart.csv')
print(df.shape)
df.head()

(918, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


##Drop Duplicates

In [5]:
df.drop_duplicates(inplace=True)
print(df.shape)

(918, 12)


##Create Age Group Feature

In [6]:
bins = [0, 39, 59, 100]
labels = ['Young', 'Middle-aged', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)
df[['Age', 'AgeGroup']].head()

Unnamed: 0,Age,AgeGroup
0,40,Middle-aged
1,49,Middle-aged
2,37,Young
3,48,Middle-aged
4,54,Middle-aged


##Group Cholesterol Levels

In [7]:
bins = [0, 200, 239, np.inf]
labels = ['Normal', 'Borderline', 'High']
df['CholesterolLevel'] = pd.cut(df['Cholesterol'], bins=bins, labels=labels)
df[['Cholesterol', 'CholesterolLevel']].head()

Unnamed: 0,Cholesterol,CholesterolLevel
0,289,High
1,180,Normal
2,283,High
3,214,Borderline
4,195,Normal


##Encode Categorical Features

In [8]:
df_encoded = pd.get_dummies(df, columns=['ChestPainType', 'RestingECG', 'ST_Slope', 'AgeGroup', 'CholesterolLevel'], drop_first=True)
df_encoded.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up,AgeGroup_Middle-aged,AgeGroup_Senior,CholesterolLevel_Borderline,CholesterolLevel_High
0,40,M,140,289,0,172,N,0.0,0,True,False,False,True,False,False,True,True,False,False,True
1,49,F,160,180,0,156,N,1.0,1,False,True,False,True,False,True,False,True,False,False,False
2,37,M,130,283,0,98,N,0.0,0,True,False,False,False,True,False,True,False,False,False,True
3,48,F,138,214,0,108,Y,1.5,1,False,False,False,True,False,True,False,True,False,True,False
4,54,M,150,195,0,122,N,0.0,0,False,True,False,True,False,False,True,True,False,False,False


##Label Encode Binary Features

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_encoded['Sex'] = le.fit_transform(df_encoded['Sex'])
df_encoded['ExerciseAngina'] = le.fit_transform(df_encoded['ExerciseAngina'])
df_encoded['FastingBS'] = df_encoded['FastingBS'].astype(int)

##Scale Numerical Features

In [10]:
scaler = MinMaxScaler()
num_cols = ['RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
df_encoded[num_cols] = scaler.fit_transform(df_encoded[num_cols])
df_encoded[num_cols].head()

Unnamed: 0,RestingBP,Cholesterol,MaxHR,Oldpeak
0,0.7,0.47927,0.788732,0.295455
1,0.8,0.298507,0.676056,0.409091
2,0.65,0.46932,0.267606,0.295455
3,0.69,0.354892,0.338028,0.465909
4,0.75,0.323383,0.43662,0.295455


##Final Engineered Dataset

In [11]:
print(df_encoded.shape)
df_encoded.head()

(918, 20)


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up,AgeGroup_Middle-aged,AgeGroup_Senior,CholesterolLevel_Borderline,CholesterolLevel_High
0,40,1,0.7,0.47927,0,0.788732,0,0.295455,0,True,False,False,True,False,False,True,True,False,False,True
1,49,0,0.8,0.298507,0,0.676056,0,0.409091,1,False,True,False,True,False,True,False,True,False,False,False
2,37,1,0.65,0.46932,0,0.267606,0,0.295455,0,True,False,False,False,True,False,True,False,False,False,True
3,48,0,0.69,0.354892,0,0.338028,1,0.465909,1,False,False,False,True,False,True,False,True,False,True,False
4,54,1,0.75,0.323383,0,0.43662,0,0.295455,0,False,True,False,True,False,False,True,True,False,False,False


##Summary
- Created new features (`AgeGroup`, `CholesterolLevel`)
- Applied encoding to categorical variables
- Scaled numerical features for better model behavior

Dataset is now ready for training machine learning models.