# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Ignore all warnings so that cell output stays uncluttered.
# NOTE(review): the 'ignore' filter is global, so deprecation / FutureWarning
# messages from pandas and scikit-learn are hidden too — consider narrowing it
# while developing.
import warnings
warnings.filterwarnings('ignore')

# About Dataset

In [None]:
# age - age in years

# sex (1 = male; 0 = female)

# cp - chest pain type
# 0: Typical angina: chest pain related to decreased blood supply to the heart
# 1: Atypical angina: chest pain not related to heart
# 2: Non-anginal pain: typically esophageal spasms (non heart related)
# 3: Asymptomatic: chest pain not showing signs of disease

# trestbps - resting blood pressure (in mm Hg on admission to the hospital); anything above 130-140 is typically cause for concern

# chol - serum cholesterol in mg/dl
# serum = LDL + HDL + .2 * triglycerides
# above 200 is cause for concern

# fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
# '>126' mg/dL signals diabetes

# restecg - resting electrocardiographic results
# 0: Nothing to note
# 1: ST-T Wave abnormality
# can range from mild symptoms to severe problems
# signals non-normal heart beat
# 2: Possible or definite left ventricular hypertrophy
# Enlarged heart's main pumping chamber

# thalch - maximum heart rate achieved

# exang - exercise induced angina (1 = yes; 0 = no)

# oldpeak - ST depression induced by exercise relative to rest; looks at the stress of the heart during exercise (an unhealthy heart will stress more)

# slope - the slope of the peak exercise ST segment
# 0: Upsloping: better heart rate with exercise (uncommon)
# 1: Flatsloping: minimal change (typically healthy heart)
# 2: Downsloping: signs of unhealthy heart

# ca - number of major vessels (0-3) colored by fluoroscopy
# colored vessel means the doctor can see the blood passing through
# the more blood movement the better (no clots)

# thal - thallium stress result 3 = normal; 6 = fixed defect; 7 = reversable defect
# 1,3: normal
# 6: fixed defect: used to be defect but ok now
# 7: reversable defect: no proper blood movement when exercising

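# target (stored in the `num` column of this file) - have disease or not (0 = no; non-zero = yes — the rows shown also contain values above 1, e.g. 2) (= the predicted attribute)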

# 1. Data Exploration:

In [None]:
# Load the dataset from "heart_disease_uci.csv" (a relative path, resolved
# against the current working directory) into the DataFrame `df`.

df = pd.read_csv("heart_disease_uci.csv")
# Bare expression on the last line of the cell: the notebook renders the frame.
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [None]:
# Display the first 5 rows of the DataFrame (head() defaults to 5 rows).

df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [None]:
# Summarise the DataFrame: per-column dtype and non-null count, which shows
# the columns that contain missing values.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [None]:
# Split the frame by dtype: int64/float64 columns are treated as numerical,
# object columns as categorical. Despite the "_cols" suffix, both names hold
# DataFrames (column subsets of df), not lists of column names.

numerical_cols = df.select_dtypes(include=['int64', 'float64'])
categorical_cols = df.select_dtypes(include=['object'])

In [None]:
# Number of missing values in each categorical column.

categorical_cols.isna().sum()

sex          0
dataset      0
cp           0
fbs         90
restecg      2
exang       55
slope      309
thal       486
dtype: int64

In [None]:
# Number of missing values in each numerical column.

numerical_cols.isna().sum()

id            0
age           0
trestbps     59
chol         30
thalch       55
oldpeak      62
ca          611
num           0
dtype: int64

In [None]:
# Percentage (%) of missing values in each column:
# missing count divided by the number of rows, times 100.

df.isna().sum().div(df.shape[0]).mul(100)

id           0.000000
age          0.000000
sex          0.000000
dataset      0.000000
cp           0.000000
trestbps     6.413043
chol         3.260870
fbs          9.782609
restecg      0.217391
thalch       5.978261
exang        5.978261
oldpeak      6.739130
slope       33.586957
ca          66.413043
thal        52.826087
num          0.000000
dtype: float64

# 2. Handling Missing Data:

# Missing Value Imputing for categorical feature

In [None]:
# Replace the missing entries of each categorical column with that column's
# most frequent value (its mode).
#
# The result is assigned back to df[col] instead of calling
# df[col].fillna(..., inplace=True): the in-place form mutates an intermediate
# Series, which is deprecated in pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled, so the NaNs would silently survive.
for col in ['fbs', 'restecg', 'exang', 'slope', 'thal']:
    # mode() returns the most frequent value(s) in sorted order; [0] takes the first.
    df[col] = df[col].fillna(df[col].mode()[0])

# Missing Value Imputing for numerical feature

In [None]:
from sklearn.impute import KNNImputer

# Fill the missing numerical values with a KNN imputer using 3 neighbours.
# The imputer is fitted on, and applied to, the same five columns.
knn_columns = ['trestbps', 'chol', 'thalch', 'oldpeak', 'ca']
imputer = KNNImputer(n_neighbors=3)
df[knn_columns] = imputer.fit_transform(df[knn_columns])

In [None]:
# Re-check the per-column missing-value counts after imputation.

df.isna().sum()

id                 0
age                0
sex                0
dataset            0
cp                 0
trestbps           0
chol               0
fbs                0
restecg            0
thalch             0
exang              0
oldpeak            0
slope              0
ca                 0
thal               0
num                0
restecg_imputed    0
dtype: int64

# 3. Feature Creation

In [None]:
def age_group(age):
    """Return the age-band label for *age*.

    "<40" for ages below 40, "40-60" for ages from 40 to 60 inclusive,
    and ">60" for everything else.
    """
    if age < 40:
        return "<40"
    if age <= 60:
        # Reaching this point already rules out age < 40.
        return "40-60"
    return ">60"

In [None]:
# Label every row with its age band (see age_group) and display the result.
df["age_group"] = df["age"].map(age_group)
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,restecg_imputed,age_group
0,1,63,Male,Cleveland,typical angina,145.000000,233.0,True,lv hypertrophy,150.000000,False,2.300000,downsloping,0.000000,fixed defect,0,lv hypertrophy,>60
1,2,67,Male,Cleveland,asymptomatic,160.000000,286.0,False,lv hypertrophy,108.000000,True,1.500000,flat,3.000000,normal,2,lv hypertrophy,>60
2,3,67,Male,Cleveland,asymptomatic,120.000000,229.0,False,lv hypertrophy,129.000000,True,2.600000,flat,2.000000,reversable defect,1,lv hypertrophy,>60
3,4,37,Male,Cleveland,non-anginal,130.000000,250.0,False,normal,187.000000,False,3.500000,downsloping,0.000000,normal,0,normal,<40
4,5,41,Female,Cleveland,atypical angina,130.000000,204.0,False,lv hypertrophy,172.000000,False,1.400000,upsloping,0.000000,normal,0,lv hypertrophy,40-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.000000,333.0,True,st-t abnormality,154.000000,False,0.000000,flat,0.000000,normal,1,st-t abnormality,40-60
916,917,62,Male,VA Long Beach,typical angina,132.132404,139.0,False,st-t abnormality,137.545665,False,0.878788,flat,1.333333,normal,0,st-t abnormality,>60
917,918,55,Male,VA Long Beach,asymptomatic,122.000000,223.0,True,st-t abnormality,100.000000,False,0.000000,flat,0.333333,fixed defect,2,st-t abnormality,40-60
918,919,58,Male,VA Long Beach,asymptomatic,132.132404,385.0,True,lv hypertrophy,137.545665,False,0.878788,flat,1.000000,normal,0,lv hypertrophy,40-60


In [None]:
def chol_types(chol):
    """Return the cholesterol band for a serum cholesterol value.

    "Low" for values below 200, "Medium" for values from 200 up to (but not
    including) 240, and "High" for 240 and above.

    The bands are contiguous, so fractional values such as 239.5 (which can
    come out of KNN imputation) are "Medium" rather than falling through to
    "High". NaN compares False against every threshold and so yields "High".
    """
    if chol < 200:
        return "Low"
    if chol < 240:
        return "Medium"
    return "High"

In [2]:
# df["chol_types"] = df["chol"].apply(chol_types)
# df

In [None]:
# Binary feature 'IsRisk': 1 when chol is above 240 AND trestbps is above 140
# AND age is above 60 (all three must hold); otherwise 0.
df['IsRisk'] = (
    df['chol'].gt(240) & df['trestbps'].gt(140) & df['age'].gt(60)
).astype(int)
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,restecg_imputed,age_group,chol_types,IsRisk
0,1,63,Male,Cleveland,typical angina,145.000000,233.0,True,lv hypertrophy,150.000000,False,2.300000,downsloping,0.000000,fixed defect,0,lv hypertrophy,>60,Medium,0
1,2,67,Male,Cleveland,asymptomatic,160.000000,286.0,False,lv hypertrophy,108.000000,True,1.500000,flat,3.000000,normal,2,lv hypertrophy,>60,High,1
2,3,67,Male,Cleveland,asymptomatic,120.000000,229.0,False,lv hypertrophy,129.000000,True,2.600000,flat,2.000000,reversable defect,1,lv hypertrophy,>60,Medium,0
3,4,37,Male,Cleveland,non-anginal,130.000000,250.0,False,normal,187.000000,False,3.500000,downsloping,0.000000,normal,0,normal,<40,High,0
4,5,41,Female,Cleveland,atypical angina,130.000000,204.0,False,lv hypertrophy,172.000000,False,1.400000,upsloping,0.000000,normal,0,lv hypertrophy,40-60,Medium,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.000000,333.0,True,st-t abnormality,154.000000,False,0.000000,flat,0.000000,normal,1,st-t abnormality,40-60,High,0
916,917,62,Male,VA Long Beach,typical angina,132.132404,139.0,False,st-t abnormality,137.545665,False,0.878788,flat,1.333333,normal,0,st-t abnormality,>60,Low,0
917,918,55,Male,VA Long Beach,asymptomatic,122.000000,223.0,True,st-t abnormality,100.000000,False,0.000000,flat,0.333333,fixed defect,2,st-t abnormality,40-60,Medium,0
918,919,58,Male,VA Long Beach,asymptomatic,132.132404,385.0,True,lv hypertrophy,137.545665,False,0.878788,flat,1.000000,normal,0,lv hypertrophy,40-60,High,0


# 4. Feature Transformation:

In [None]:
# Encode sex, cp (chest pain type), thal, and age_group as integer codes.
from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column so each mapping can still be inspected or
# reversed later (label_encoders[col].classes_ / .inverse_transform).
# Refitting a single shared encoder would retain only the last column's classes.
# NOTE(review): LabelEncoder numbers classes in sorted order, so the age_group
# codes follow string order rather than age order — confirm this is acceptable
# for the downstream model.
label_encoders = {}
for col in ['sex', 'cp', 'thal', 'age_group']:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

In [None]:
## Normalize numerical features (chol, trestbps, thalch) using MinMaxScaler or StandardScaler.

# This step was done in a separate Jupyter notebook.

# 5. Feature Interaction:

In [None]:
# BP-Chol interaction: element-wise product of trestbps and chol.

df['BP-Chol_Interaction'] = df['trestbps'].mul(df['chol'])

In [None]:
# Exercise-induced angina risk: binary flag that is 1 when exang equals 1
# AND thalch is below 100, otherwise 0.

df['Exercise_Angina_Risk'] = (df['exang'].eq(1) & df['thalch'].lt(100)).astype(int)

# 6. Feature Selection:

In [None]:
# Perform feature selection by evaluating feature importance using a basic model
# (optional).

# Drop redundant or less useful features (e.g., columns with low variance or highly
# correlated with each other).