In [1]:
import pandas as pd

# نقرأ الملف
df = pd.read_csv("../data/heart_disease.csv")

# نعرض أول 5 صفوف
print(df.head())

# معلومات عن الأعمدة
print(df.info())


   id  age  trestbps   chol  thalch  oldpeak   ca  num  sex_Male  \
0   1   63     145.0  233.0   150.0      2.3  0.0    0      True   
1   2   67     160.0  286.0   108.0      1.5  3.0    2      True   
2   3   67     120.0  229.0   129.0      2.6  2.0    1      True   
3   4   37     130.0  250.0   187.0      3.5  0.0    0      True   
4   5   41     130.0  204.0   172.0      1.4  0.0    0     False   

   dataset_Hungary  ...  cp_non-anginal  cp_typical angina  fbs_True  \
0            False  ...           False               True      True   
1            False  ...           False              False     False   
2            False  ...           False              False     False   
3            False  ...            True              False     False   
4            False  ...           False              False     False   

   restecg_normal  restecg_st-t abnormality  exang_True  slope_flat  \
0           False                     False       False       False   
1           Fals

# 01 Data Preprocessing & Cleaning

**Objective:**  
- Load the Heart Disease dataset from the `data/` folder.  
- Clean the dataset by replacing `?` with NaN and converting columns to the proper data types.  
- Handle missing values using imputation (numeric → median, categorical → most frequent).  
- Encode categorical variables using One-Hot Encoding.  
- Standardize numerical features using `StandardScaler`.  
- Save the cleaned dataset for modeling.  

**Expected Output:**  
- A cleaned dataset saved as `data/heart_disease_cleaned.csv`.  
- A preprocessing pipeline saved as `models/preprocessor.pkl`.

**Project Folder Assumptions:**  
- Project root: `Heart_Disease_Project/`  
- Raw dataset: `Heart_Disease_Project/data/heart_disease.csv`  
- Notebook location: `Heart_Disease_Project/notebooks/01_data_preprocessing.ipynb`



In [2]:
# --- Import required libraries ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Display settings
pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 3)

print("✅ Libraries imported successfully")


✅ Libraries imported successfully


In [3]:
# --- Load the Heart Disease dataset ---
# Note: This assumes the notebook is inside "notebooks/" folder
# so we go one level up "../data/heart_disease.csv"

df = pd.read_csv("../data/heart_disease.csv")

# Preview first 5 rows
print("First 5 rows of the dataset:")
display(df.head())

# Dataset info (column types, non-null counts)
print("\nDataset Info:")
print(df.info())

# Basic statistics
print("\nStatistical Summary:")
display(df.describe())


First 5 rows of the dataset:


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num,sex_Male,dataset_Hungary,dataset_VA Long Beach,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,1,63,145.0,233.0,150.0,2.3,0.0,0,True,False,False,False,False,True,True,False,False,False,False,False,False,False
1,2,67,160.0,286.0,108.0,1.5,3.0,2,True,False,False,False,False,False,False,False,False,True,True,False,True,False
2,3,67,120.0,229.0,129.0,2.6,2.0,1,True,False,False,False,False,False,False,False,False,True,True,False,False,True
3,4,37,130.0,250.0,187.0,3.5,0.0,0,True,False,False,False,True,False,False,True,False,False,False,False,True,False
4,5,41,130.0,204.0,172.0,1.4,0.0,0,False,False,False,True,False,False,False,False,False,False,False,True,True,False



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        299 non-null    int64  
 1   age                       299 non-null    int64  
 2   trestbps                  299 non-null    float64
 3   chol                      299 non-null    float64
 4   thalch                    299 non-null    float64
 5   oldpeak                   299 non-null    float64
 6   ca                        299 non-null    float64
 7   num                       299 non-null    int64  
 8   sex_Male                  299 non-null    bool   
 9   dataset_Hungary           299 non-null    bool   
 10  dataset_VA Long Beach     299 non-null    bool   
 11  cp_atypical angina        299 non-null    bool   
 12  cp_non-anginal            299 non-null    bool   
 13  cp_typical angina         299 non-null    bool   


Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,ca,num
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,153.873,54.522,131.716,246.786,149.328,1.059,0.672,0.946
std,95.896,9.03,17.748,52.533,23.121,1.163,0.937,1.23
min,1.0,29.0,94.0,100.0,71.0,0.0,0.0,0.0
25%,75.5,48.0,120.0,211.0,132.5,0.0,0.0,0.0
50%,151.0,56.0,130.0,242.0,152.0,0.8,0.0,0.0
75%,227.5,61.0,140.0,275.5,165.5,1.6,1.0,2.0
max,749.0,77.0,200.0,564.0,202.0,6.2,3.0,4.0


In [4]:
# --- Handle missing values ---
# Replace "?" with NaN (if present in dataset)
df.replace("?", np.nan, inplace=True)

# Check number of missing values per column
print("Missing values per column (before imputation):")
print(df.isnull().sum())


Missing values per column (before imputation):
id                          0
age                         0
trestbps                    0
chol                        0
thalch                      0
oldpeak                     0
ca                          0
num                         0
sex_Male                    0
dataset_Hungary             0
dataset_VA Long Beach       0
cp_atypical angina          0
cp_non-anginal              0
cp_typical angina           0
fbs_True                    0
restecg_normal              0
restecg_st-t abnormality    0
exang_True                  0
slope_flat                  0
slope_upsloping             0
thal_normal                 0
thal_reversable defect      0
dtype: int64


In [6]:
from sklearn.impute import SimpleImputer

# Separate numerical and categorical columns
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns (before fix):", cat_cols)

# Convert bool columns to object so they can be treated as categorical
for col in cat_cols:
    if df[col].dtype == "bool":
        df[col] = df[col].astype("object")

# Update categorical columns list after conversion
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
print("Categorical columns (after fix):", cat_cols)

# Imputation for numeric columns
num_imputer = SimpleImputer(strategy="median")
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Imputation for categorical columns
if len(cat_cols) > 0:
    cat_imputer = SimpleImputer(strategy="most_frequent")
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Check again for missing values
print("\nMissing values per column (after imputation):")
print(df.isnull().sum())


Numeric columns: ['id', 'age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num']
Categorical columns (before fix): ['sex_Male', 'dataset_Hungary', 'dataset_VA Long Beach', 'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina', 'fbs_True', 'restecg_normal', 'restecg_st-t abnormality', 'exang_True', 'slope_flat', 'slope_upsloping', 'thal_normal', 'thal_reversable defect']
Categorical columns (after fix): ['sex_Male', 'dataset_Hungary', 'dataset_VA Long Beach', 'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina', 'fbs_True', 'restecg_normal', 'restecg_st-t abnormality', 'exang_True', 'slope_flat', 'slope_upsloping', 'thal_normal', 'thal_reversable defect']

Missing values per column (after imputation):
id                          0
age                         0
trestbps                    0
chol                        0
thalch                      0
oldpeak                     0
ca                          0
num                         0
sex_Male                    0
dat

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Separate features (X) and target (y)
X = df.drop(columns=["target"])
y = df["target"]

# One-Hot Encoding for categorical columns
if len(cat_cols) > 0:
    try:
        encoder = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop="first", handle_unknown="ignore")

    encoded = encoder.fit_transform(X[cat_cols])

    # If result is sparse, convert to dense
    if not isinstance(encoded, np.ndarray):
        encoded = encoded.toarray()

    # Get new column names
    encoded_cols = encoder.get_feature_names_out(cat_cols)

    # Build encoded DataFrame
    encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=X.index)

    # Drop old categorical columns and join encoded ones
    X = X.drop(columns=cat_cols).join(encoded_df)

# Add target back
df = X.copy()
df["target"] = y

print("✅ Encoding done successfully")
print("Shape after encoding:", df.shape)
display(df.head())


KeyError: "['target'] not found in axis"

In [11]:
from sklearn.preprocessing import StandardScaler

# Separate X and y again
X = df.drop(columns=["target"])
y = df["target"]

# Apply StandardScaler to all features (now all numeric)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Put back into DataFrame
df_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
df_scaled["target"] = y

df = df_scaled.copy()

print("✅ Standardization complete")
display(df.head())


KeyError: "['target'] not found in axis"