In [1]:
# 📘 Step 1: Data Preprocessing

# 📦 Import required libraries
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo

# 📥 Step 1: Fetch dataset from UCI ML Repository (ID 45 = Heart Disease)
heart_disease = fetch_ucirepo(id=45)

# Split into features and target
X = heart_disease.data.features
y = heart_disease.data.targets

# Combine into one DataFrame
df = X.copy()
df['target'] = y

# 💾 Step 2: Save raw dataset
# Create the output folders up front: neither DataFrame.to_csv nor
# joblib.dump creates missing parent directories, so a fresh checkout
# without ../data or ../models would otherwise fail here or at the very end.
data_dir = Path('../data')
models_dir = Path('../models')
data_dir.mkdir(parents=True, exist_ok=True)
models_dir.mkdir(parents=True, exist_ok=True)

raw_path = data_dir / 'heart_disease.csv'
df.to_csv(raw_path, index=False)

# 🔄 Step 3: Reload and clean missing data
df = pd.read_csv(raw_path)
# numeric_only=True keeps the mean imputation working when a non-numeric
# column is present (pandas >= 2.0 raises TypeError otherwise); such columns
# are simply left untouched by this fill.
# NOTE(review): integer-coded categorical features are imputed with their
# mean as well - confirm that is intended rather than a mode fill.
df = df.fillna(df.mean(numeric_only=True))

# 🎯 Step 4: One-Hot Encoding for categorical columns
# Encode the features only, so the label column can never be expanded into
# dummy columns (which would make the 'target' lookups below fail).
# NOTE(review): get_dummies only expands object/category/string columns;
# numeric columns pass through unchanged - confirm this matches the dataset.
features = df.drop(columns='target')
df_encoded = pd.get_dummies(features, drop_first=True)
df_encoded['target'] = df['target']

# 📊 Step 5: Feature scaling
X = df_encoded.drop('target', axis=1)
y = df_encoded['target']

# NOTE(review): the scaler is fitted on every row; if a train/test split
# happens later, fit it on the training part only to avoid leakage.
scaler = StandardScaler()
# Reuse X's index so the label assignment below aligns row-for-row instead
# of relying on both frames happening to share a default RangeIndex.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Combine scaled features and target again
df_cleaned = X_scaled.copy()
df_cleaned['target'] = y

# 💾 Step 6: Save cleaned dataset
cleaned_path = data_dir / 'heart_cleaned.csv'
df_cleaned.to_csv(cleaned_path, index=False)

# 💾 Save the scaler for future use
joblib.dump(scaler, models_dir / 'scaler.pkl')

print("✅ Data preprocessing complete. Saved to '../data/heart_cleaned.csv'")




✅ Data preprocessing complete. Saved to '../data/heart_cleaned.csv'
