In [22]:
#heart
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Read the raw heart-disease records.
heart = pd.read_csv("Datasets/heart.csv")

# Binary-encode Sex (M -> 1, F -> 0), then expand the multi-level categorical columns
# into indicator columns; drop_first=True drops the first level of each.
heart["Sex"] = heart["Sex"].map({"M": 1, "F": 0})
heart = pd.get_dummies(
    heart,
    columns=["ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"],
    drop_first=True,
)

# Cast any boolean columns to int.
bool_cols = heart.select_dtypes(include='bool').columns
heart = heart.astype({col: int for col in bool_cols})

# Standardise the numeric features with a scaler fitted on this frame.
numeric_cols = ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]
scaler = StandardScaler()
heart[numeric_cols] = scaler.fit_transform(heart[numeric_cols])

# Preview the processed frame.
heart.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.43314,1,0.410909,0.82507,0,1.382928,-0.832432,0,1,0,0,1,0,0,0,1
1,-0.478484,0,1.491752,-0.171961,0,0.754157,0.105664,1,0,1,0,1,0,0,1,0
2,-1.751359,1,-0.129513,0.770188,0,-1.525138,-0.832432,0,1,0,0,0,1,0,0,1
3,-0.584556,0,0.302825,0.13904,0,-1.132156,0.574711,1,0,0,0,1,0,1,1,0
4,0.051881,1,0.951331,-0.034755,0,-0.581981,-0.832432,0,0,1,0,1,0,0,0,1


In [2]:
#hepatitis
import numpy as np

# Hepatitis preprocessing: load the raw CSV, coerce columns to numeric where possible,
# median-impute missing numeric values, encode Sex and Category, and standardise the
# remaining columns.
hep = pd.read_csv("Datasets/hepatitis.csv")

# Clean column names: strip embedded double quotes and surrounding whitespace, then
# drop the 'Unnamed: 0' column if the file has one.
hep.columns = hep.columns.str.replace('"', '').str.strip()
if 'Unnamed: 0' in hep.columns:
    hep = hep.drop(columns=['Unnamed: 0'])

# Handle missing data marked as the string 'NA': replace with np.nan, convert every
# column that parses cleanly to a numeric dtype, then fill gaps with the column median.
hep = hep.replace('NA', np.nan)
for col in hep.select_dtypes(include=['object', 'float', 'int']).columns:
    try:
        hep[col] = pd.to_numeric(hep[col])
    except (ValueError, TypeError):
        # The column holds values that are not numbers; leave it unchanged for the
        # explicit encoding steps below. Only conversion failures are tolerated here,
        # so any other (unexpected) error still surfaces instead of being swallowed.
        continue
numeric_cols = hep.select_dtypes(include=['float', 'int']).columns
hep[numeric_cols] = hep[numeric_cols].fillna(hep[numeric_cols].median())

# Encode Sex: m=1, f=0 (any other value becomes NaN via .map)
hep['Sex'] = hep['Sex'].map({'m': 1, 'f': 0})

# Label encode 'Category' into integer class codes
le = LabelEncoder()
hep['Category'] = le.fit_transform(hep['Category'])

# Feature scaling: every column except Category and Sex is standardised
# (this includes Age as well as the lab values).
scale_cols = [c for c in hep.columns if c not in ['Category', 'Sex']]
scaler = StandardScaler()
hep[scale_cols] = scaler.fit_transform(hep[scale_cols])

# View processed data
hep.head()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0,-1.533616,1,-0.540739,-0.613566,-0.815675,-0.383693,-0.198236,-0.574734,-1.903634,0.49707,-0.502286,-0.564419
1,0,-1.533616,1,-0.540739,0.081055,-0.410629,-0.305057,-0.381375,1.349161,-0.505077,-0.14659,-0.438203,0.826054
2,0,-1.533616,1,0.914755,0.252759,0.305084,0.538767,-0.269457,0.291926,-0.148756,0.094783,-0.115957,1.345164
3,0,-1.533616,1,0.273645,-0.633077,0.084865,-0.368571,0.381706,-0.393234,-0.558525,-0.025903,-0.104971,0.677737
4,0,-1.533616,1,-0.419448,0.229345,0.163515,-0.302033,-0.091404,0.432588,-0.932661,-0.106361,-0.176378,-0.620038


In [21]:
#stroke
# Load the stroke records and discard the identifier column.
stroke = pd.read_csv("Datasets/stroke.csv").drop(columns=['id'])

# BMI: treat the literal 'N/A' as missing, coerce to float, and impute with the median.
bmi_as_float = stroke['bmi'].replace('N/A', np.nan).astype(float)
stroke['bmi'] = bmi_as_float.fillna(bmi_as_float.median())

# Gender: indicator columns with drop_first=True, so the first level is dropped and
# acts as the baseline; the indicators are appended after the remaining columns.
gender_ohe = pd.get_dummies(stroke['gender'], prefix='gender', drop_first=True)
stroke_without_gender = stroke.drop('gender', axis=1)
stroke = pd.concat([stroke_without_gender, gender_ohe], axis=1)

# ever_married: Yes -> 1, No -> 0.
married_codes = {'Yes': 1, 'No': 0}
stroke['ever_married'] = stroke['ever_married'].map(married_codes)

# work_type, Residence_type and smoking_status: full one-hot encoding (no level dropped).
cat_cols = ['work_type', 'Residence_type', 'smoking_status']
stroke = pd.get_dummies(stroke, columns=cat_cols, drop_first=False)

# Cast any boolean columns to int.
bool_cols = stroke.select_dtypes(include='bool').columns
stroke = stroke.astype({col: int for col in bool_cols})

# Standardise age, avg_glucose_level and bmi with a scaler fitted on this frame.
num_cols = ['age', 'avg_glucose_level', 'bmi']
scaler = StandardScaler()
stroke[num_cols] = scaler.fit_transform(stroke[num_cols])

# Preview the processed frame.
stroke.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.051434,0,1,1,2.706375,1.005086,1,1,0,0,0,1,0,0,0,1,0,1,0,0
1,0.78607,0,0,1,2.121559,-0.098981,1,0,0,0,0,0,1,0,1,0,0,0,1,0
2,1.62639,0,1,1,-0.005028,0.472536,1,1,0,0,0,1,0,0,1,0,0,0,1,0
3,0.255342,0,0,1,1.437358,0.719327,1,0,0,0,0,1,0,0,0,1,0,0,0,1
4,1.582163,1,0,1,1.501184,-0.631531,1,0,0,0,0,0,1,0,1,0,0,0,1,0


In [23]:
# Write each processed frame to its *_clean.csv file in Datasets/, without the index.
for frame, target in (
    (stroke, "Datasets/stroke_clean.csv"),
    (heart, "Datasets/heart_clean.csv"),
    (hep, "Datasets/hepatitis_clean.csv"),
):
    frame.to_csv(target, index=False)
