In [21]:
import pandas as pd
df=pd.read_csv('../../datasets/heart_disease_uci.csv')
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# -------------------------------
# 1. Load dataset
file_path = "../../datasets/heart_disease_uci.csv"
df = pd.read_csv(file_path)
print("Initial dataset shape:", df.shape)

# -------------------------------
# 2. Drop irrelevant columns
df = df.drop(['id','dataset'], axis=1)
df.drop(columns=['thal','ca'], inplace = True)

# -------------------------------
# 3. Handle missing values
# Fill numeric columns with median
df.loc[df['trestbps'] == 0, 'trestbps'] = np.nan
df.loc[df['chol'] == 0, 'chol'] = np.nan

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print('numeric: ', numeric_cols)
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())


# Fill categorical columns with mode
categorical_cols = df.select_dtypes(include=['object']).columns
print('categorical: ', categorical_cols)
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# -------------------------------
# 4. Encode categorical variables (one-hot encoding)
df = pd.get_dummies(df, drop_first=True)
bool_cols = df.select_dtypes(bool).columns
df[bool_cols] = df[bool_cols].astype(int)

# -------------------------------
# Add age groups
bins = [0, 40, 50, 60, 150]
labels = ['<40', '40-50', '50-60', '60+']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

# -------------------------------
# 5. Split features and target
X = df.drop('num', axis=1)
y = df['num']

# -------------------------------
# 6. Feature scaling
scaler = StandardScaler()
# Scale only numeric columns (excluding 'num')
numeric_cols = numeric_cols.drop('num')
X_scaled_part = scaler.fit_transform(X[numeric_cols])

# Replace the numeric columns in X with their scaled versions
X_scaled = X.copy()
X_scaled[numeric_cols] = X_scaled_part

# -------------------------------
# 7. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# Final dataset info
print("Original training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Initial dataset shape: (920, 16)
numeric:  Index(['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'num'], dtype='object')
categorical:  Index(['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope'], dtype='object')
Original training set shape: (736, 16) (736,)
Test set shape: (184, 16) (184,)


  df[col] = df[col].fillna(df[col].mode()[0])


In [23]:
X[['age_group']].value_counts()

age_group
50-60        382
40-50        224
60+          221
<40           93
Name: count, dtype: int64

In [25]:
import os

out_dir = "../../data/age_analysis"
os.makedirs(out_dir, exist_ok=True)

X_train_age = pd.DataFrame(X_train, columns=X.columns)
y_train_age = pd.Series(y_train, name='num') 
Xy_train_age = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
Xy_train_age.to_csv(os.path.join(out_dir, "Xy_train_age_analysis_with_groups.csv"), index=False)


X_train = pd.DataFrame(X_train, columns=X.columns).drop(columns=['age_group'])
y_train = pd.Series(y_train, name='num') 
Xy_train = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
Xy_train.to_csv(os.path.join(out_dir, "Xy_train_age_analysis.csv"), index=False)


X_test = pd.DataFrame(X_test, columns=X.columns)
y_test = pd.Series(y_test, name='num') 
Xy_test = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)
Xy_test.to_csv(os.path.join(out_dir, "Xy_test_age_analysis.csv"), index=False)

print("Saved CSVs to", out_dir)


Saved CSVs to ../../data/age_analysis


In [26]:
Xy_train

Unnamed: 0,age,trestbps,chol,fbs,thalch,exang,oldpeak,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,restecg_normal,restecg_st-t abnormality,slope_flat,slope_upsloping,num
0,-0.054235,-0.677285,-0.110571,0,-1.698756,0,-0.806886,1,1,0,0,1,0,1,0,3
1,2.175168,-0.119250,-0.110571,0,0.091821,0,-0.334061,1,0,1,0,1,0,1,0,0
2,-0.054235,-0.119250,-0.110571,0,-0.107132,1,0.138764,1,0,0,0,0,0,1,0,2
3,0.901224,-0.956303,-0.110571,0,-2.613939,1,-1.279711,1,0,0,0,1,0,1,0,1
4,0.901224,1.443249,-1.453380,0,0.012240,1,-0.806886,1,0,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,-0.903531,-1.235320,0.362794,0,-0.226504,0,0.327894,1,0,0,1,1,0,1,0,1
732,-0.478883,-0.119250,0.459399,0,1.007005,0,-0.806886,0,0,0,0,1,0,0,1,0
733,0.051927,-0.454071,0.401436,0,-1.141687,1,1.273544,1,0,0,0,0,0,1,0,1
734,0.476575,0.996821,0.729892,1,0.967214,0,0.138764,0,0,0,1,0,0,0,1,0
