In [7]:
%cd /Users/anuranjani/Desktop/my_projects/stroke-prediction-model/stroke-prediction-project/src/data_processing
!python cleaning.py

/Users/anuranjani/Desktop/my_projects/stroke-prediction-model/stroke-prediction-project/src/data_processing
2025-04-13 01:45:04 - INFO: Loading dataset from /Users/anuranjani/Desktop/my_projects/stroke-prediction-model/stroke-prediction-project/data/raw/healthcare-dataset-stroke-data.csv
2025-04-13 01:45:04 - INFO: Dataset shape: (5110, 12)
2025-04-13 01:45:04 - INFO: Missing values:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
2025-04-13 01:45:04 - INFO: Processed data saved at /Users/anuranjani/Desktop/my_projects/stroke-prediction-model/stroke-prediction-project/data/processed/cleaned_dataset.csv
2025-04-13 01:45:04 - INFO: Processed dataset shape: (5110, 16)
2025-04-13 01:45:04 - INFO: Feature statistics saved at /User

In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# For the data scaling and encoding of the categorical features in dataset:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [14]:
# Project root = parent of the directory the kernel is currently running in.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Cleaned dataset under <project root>/data/processed.
DATA_PATH = os.path.join(BASE_DIR, 'data', 'processed', 'cleaned_dataset.csv')

In [15]:
# Build model-ready train/test matrices:
#   split -> one-hot encode -> scale -> SMOTE (train only) -> wrap in DataFrames.
# The encoder, scaler and SMOTE are all fitted on training rows only, so no
# information from the test rows leaks into preprocessing.

# 1. Load and split
df = pd.read_csv(DATA_PATH)
X = df.drop('stroke', axis=1)
y = df['stroke']

# stratify=y keeps the stroke / no-stroke proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# 2. Identify categorical and numerical
# The raw dataset's `id` column is a row identifier, not a measurement. Left in
# the numeric list it would be scaled and would influence SMOTE's
# nearest-neighbour distances, so it is excluded from the model features.
# It is still present in X / X_test for traceability.
id_cols = ['id']
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = [
    col
    for col in X.select_dtypes(include=['int64', 'float64']).columns
    if col not in id_cols
]

# 3. Encode categorical (fit only on train!)
# handle_unknown='ignore' makes categories that appear only in the test split
# encode as all-zeros instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train[categorical_cols])
X_test_cat = encoder.transform(X_test[categorical_cols])

# 4. Scale numeric features (statistics come from the training split only)
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numerical_cols])
X_test_num = scaler.transform(X_test[numerical_cols])

# 5. Combine numeric + encoded categorical (numeric columns first)
X_train_processed = np.hstack((X_train_num, X_train_cat))
X_test_processed = np.hstack((X_test_num, X_test_cat))

# Column names in the same order as the hstack above.
ohe_feature_names = encoder.get_feature_names_out(categorical_cols)
full_feature_names = numerical_cols + list(ohe_feature_names)

# 6. SMOTE on train only — the test split keeps its original class balance.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# Wrap train/test in DataFrames with named columns.
X_train_resampled = pd.DataFrame(X_train_resampled, columns=full_feature_names)
y_train_resampled = pd.Series(y_train_resampled, name='stroke')

# Reuse X_test's index so the processed test features stay row-aligned with
# y_test, which still carries the original row labels from the split.
X_test_processed = pd.DataFrame(
    X_test_processed, columns=full_feature_names, index=X_test.index
)
y_test = pd.Series(y_test, name='stroke')

# Cheap invariants: features and labels must line up row-for-row.
assert len(X_train_resampled) == len(y_train_resampled)
assert X_test_processed.index.equals(y_test.index)


In [18]:
# Persist the SMOTE-balanced training split and the held-out test split as
# CSV files under <BASE_DIR>/data/processed (row indices are not written).
X_train_resampled_df = pd.DataFrame(X_train_resampled)
y_train_resampled_df = pd.DataFrame(y_train_resampled)
X_test_processed_df = pd.DataFrame(X_test_processed)

_processed_dir = os.path.join(BASE_DIR, 'data', 'processed')
_outputs = {
    'X_train_resampled.csv': X_train_resampled_df,
    'y_train_resampled.csv': y_train_resampled_df,
    'X_test_processed.csv': X_test_processed_df,
    'y_test.csv': y_test,
    # X_test is the test split as it came out of train_test_split,
    # i.e. before encoding and scaling.
    'X_test.csv': X_test,
}
for _filename, _table in _outputs.items():
    _table.to_csv(os.path.join(_processed_dir, _filename), index=False)



In [17]:
# Share of stroke cases (in percent) within each age_group,
# rounded to two decimals for display.
stroke_risk_by_age_group = (
    df.groupby('age_group')['stroke']
    .mean()
    .mul(100)
    .round(2)
)

print(stroke_risk_by_age_group)

age_group
Elderly        13.57
Middle-Aged     1.05
Senior          4.97
Young           0.13
Name: stroke, dtype: float64
