In [7]:
# Switch the kernel's working directory to the data-processing source folder.
# BASE_DIR is later derived from os.getcwd(), so that computation depends on
# this cell having been run first.
# NOTE(review): this absolute path is specific to one machine — consider a
# project-relative path so the notebook can run elsewhere.
%cd /Users/anuranjani/Desktop/my_projects/stroke-prediction-model/stroke-prediction-project/src/data_processing
# Run the cleaning script in a subprocess. Per its log output it writes
# data/processed/cleaned_dataset.csv, the file this notebook loads afterwards.
!python cleaning.py

/Users/anuranjani/Desktop/my_projects/stroke-prediction-model/stroke-prediction-project/src/data_processing
2025-04-13 01:45:04 - INFO: Loading dataset from /Users/anuranjani/Desktop/my_projects/stroke-prediction-model/stroke-prediction-project/data/raw/healthcare-dataset-stroke-data.csv
2025-04-13 01:45:04 - INFO: Dataset shape: (5110, 12)
2025-04-13 01:45:04 - INFO: Missing values:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64
2025-04-13 01:45:04 - INFO: Processed data saved at /Users/anuranjani/Desktop/my_projects/stroke-prediction-model/stroke-prediction-project/data/processed/cleaned_dataset.csv
2025-04-13 01:45:04 - INFO: Processed dataset shape: (5110, 16)
2025-04-13 01:45:04 - INFO: Feature statistics saved at /User

In [32]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# For the data scaling and encoding of the categorical features in dataset:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [33]:
# Project root = two directory levels above the kernel's current working
# directory; the cleaned dataset is located beneath it in data/processed.
BASE_DIR = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)
DATA_PATH = os.path.join(
    BASE_DIR,
    'data',
    'processed',
    'cleaned_dataset.csv',
)

In [None]:
# 1. Load the cleaned dataset
df = pd.read_csv(DATA_PATH)

# 2. Separate features and target
# The raw dataset carries an `id` column (presumably a row identifier, not a
# predictor). If it is still present after cleaning, drop it so it is not
# scaled and interpolated by SMOTE as if it were a feature; errors='ignore'
# makes this a no-op when the column is absent.
X = df.drop(columns='stroke').drop(columns='id', errors='ignore')
y = df['stroke']

# 3. Split into train/test before any preprocessing!
# stratify=y keeps the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# 4. Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Columns whose dtype matches neither filter (e.g. bool) would otherwise be
# dropped from the feature matrix without any notice — surface them.
selected_cols = set(categorical_cols) | set(numerical_cols)
unassigned_cols = [col for col in X.columns if col not in selected_cols]
if unassigned_cols:
    print("Warning: columns excluded from preprocessing:", unassigned_cols)

# 5. Fit encoders on training data only
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising when transforming the test set.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = encoder.fit_transform(X_train[categorical_cols])
X_test_cat = encoder.transform(X_test[categorical_cols])

# 6. Scale numerical features (statistics come from the training split only)
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numerical_cols])
X_test_num = scaler.transform(X_test[numerical_cols])

# 7. Combine processed categorical + numerical features
# Column order of the result: scaled numerical columns first, one-hot columns after.
X_train_processed = np.hstack((X_train_num, X_train_cat))
X_test_processed = np.hstack((X_test_num, X_test_cat))

# 8. Apply SMOTE only on the training data
# NOTE(review): plain SMOTE interpolates the one-hot columns too, so synthetic
# rows can hold fractional category indicators — consider SMOTENC if that matters.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# 9. Print the class distributions
print("Before SMOTE:", Counter(y_train))
print("After SMOTE :", Counter(y_train_resampled))
print("Test Set    :", Counter(y_test))

Before SMOTE: Counter({0: 3403, 1: 174})
After SMOTE : Counter({0: 3403, 1: 3403})
Test Set    : Counter({0: 1458, 1: 75})


In [43]:
# Saving the processed data after SMOTE to CSV
# Rebuild the column labels so the saved matrices are self-describing instead
# of carrying anonymous integer headers. The preprocessing cell stacks the
# scaled numerical columns first and the one-hot columns after them, so the
# names are concatenated in that same order.
feature_names = numerical_cols + encoder.get_feature_names_out(categorical_cols).tolist()
PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')

X_train_resampled_df = pd.DataFrame(X_train_resampled, columns=feature_names)
y_train_resampled_df = pd.DataFrame(y_train_resampled)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=feature_names)

# index=False: the row index carries no information after the split/resampling.
X_train_resampled_df.to_csv(os.path.join(PROCESSED_DIR, 'X_train_resampled.csv'), index=False)
y_train_resampled_df.to_csv(os.path.join(PROCESSED_DIR, 'y_train_resampled.csv'), index=False)
X_test_processed_df.to_csv(os.path.join(PROCESSED_DIR, 'X_test_processed.csv'), index=False)
y_test.to_csv(os.path.join(PROCESSED_DIR, 'y_test.csv'), index=False)

