In [3]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# The project-local `src` package lives one directory up; extend the module
# search path before importing from it.
sys.path.append('..')
from src.data.data_preprocessor import DataPreprocessor

# Initialize
# One shared DataPreprocessor instance for the whole notebook; the cells below
# call its handle_missing_values, scale_features, handle_class_imbalance and
# save_preprocessors methods on it.
preprocessor = DataPreprocessor()
print("✓ Preprocessor initialized")

✓ Preprocessor initialized


In [4]:
# Read the Cleveland heart-disease file with an explicit column list;
# '?' entries are parsed as NaN.
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]

df = pd.read_csv(
    '../data/raw/uci_heart/processed.cleveland.data',
    names=column_names,
    na_values='?',
)

# Collapse the label to binary: any value above 0 becomes 1, everything else 0.
df['target'] = df['target'].gt(0).astype(int)

# Quick summary: frame dimensions and total NaN count across all columns.
print(f"Dataset shape: {df.shape}")
print(f"Missing values: {df.isna().sum().sum()}")

Dataset shape: (303, 14)
Missing values: 6


In [5]:
# Fill the NaNs using the preprocessor's 'iterative' strategy.
# NOTE(review): `df` still contains the 'target' column at this point and has
# not been split into train/test yet, so the label and the future test rows
# may influence the imputed values — confirm how handle_missing_values treats
# them.
df_imputed = preprocessor.handle_missing_values(df, strategy='iterative')

# Sanity check: total NaN count over the whole imputed frame.
print(f"\nMissing values after imputation: {df_imputed.isna().sum().sum()}")


Handling missing values - Strategy: iterative
✓ Missing values handled

Missing values after imputation: 0


In [6]:
# Columns handed to the scaler; the other feature columns are not listed.
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Label vector and feature matrix (everything except the label column).
y = df_imputed['target']
X = df_imputed.drop(columns='target')

# NOTE(review): scaling is applied to every row before the train/test split
# in the next cell. If scale_features fits its scaler on the frame it is
# given, test rows contribute to the scaling statistics — confirm, and
# consider fitting on the training split only.
X_scaled = preprocessor.scale_features(X, numerical_cols)

print(f"✓ Features prepared: {X_scaled.shape}")


Scaling features...
✓ Features scaled
✓ Features prepared: (303, 13)


In [7]:
# Hold out 20% of the rows for testing, stratified on the label, with a fixed
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the size of each split, then the label counts in the training part.
print(f"Training set: {X_train.shape}", f"Test set: {X_test.shape}", sep="\n")
print("\nTraining class distribution:")
print(y_train.value_counts())

Training set: (242, 13)
Test set: (61, 13)

Training class distribution:
target
0    131
1    111
Name: count, dtype: int64


In [8]:
# Rebalance the classes using the training split only; X_test / y_test are
# not passed to the resampler.
X_train_balanced, y_train_balanced = preprocessor.handle_class_imbalance(X_train, y_train)

print(f"\nBalanced training set: {X_train_balanced.shape}")


Handling class imbalance...
Before: {0: np.int64(131), 1: np.int64(111)}
After: {1: np.int64(119), 0: np.int64(119)}
✓ Resampling complete

Balanced training set: (238, 13)


In [10]:
# Persist the prepared splits and the fitted preprocessors.

# Output locations, named once so the directory creation and the file paths
# below cannot drift apart.
processed_dir = '../data/processed'
models_dir = '../results/models'
for out_dir in (processed_dir, models_dir):
    os.makedirs(out_dir, exist_ok=True)

# The training arrays are the rebalanced ones; the test arrays are the
# untouched hold-out split.
# np.save writes array data only: for pandas objects such as y_test the
# index/column labels are not stored in the .npy file.
np.save(f'{processed_dir}/X_train.npy', X_train_balanced)
np.save(f'{processed_dir}/X_test.npy', X_test)
np.save(f'{processed_dir}/y_train.npy', y_train_balanced)
np.save(f'{processed_dir}/y_test.npy', y_test)

# Store the preprocessor state next to the model artifacts.
preprocessor.save_preprocessors(filepath=f'{models_dir}/preprocessors.pkl')

print("\n✓ All data saved successfully!")


✓ Preprocessors saved to ../results/models/preprocessors.pkl

✓ All data saved successfully!
