In [None]:
# Default parameters (will be overridden by Papermill)
# These values are the fallbacks used when nothing overrides them.
normalize = True  # when truthy, features are standardized with StandardScaler
data_dir = 'outputs/data'  # directory the X_/y_ train and test CSV files are read from
output_dir = 'outputs'  # parent directory; results are written to its 'processed' subfolder

In [None]:
import pandas as pd
import numpy as np
import os
import json
import pickle
from sklearn.preprocessing import StandardScaler

# Echo the run configuration so it shows up in the executed notebook's output
print("Preprocessing data with parameters:")
for param_name, param_value in (("normalize", normalize), ("data_dir", data_dir)):
    print(f"  - {param_name}: {param_value}")

In [None]:
# Load the train/test splits written by the previous step
X_train_path = os.path.join(data_dir, 'X_train.csv')
X_test_path = os.path.join(data_dir, 'X_test.csv')
y_train_path = os.path.join(data_dir, 'y_train.csv')
y_test_path = os.path.join(data_dir, 'y_test.csv')

X_train = pd.read_csv(X_train_path)
X_test = pd.read_csv(X_test_path)

# Targets are flattened from the loaded frame into a 1-D array
y_train_frame = pd.read_csv(y_train_path)
y_train = y_train_frame.values.ravel()
y_test_frame = pd.read_csv(y_test_path)
y_test = y_test_frame.values.ravel()

print("Data loaded successfully")
for split_name, split_frame in (("X_train", X_train), ("X_test", X_test)):
    print(f"{split_name} shape: {split_frame.shape}")

In [None]:
# Count missing entries in every split and report the totals
n_missing_X_train = X_train.isna().sum().sum()
n_missing_X_test = X_test.isna().sum().sum()
n_missing_y_train = pd.Series(y_train).isna().sum()
n_missing_y_test = pd.Series(y_test).isna().sum()

print(f"\nMissing values in X_train: {n_missing_X_train}")
print(f"Missing values in X_test: {n_missing_X_test}")
print(f"Missing values in y_train: {n_missing_y_train}")
print(f"Missing values in y_test: {n_missing_y_test}")

In [None]:
# Normalize features if enabled
if normalize:
    print(f"\nApplying StandardScaler normalization...")
    # Fit on the training split only, then apply the same fitted scaler to the
    # test split so both are transformed with the training statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"X_train mean after scaling: {X_train_scaled.mean(axis=0)[:3]}...") # Show first 3 features
    print(f"X_train std after scaling: {X_train_scaled.std(axis=0)[:3]}...")

    # Convert back to DataFrame, restoring the original column names
    X_train_processed = pd.DataFrame(X_train_scaled, columns=X_train.columns)
    X_test_processed = pd.DataFrame(X_test_scaled, columns=X_test.columns)
else:
    print(f"\nSkipping normalization")
    scaler = None
    # Keep exposing the raw arrays under the same names as the scaled branch.
    X_train_scaled = X_train.values
    X_test_scaled = X_test.values

    # Copy the frames instead of rebuilding them from `.values`: the array
    # round-trip collapses mixed column dtypes to one common dtype (e.g. int
    # columns become float or object), which would alter data that is meant
    # to pass through unchanged.
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()

In [None]:
# Save preprocessed data
processed_dir = os.path.join(output_dir, 'processed')
os.makedirs(processed_dir, exist_ok=True)

# Wrap the target arrays in frames so they are written under a 'target' header
y_train_df = pd.DataFrame({'target': y_train})
y_test_df = pd.DataFrame({'target': y_test})

# Each (frame, filename) pair is written to processed_dir without its index
csv_outputs = [
    (X_train_processed, 'X_train_processed.csv'),
    (X_test_processed, 'X_test_processed.csv'),
    (y_train_df, 'y_train.csv'),
    (y_test_df, 'y_test.csv'),
]
for output_frame, output_name in csv_outputs:
    output_frame.to_csv(os.path.join(processed_dir, output_name), index=False)

# Save scaler if used
if scaler is not None:
    scaler_path = os.path.join(processed_dir, 'scaler.pkl')
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)

# Save preprocessing metadata
# Use one explicit `is not None` test for both scaler fields (matching the
# checks used when saving and reporting the scaler) rather than relying on the
# object's truthiness, and take the type name from the object itself so the
# metadata cannot drift from the scaler actually in use.
scaler_used = scaler is not None
preprocess_metadata = {
    'normalize': normalize,
    'scaler_used': scaler_used,
    'scaler_type': type(scaler).__name__ if scaler_used else None
}

# Written next to the processed CSV files as indented JSON
with open(os.path.join(processed_dir, 'preprocessing_metadata.json'), 'w') as f:
    json.dump(preprocess_metadata, f, indent=2)

# Summarize the output location and the files written; scaler.pkl is listed
# only when a scaler exists
created_files = [
    "X_train_processed.csv",
    "X_test_processed.csv",
    "y_train.csv",
    "y_test.csv",
]
if scaler is not None:
    created_files.append("scaler.pkl")
created_files.append("preprocessing_metadata.json")

print(f"\nPreprocessed data saved to {processed_dir}")
print("Files created:")
for created_file in created_files:
    print(f"  - {created_file}")