In [None]:
# In: notebooks/04_Fit_Preprocessors.ipynb
# Purpose: Load the TRAINING split data, fit data scalers (StandardScaler)
#          and imputers (SimpleImputer) based ONLY on this training data,
#          and save these fitted objects for later use in the Dataset class.

# Notebook 04: Fit Preprocessors

**Purpose:** This notebook prepares the data preprocessing components required for training the model. It performs the following critical steps:

1.  **Load Training Data:** Loads *only* the training data split (`cohort_train.parquet`) created in Notebook 03. This is crucial to prevent data leakage from validation/test sets into the preprocessing steps.
2.  **Identify Preprocessing Columns:** Analyzes the training data to determine which columns require imputation (due to missing values) and which numerical columns should be scaled.
3.  **Log Configuration:** Logs the identified column lists and the chosen preprocessing strategies (e.g., median imputation, standard scaling) to the Weights & Biases (W&B) configuration for this run. This configuration will be loaded by subsequent notebooks (like NB05, NB06) to ensure consistent feature handling.
4.  **Fit Preprocessors:** Initializes and fits the chosen imputer (`SimpleImputer`) and scaler (`StandardScaler`) using **only** the training data.
5.  **Save & Log Preprocessors:** Saves the *fitted* imputer and scaler objects locally as `.joblib` files and logs them as versioned artifacts to W&B. This allows later notebooks/scripts to load these exact fitted objects to apply the *same* imputation and scaling transformations consistently to training, validation, and test data.

**Input:**
* `cohort_train.parquet` (Output from NB 03)
* `config.json` (For base paths and W&B details)

**Output:**
* `simple_imputer_median.joblib` (Saved locally and as W&B artifact)
* `standard_scaler.joblib` (Saved locally and as W&B artifact)
* Updated W&B Run Configuration (Includes `preprocess` and `features` lists)

In [None]:
# --- Import Libraries ---
import pandas as pd
import numpy as np
import wandb
import json
from pathlib import Path
import time
import os
import joblib # For saving sklearn objects
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# --- Config Loading ---
# Reads ../config.json, derives the input/output paths used by the following
# cells, creates this notebook's output directory, and declares the planned
# feature lists. Any failure is reported and then re-raised.
print("--- Loading Configuration ---")
CONFIG_PATH = Path('../config.json') # Path relative to the notebook location
try:
    PROJECT_ROOT = CONFIG_PATH.parent.resolve()
    print(f"Project Root detected as: {PROJECT_ROOT}")
    with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
        config = json.load(f)
    print("Configuration loaded successfully.")

    # Define key variables from config
    OUTPUT_DIR_BASE = PROJECT_ROOT / config['data']['output_dir_base']
    WANDB_PROJECT = config['wandb']['project_name']
    WANDB_ENTITY = config['wandb'].get('entity', None)  # optional key; None when absent

    # Define input path (output from Notebook 03)
    NB03_OUTPUT_DIR = OUTPUT_DIR_BASE / "03_Feature_Engineering_Splitting"
    TRAIN_DATA_PATH = NB03_OUTPUT_DIR / "cohort_train.parquet" # Load ONLY training data

    # Define specific output dir for this notebook's results (preprocessors) and create it
    NOTEBOOK_NAME = "04_Fit_Preprocessors"
    output_dir = OUTPUT_DIR_BASE / NOTEBOOK_NAME
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Fitted preprocessor objects will be saved to: {output_dir}")

    # Planned feature lists are hard-coded here (they are NOT read from the config).
    # Whether each column is actually present in the training data is checked
    # when the imputation/scaling column lists are built.
    planned_time_varying = ['Age', 'MMSE', 'nWBV', 'Days_from_Baseline', 'Time_since_Last_Visit_Days']
    planned_static = ['M/F', 'EDUC', 'SES', 'Baseline_CDR', 'Baseline_MMSE', 'eTIV', 'ASF']


except Exception as e:
    print(f"Error loading config or setting up paths: {e}")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down (discarding all state) and hides the traceback, whereas
    # re-raising stops "Run All" at this cell with the full error context.
    raise

In [None]:
# --- Initialize W&B Run ---
# Starts a W&B run for this notebook. W&B is treated as optional: when the
# initialisation fails, `run` stays None and later cells skip their logging.
print("\n--- Initializing Weights & Biases Run ---")
run = None # Initialize run to None
try:
    run = wandb.init(
        project=WANDB_PROJECT,
        entity=WANDB_ENTITY,
        job_type="fit-preprocessors", # New job type
        name="{}-run-{}".format(NOTEBOOK_NAME, time.strftime('%Y%m%d-%H%M')),
        # Key config choices for this job; the imputation/scaling strategies
        # are added to the run config by a later cell.
        config=dict(
            train_data_artifact="cohort-split-train:latest", # Link to input artifact
            train_data_path=str(TRAIN_DATA_PATH),
            output_dir=str(output_dir),
        ),
    )
    print(f"W&B run '{run.name}' initialized successfully. View at: {run.url}")
except Exception as e:
    print(f"Error initializing W&B: {e}")
    print("Proceeding without W&B logging.")

## Load Training Data Split

Load the training dataset (`cohort_train.parquet`) generated in the previous notebook (NB 03). All subsequent fitting steps will use **only** this data.

In [None]:
# Load the training split produced by Notebook 03 into `train_df`.
# On any failure the problem is reported, the W&B run (if one is active) is
# closed as failed, and the exception is re-raised so execution stops here.
print(f"\n--- Loading TRAINING Data from: {TRAIN_DATA_PATH} ---")
try:
    if not TRAIN_DATA_PATH.is_file():
        raise FileNotFoundError(f"Training data file not found at {TRAIN_DATA_PATH}.")
    train_df = pd.read_parquet(TRAIN_DATA_PATH)
    print(f"Training data loaded successfully. Shape: {train_df.shape}")
    if run: run.log({'fit_preprocessors/input_train_rows': len(train_df)})

except Exception as e:
    # Keep the two original messages: a short one for a missing file, a
    # generic one for everything else.
    if isinstance(e, FileNotFoundError):
        print(f"Error: {e}")
    else:
        print(f"An error occurred loading the training data: {e}")
    # A non-zero exit code marks the run as failed in W&B instead of finished.
    if run: run.finish(exit_code=1)
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down and hides the traceback.
    raise

## Identify Features for Preprocessing

Analyze the loaded training data (`train_df`) to:
1.  Identify columns with missing values that require imputation (e.g., SES, MMSE, nWBV).
2.  Identify numerical columns suitable for scaling (excluding identifiers, targets, and potentially already encoded categoricals).

Define the strategies to be used (e.g., median imputation, standard scaling) and log these choices to W&B config.

In [None]:
print("\n--- Defining Features for Imputation & Scaling ---")

# Columns actually present in the loaded training frame
available_cols = train_df.columns.tolist()

# Per-column count of missing values, measured on train_df only
missing_in_train = train_df.isnull().sum()
print("\nMissing values in Training Set:")
print(missing_in_train[missing_in_train > 0])

# Imputation candidates (from previous EDA and plan), listed in the order in
# which they are checked. A candidate is selected only when it is present in
# the training data AND has at least one missing value there.
imputation_cols = [
    col for col in ('SES', 'MMSE', 'nWBV', 'EDUC', 'Baseline_MMSE')
    if col in available_cols and missing_in_train.get(col, 0) > 0
]

print(f"\nColumns selected for imputation: {imputation_cols}")

# Scaling candidates: planned time-varying + planned static features, kept only
# when present and numeric. 'M/F' (sex) is always excluded, even if it happens
# to be numerically coded.
potential_scaling_cols = planned_time_varying + planned_static
scaling_cols = [
    col for col in potential_scaling_cols
    if col != 'M/F'
    and col in available_cols
    and pd.api.types.is_numeric_dtype(train_df[col])
]
# Safety net: the target base column is never scaled here (target handled separately)
if 'CDR' in scaling_cols: scaling_cols.remove('CDR')

print(f"Columns selected for scaling: {scaling_cols}")

print("\n--- Finalizing and Logging Feature Lists for Model Input ---")

# Final lists = columns intended for the model AFTER preprocessing.
# Time-varying: the planned time-varying columns that are being scaled.
final_time_varying = [col for col in planned_time_varying if col in scaling_cols]

# Static: the planned static columns that are being scaled ...
final_static_numeric = [col for col in planned_static if col in scaling_cols]
# ... plus an encoded sex column when 'M/F' is present. The name 'M/F_encoded'
# is a placeholder for a column that is expected to be created later.
final_static_categorical = ['M/F_encoded'] if 'M/F' in available_cols else []

# Baseline CDR is included even when it is not among the scaled columns
if 'Baseline_CDR' in available_cols and 'Baseline_CDR' not in scaling_cols:
    final_static_numeric.append('Baseline_CDR')

# De-duplicate and sort for a stable order
final_static = sorted(set(final_static_numeric + final_static_categorical))

print(f"Final Time-Varying Features planned: {final_time_varying}")
print(f"Final Static Features planned: {final_static}")

if run:
    # Record the preprocessing choices and the final feature lists in the run config
    wandb.config.update(
        {
            'preprocess': {
                'imputation_cols': imputation_cols,
                'scaling_cols': scaling_cols,
                'imputation_strategy': 'median',
                'scaling_strategy': 'StandardScaler',
            },
            'features': {
                'time_varying': final_time_varying,
                'static': final_static,
            },
        },
        allow_val_change=True,
    )
    print("Final preprocess and features configuration logged to W&B.")

## Fit and Save Imputer(s)

Based on the columns identified above, initialize and fit the chosen imputer (e.g., `SimpleImputer(strategy='median')`) using **only** the training data. Save the *fitted* imputer object locally (e.g., using `joblib`) and log it as a versioned artifact to W&B. This ensures the exact imputation learned from the training data can be applied consistently later.

In [None]:
print("\n--- Fitting and Saving Imputer(s) ---")

# Destination of the fitted imputer inside this notebook's output directory
imputer_path = output_dir / "simple_imputer_median.joblib"
# Stays None when there is nothing to impute or when fitting/saving fails
imputer = None

if not imputation_cols:
    print("No columns identified for imputation based on missing values in training set.")
else:
    try:
        print(f"Fitting Median Imputer for columns: {imputation_cols}")
        # Median strategy, fitted on the training split only
        imputer = SimpleImputer(strategy='median').fit(train_df[imputation_cols])
        print("Imputer fitted successfully.")

        # Persist the fitted object locally
        joblib.dump(imputer, imputer_path)
        print(f"Fitted imputer saved locally to: {imputer_path}")

        # Upload the saved file as a W&B artifact when a run is active
        if run:
            print("Logging imputer as W&B artifact...")
            artifact_name = 'simple_imputer_median'
            artifact_type = 'preprocessor'
            description = f"SimpleImputer(strategy='median') fitted on training data columns: {imputation_cols}"
            imputer_artifact = wandb.Artifact(
                artifact_name,
                type=artifact_type,
                description=description,
            )
            imputer_artifact.add_file(str(imputer_path))
            run.log_artifact(imputer_artifact)
            print("Imputer artifact logged.")

    except Exception as e:
        print(f"Error fitting or saving imputer: {e}")
        # Do not leave a half-processed imputer in the namespace
        imputer = None

## Fit and Save Scaler(s)

Initialize and fit the chosen scaler (e.g., `StandardScaler`) on the specified numerical columns using **only** the training data (potentially after imputation if the same columns need both). Save the *fitted* scaler object locally and log it as a versioned artifact to W&B. This captures the mean and standard deviation from the training data for consistent scaling later.

In [None]:
print("\n--- Fitting and Saving Scaler(s) ---")

# Stays None when there is nothing to scale or when fitting/saving fails
scaler = None
scaler_path = output_dir / "standard_scaler.joblib"

if scaling_cols:
    try:
        print(f"Fitting StandardScaler for columns: {scaling_cols}")

        # Fit on the TRAINING data only.
        scaler_fit_df = train_df[scaling_cols]

        # Columns that are both imputed and scaled get their missing values
        # filled with the fitted training medians BEFORE the scaler is fitted.
        # The imputer's medians are in raw (unscaled) units, so imputation has
        # to precede scaling when the objects are applied; fitting the scaler
        # on the imputed values keeps its mean/std consistent with that order.
        pre_imputed_cols = []
        if imputer is not None:
            fitted_names = getattr(imputer, "feature_names_in_", imputation_cols)
            fill_values = {
                col: median
                for col, median in zip(fitted_names, imputer.statistics_)
                # An all-missing training column has a NaN statistic; skip it.
                if col in scaling_cols and not pd.isna(median)
            }
            if fill_values:
                pre_imputed_cols = list(fill_values)
                # Cast to float first so a fractional median can always be stored.
                scaler_fit_df = (
                    scaler_fit_df
                    .astype({col: "float64" for col in pre_imputed_cols})
                    .fillna(value=fill_values)
                )
                print(f"Applied fitted median imputation before scaling for: {pre_imputed_cols}")

        # Any remaining NaNs (columns not covered by the imputer) are ignored
        # by StandardScaler when computing the statistics.
        scaler = StandardScaler()
        scaler.fit(scaler_fit_df)
        print("Scaler fitted successfully.")

        # Save the fitted scaler
        joblib.dump(scaler, scaler_path)
        print(f"Fitted scaler saved locally to: {scaler_path}")

        # Log artifact to W&B
        if run:
            print("Logging scaler as W&B artifact...")
            artifact_name = 'standard_scaler'
            artifact_type = 'preprocessor'
            description = f"StandardScaler fitted on training data columns: {scaling_cols}"
            if pre_imputed_cols:
                # Record the provenance of the statistics in the artifact description
                description += f" (after median imputation of: {pre_imputed_cols})"
            scaler_artifact = wandb.Artifact(artifact_name, type=artifact_type, description=description)
            scaler_artifact.add_file(str(scaler_path))
            run.log_artifact(scaler_artifact)
            print("Scaler artifact logged.")

    except Exception as e:
        print(f"Error fitting or saving scaler: {e}")
        scaler = None # Ensure scaler is None if saving failed
else:
    print("No columns identified for scaling.")

## Finalize Run

Finish the Weights & Biases run associated with fitting and saving the preprocessors. The saved `.joblib` files and W&B artifacts are now ready for use in the data loading pipeline.

In [None]:
# --- Finish W&B Run ---
# Close the W&B run if one was started; otherwise just report that there is none.
print("\n--- Preprocessor Fitting complete. Finishing W&B run. ---")
if not run:
    print("No active W&B run to finish.")
else:
    run.finish()
    print("W&B run finished.")

print("\nScript execution finished.")