## Import all the required packages in this cell

In [None]:
# ==============================================================================
# STARTER CODE: D-321 Data Preprocessing Evaluation Assignment
# NOTE: This code assumes the 'generate_missingness_datasets.py' script has been
# run and all required CSV files and 'metadata.json' are in 'generated_datasets/'.
# ==============================================================================

import pandas as pd
import numpy as np
import json

## Helper functions, metadata and logging

In [None]:
def load_dataset_from_metadata(file_meta):
    """
    Read one generated dataset CSV and split it into features and target.

    The CSV is looked up inside the module-level ``dataset_dir`` directory using
    the file name stored in the metadata entry.

    Parameters:
        file_meta (dict): Metadata entry for the dataset. Key 'file' holds the CSV
                          file name and key 'dataset' holds the base dataset name.

    Returns:
        tuple:
            - X (pd.DataFrame): Every column of the CSV except '__target__'.
            - y (pd.Series): The '__target__' column.
            - task_type (str): 'classification' when the base dataset is one of
              'breast_cancer', 'wine' or 'synthetic_clf'; otherwise 'regression'.

    Raises:
        ValueError: If the CSV has no '__target__' column.
    """
    target_col = '__target__'

    frame = pd.read_csv(f"{dataset_dir}/{file_meta['file']}")

    # Fail early with a readable message instead of a KeyError further down.
    if target_col not in frame.columns:
        raise ValueError(f"Target column '__target__' not found in {file_meta['file']}")

    features = frame.drop(columns=[target_col])
    target = frame[target_col]

    # Only these base datasets are classification problems; everything else
    # is treated as regression.
    if file_meta['dataset'] in ('breast_cancer', 'wine', 'synthetic_clf'):
        task_type = 'classification'
    else:
        task_type = 'regression'

    return features, target, task_type


def log_results(dataset, task, m_type, m_percent, impute_tech, encoder_tech, scaler_tech,
                model_name, fold, seed, metric_name, metric_value):
    """
    Record one experiment measurement as a new row of the global results_log.

    Parameters:
        dataset (str): Base dataset name (e.g., 'breast_cancer').
        task (str): Evaluation task, 'intrinsic' or 'extrinsic'.
        m_type (str): Missingness mechanism ('MCAR', 'MAR', 'MNAR', or 'CLEAN').
        m_percent (float): Percentage of missing values (0.0, 10.0, 30.0, etc.).
        impute_tech (str): Name of the imputation technique.
        encoder_tech (str): Name of the encoder technique.
        scaler_tech (str): Name of the scaler technique.
        model_name (str): Model family ('RandomForest', 'Logistic', 'NN', etc.).
        fold (int): Index of the cross-validation fold.
        seed (int): Random seed used for missingness generation or CV.
        metric_name (str): Metric label (e.g., 'Accuracy', 'RMSE').
        metric_value (float): Value measured for that metric.

    Outputs:
        (None): Rebinds the global results_log to a DataFrame that contains the
                previous rows followed by the new one (index renumbered from 0).
    """
    global results_log

    column_names = (
        'dataset', 'task', 'missingness_type', 'missing_percent',
        'imputation_technique', 'encoder_technique', 'scaler_technique',
        'model', 'fold', 'seed', 'metric_name', 'metric_value',
    )
    column_values = (
        dataset, task, m_type, m_percent,
        impute_tech, encoder_tech, scaler_tech,
        model_name, fold, seed, metric_name, metric_value,
    )

    # A Series keyed by column name, transposed into a single-row frame.
    record = pd.Series(dict(zip(column_names, column_values)))
    single_row = record.to_frame().T

    results_log = pd.concat([results_log, single_row], ignore_index=True)


In [None]:
# Locations of the pre-generated inputs and the cross-validation setting.
dataset_dir = "generated_datasets"  # path to datasets
metadata_path = f"{dataset_dir}/metadata.json"  # path to metadata.json
N_SPLITS = 5  # For cross-validation

# DataFrame to store all experiment results (one row per logged metric)
results_log = pd.DataFrame(columns=[
    'dataset', 'task', 'missingness_type', 'missing_percent',
    'imputation_technique', 'encoder_technique', 'scaler_technique',
    'model', 'fold', 'seed', 'metric_name', 'metric_value'
])

# Load metadata to map file names to experimental parameters.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
try:
    with open(metadata_path, 'r', encoding='utf-8') as f:
        METADATA = json.load(f)
except FileNotFoundError:
    # A missing file is reported rather than raised; METADATA is left empty.
    print(f"Error: Metadata file not found at {metadata_path}. Please ensure the generation script has been run.")
    METADATA = []

## Task 1. Implementation of Imputation/Encoding and Scaling Methods

1. Numeric imputation: drop attribute (column removal), mean (numeric only), median
(numeric/ordinal), mode, Last Observation Carried Forward imputation (LOCF, only if
time-series or grouped; otherwise explain limitations), linear interpolation. (10 marks)
2. Categorical encoders: Label, one-hot, binary, target encoding with out-of-fold (OOF) safe
scheme. Provide an OOF implementation or use category encoders with OOF. (10 marks)
3. Scaling/normalization: min-max, z-score, log (handle zeros), box-cox / yeo-johnson,
quantile, sigmoid, max-abs, unit-length. (5 marks)

In [None]:
"""
Task 1: Implementation Guide

You should replace the placeholder comments with the correct initialized Scikit-learn
or category_encoders classes.

For custom methods (LOCF, Linear Interpolation, Log, Sigmoid), ensure the implementations fit them into
the Pipeline structure.

Using raw Pandas functions (.fillna(method='ffill')) without wrapping them breaks the scikit-learn framework.
"""
# Registry of numeric imputation techniques, keyed by technique name.
# Entries still marked TODO hold None so that this cell is valid Python before
# the real imputers are filled in; None means "not implemented yet".
NUMERIC_IMPUTERS = {
    # String marker rather than an object; presumably the pipeline builder is
    # expected to special-case column removal -- TODO confirm.
    'drop_col': 'drop_col',
    'mean': None,  # TODO: Replace with proper implementation
    'median': None,  # TODO: Replace with proper implementation
    'mode': None,  # TODO: Replace with proper implementation
    'locf': None,  # TODO: Replace with proper implementation
    'linear_interp': None,  # TODO: Replace with proper implementation
}

# Registry of categorical encoders, keyed by technique name.
# Entries still marked TODO hold None so that this cell is valid Python before
# the real encoders are filled in; None means "not implemented yet".
CATEGORICAL_ENCODERS = {
    'label': None,  # TODO: Replace with proper implementation
    'one_hot': None,  # TODO: Replace with proper implementation
    'binary': None,  # TODO: Replace with proper implementation
    'target_oof': None,  # TODO: Implement OOF Target Encoder
}

# Registry of scaling/normalization techniques, keyed by technique name.
# Entries still marked TODO hold None so that this cell is valid Python before
# the real scalers are filled in; None means "not implemented yet".
SCALERS = {
    'min_max': None,  # TODO: Implement min-max scaler
    'z_score': None,  # TODO: Implement z-score scaler
    'log': None,  # TODO: Implement Log with zero handling
    'box_cox_yeo_johnson': None,  # TODO: Replace with proper implementation
    'quantile': None,  # TODO: Replace with proper implementation
    'sigmoid': None,  # TODO: Replace with proper implementation
    'max_abs': None,  # TODO: Replace with proper implementation
    'unit': None,  # TODO: Replace with proper implementation
}

In [None]:
def get_preprocessor(numeric_features, categorical_features, impute_tech, encoder_tech, scaler_tech):
    """
    Assemble the preprocessing ColumnTransformer for one experiment configuration.

    NOTE: This is still a stub. Neither pipeline is built yet, so the function
    currently returns None.

    Parameters:
        numeric_features (list): Names of the numeric columns.
        categorical_features (list): Names of the categorical columns.
        impute_tech (str): Key of the numeric imputation technique (e.g., 'mean', 'drop_col').
        encoder_tech (str): Key of the categorical encoder technique (e.g., 'one_hot', 'none').
        scaler_tech (str): Key of the scaling technique (e.g., 'min_max', 'none').

    Returns:
        ColumnTransformer: Intended result once implemented, i.e. a scikit-learn
                           ColumnTransformer that can be placed in a Pipeline.
                           Until then the return value is None.
    """
    # TODO (numeric pipeline): include all the preprocessing to be applied to
    # the numerical columns.

    # TODO (categorical pipeline): include all the preprocessing to be applied
    # to the categorical columns.

    return None

## Task 2. Experiment Pipeline & Evaluation Functions

1. Missing value imputation evaluation: For each dataset with missing values, \
-- Evaluate intrinsic imputation quality (RMSE for numeric; accuracy for categorical) by
masking a separate holdout of observed values. (8 marks) \
-- Evaluate extrinsic model performance (Classification: Accuracy + ROC-AUC; Regression:
RMSE + R²) using three model families: Random Forest, Logistic/Linear (with
regularization), and Neural Networks (NN), against the testing dataset. (12 marks)  \
-- Scaling/normalization: min-max, z-score, log (handle zeros), box-cox / yeo-johnson,
quantile, sigmoid, max-abs, unit-length. (5 marks)
2. Category encoder evaluation: For each original training dataset, evaluate extrinsic model
performance (Classification: Accuracy + ROC-AUC; Regression: RMSE + R²) using three
model families: Random Forest, Logistic/Linear (with regularization), and NN, against the
testing dataset. (12 marks)
3. Log results (in CSV) including dataset name, missingness, technique, model, fold, seed,
metric. (5 marks)

In [None]:
"""
Task 2: Model Implementation Guide

You should initialize the appropriate Scikit-learn model classes.

Crucial Reminder for Consistency and Reproducibility:
- For models that use randomness (RandomForest, MLPClassifier/Regressor),
  the 'random_state' parameter MUST be set to ensure results are repeatable.
- For iterative models (LogisticRegression, MLP), set a sufficiently high
  'max_iter' (e.g., 500-1000) to ensure convergence across different datasets.
"""
# Registry of model families per task type, keyed by model name.
# Entries still marked TODO hold None so that this cell is valid Python before
# the real estimators are filled in; None means "not implemented yet".
MODELS = {
    'classification': {
        'RandomForest': None,  # TODO: Replace with proper implementation
        'Logistic': None,  # TODO: Replace with proper implementation
        'NN': None,  # TODO: Replace with proper implementation
    },
    'regression': {
        'RandomForest': None,  # TODO: Replace with proper implementation
        'Linear': None,  # TODO: Replace with proper implementation
        'NN': None,  # TODO: Replace with proper implementation
    },
}

In [None]:
def evaluate_imputation_intrinsic(X_train_miss, X_train_full, dname, m_type, m_percent, seed):
    """
    Task 2: Intrinsic Imputation Quality Evaluation (placeholder)

    Intended purpose: score imputers by comparing the values they fill in with
    known true values that were deliberately masked for the evaluation.
    At present the function only prints a TODO banner and returns None.

    Parameters:
        X_train_miss (pd.DataFrame): Training features containing the assignment-defined missing values.
        X_train_full (pd.DataFrame): Original, complete training features (ground truth).
        dname (str): Base dataset name.
        m_type (str): Missingness mechanism (e.g., 'MCAR').
        m_percent (float): Missing value percentage.
        seed (int): Random seed used for missing value generation.

    TODO:
        1. Build a **separate intrinsic evaluation mask** over the observed values of X_train_miss.
        2. Introduce temporary NaNs according to that mask.
        3. Iterate over all **NUMERIC_IMPUTERS**.
        4. Apply each imputer and compute **RMSE** between the imputed values and X_train_full (ground truth).
        5. Record the results through `log_results` with `task='intrinsic'`.
    """
    banner = f"--- TODO: Implementing Intrinsic Imputation Quality for {dname} ({m_type} {m_percent}%) ---"
    print(banner)
    return None


def run_extrinsic_evaluation(X_train, y_train, X_test, y_test, file_meta,
                             impute_tech='none', encoder_tech='none', scaler_tech='none'):
    """
    Task 2: Extrinsic Model Performance Evaluation (placeholder)

    Intended behaviour: cross-validate on X_train so that the whole preprocessing
    pipeline and the model are fitted on training data only, then score the final
    pipeline on the external X_test holdout, avoiding data leakage.
    At present the body is empty and the function returns None.

    Parameters:
        X_train (pd.DataFrame): Training features (may contain missing values).
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame): External testing features (always clean/full).
        y_test (pd.Series): External testing target.
        file_meta (dict): Metadata dictionary used for logging experiment details.
        impute_tech (str): Imputation technique being tested/used.
        encoder_tech (str): Encoder technique being tested/used.
        scaler_tech (str): Scaler technique being tested/used.

    Outputs:
        (None): Once implemented, results are to be logged to the global
                results_log DataFrame for each model and fold.
    """
    return None


In [None]:
def main_experiment_loop():
    """
    Orchestrate the whole experiment: load the pre-generated files and run
    every required evaluation task (A & B).

    Once implemented, the loop is meant to guarantee that:
        1. A consistent 80/20 Train/Test split is used for all experiments.
        2. Experiments are run across all defined preprocessing configurations.
        3. Results are logged to 'results.csv'.

    Currently only the guard on METADATA is in place; sections A and B and the
    final save are still TODO, so the function always returns None.
    """
    metadata_available = bool(METADATA)
    if not metadata_available:
        # No metadata entries to iterate over, so there is nothing to run.
        return None

    # ----------------------------------------------------------------------
    # A. TODO: Task 2 -> Missing Value Imputation Evaluation
    # ----------------------------------------------------------------------
    #   - Load the pre-generated training set with missingness
    #   - Retrieve the consistent 20% test set and original clean training set
    #   1. Intrinsic evaluation (Task 2)
    #   2. Extrinsic evaluation (Task 2)

    # ----------------------------------------------------------------------
    # B. TODO: Task 2 -> Category Encoder and Scaling Evaluation (on CLEAN data)
    # ----------------------------------------------------------------------
    #   1. Category Encoder Evaluation
    #   2. Scaling/Normalization Evaluation

    # TODO: Save results (Task 2 - Log results)



In [None]:
# Run the experiment; the call returns immediately when METADATA is empty.
main_experiment_loop()

## Task 3. Visualizations

1. Compare imputation methods across missingness & datasets & model. (7 marks)
2. Compare category encoding methods across datasets and models. (7 marks)
3. Compare scaling/normalization methods across datasets and models. (7 marks)
4. Summarize key findings: which techniques are robust, interactions with model type,
surprising results. (7 marks)

In [None]:
# TODO: Include all the visualizations below