# Mutual Information Setup

This notebook computes mutual information (MI) scores between a target column and a set of candidate features. Steps:

1. Set `DATA_PATH` to a processed CSV (default: `../data/processed/baseline_all_processed.csv`).
2. Set `TARGET_COL` to the column you want to predict.
3. Set `FEATURE_COLS` to the candidate feature list.
4. Set `TASK_TYPE` to `"regression"` (default) or `"classification"`.
5. Run the cells to load data, filter columns, and compute MI scores.

Notes:
- When `DROP_NA` is `True` (the default), rows with NA in the target or any selected feature are dropped before MI is computed.
- MI is non-negative; higher is more informative with respect to the target.
- Adjust the feature list as you iterate on feature engineering.


In [None]:
# Imports
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif


In [None]:
# Configuration
# TODO: Set these before running.

# Processed CSV to read (adjust as needed).
DATA_PATH: Path = Path("../data/processed/baseline_all_processed.csv")

# Column to score features against,
# e.g., "set_volume" or "effective_load" or your label.
TARGET_COL: str = ""

# Candidate feature columns,
# e.g., "set_volume", "effective_load", "rpe", "set_order", "reps", "weight".
FEATURE_COLS: list[str] = []

# Which MI estimator to use: "regression" or "classification".
TASK_TYPE: str = "regression"

# Drop rows with NA in target/features
DROP_NA: bool = True


In [None]:
# Load data
# Raise explicitly instead of using `assert`: assert statements are removed
# when Python runs with -O, which would skip the check entirely.
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Missing data file: {DATA_PATH}")
df = pd.read_csv(DATA_PATH)
print(f"Loaded shape: {df.shape}")

# Columns the MI step reads: every non-empty feature name plus the target (if set).
cols_needed = [c for c in FEATURE_COLS if c] + ([TARGET_COL] if TARGET_COL else [])

# Fail fast on misconfigured names, spelling out which ones are absent, rather
# than letting a later pandas lookup raise a bare KeyError.
missing_cols = [c for c in cols_needed if c not in df.columns]
if missing_cols:
    raise KeyError(f"Columns not found in {DATA_PATH}: {missing_cols}")

# Apply DROP_NA mask for selected columns
if DROP_NA and cols_needed:
    before = len(df)
    df = df.dropna(subset=cols_needed)
    print(f"Dropped {before - len(df)} rows due to NA in target/features; new shape: {df.shape}")

if TARGET_COL:
    print("Target sample:")
    print(df[TARGET_COL].head())
else:
    print("Set TARGET_COL before running MI.")


In [None]:
# Compute MI scores

def compute_mi_scores(df: pd.DataFrame, features: list[str], target: str, task_type: str = "regression") -> pd.DataFrame:
    """Score each feature by its mutual information with the target.

    Args:
        df: Frame holding the feature columns and the target column.
        features: Names of the columns to score.
        target: Name of the column the features are scored against.
        task_type: ``"regression"`` (default) uses ``mutual_info_regression``;
            ``"classification"`` uses ``mutual_info_classif``.

    Returns:
        A DataFrame with columns ``feature`` and ``mi``, sorted by ``mi`` in
        descending order and re-indexed from 0.

    Raises:
        ValueError: If ``features`` or ``target`` is empty, or if ``task_type``
            is neither ``"regression"`` nor ``"classification"``.
        KeyError: If a requested feature or the target is not a column of ``df``.
    """
    if not features:
        raise ValueError("FEATURE_COLS is empty; set features before running MI.")
    if not target:
        raise ValueError("TARGET_COL is empty; set a target before running MI.")
    # Reject unknown task types up front: a typo such as "classifcation" would
    # otherwise fall through silently to the regression estimator.
    if task_type not in ("regression", "classification"):
        raise ValueError(
            f"Unknown task_type {task_type!r}; expected 'regression' or 'classification'."
        )

    # Name the absent columns explicitly instead of relying on the pandas lookup error.
    missing = [col for col in [*features, target] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in df: {missing}")

    X = df[features]
    y = df[target]

    # random_state is pinned to 0 so repeated runs use the same seed.
    if task_type == "classification":
        mi = mutual_info_classif(X, y, discrete_features="auto", random_state=0)
    else:
        mi = mutual_info_regression(X, y, random_state=0)

    out = pd.DataFrame({"feature": features, "mi": mi})
    return out.sort_values("mi", ascending=False).reset_index(drop=True)

# Score the configured features against the configured target and show the ranking.
mi_df = compute_mi_scores(
    df=df,
    features=FEATURE_COLS,
    target=TARGET_COL,
    task_type=TASK_TYPE,
)
print(mi_df)
