# In [None]:  (notebook cell prompt left over from export)
# ============================================
# Purpose: generate all pairwise feature interactions and select the
# useful ones with L1-regularized logistic regression.
# ============================================
# - Generates all 2-way interactions (after one-hot encoding)
# - Uses L1 (Lasso-style) logistic regression to select features
# - Much faster and more memory-friendly than a dense GLM(**2) fit

import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, f1_score
)

# ---------------- CONFIG ----------------
# All tunables for the run live here; the sections below only read them.

# Input CSV, resolved relative to the current working directory.
DATA_PATH = "../data/processed/diabetes_cleaned_data.csv"
# Outcome column: rows where it is missing are dropped and it is cast to int.
TARGET = "readmitted_binary"
DROP_COLS = ["encounter_id", "patient_nbr"]  # exclude IDs
# Fraction of rows held out as the test split.
TEST_SIZE = 0.25
# Shared random_state for the optional sampling, the split, and the solver.
SEED = 42

# L1 regularization strength: smaller C => stronger regularization
C_REG = 0.5
# Iteration cap handed to LogisticRegression(max_iter=...).
MAX_ITER = 600

# Optional: downsample while iterating if you need speed
SAMPLE_N = None  # e.g., 20000; set None to use all rows

# ---------------- Load ----------------
# Read the cleaned dataset, discard whichever identifier columns are present,
# keep only rows with a known target, and store the target as int.
df = pd.read_csv(DATA_PATH)
id_cols_present = [col for col in DROP_COLS if col in df.columns]
df = df.drop(columns=id_cols_present, errors="ignore")
df = df.dropna(subset=[TARGET]).copy()
df[TARGET] = df[TARGET].astype(int)

# Optional sampling for speed while prototyping.
# Positives are drawn at the overall sampling fraction and negatives fill the
# remainder of SAMPLE_N, which keeps the outcome prevalence roughly unchanged.
if SAMPLE_N is not None and len(df) > SAMPLE_N:
    keep_frac = min(1.0, SAMPLE_N / len(df))
    is_pos = df[TARGET] == 1
    is_neg = df[TARGET] == 0
    pos = df[is_pos].sample(frac=keep_frac, random_state=SEED)
    neg_needed = SAMPLE_N - len(pos)
    neg = df[is_neg].sample(n=max(0, neg_needed), random_state=SEED)
    # Shuffle so the classes are not stored as two contiguous runs.
    df = pd.concat([pos, neg]).sample(frac=1, random_state=SEED)

# Feature frame and target vector used by everything downstream.
y = df[TARGET].values
X = df.drop(columns=[TARGET])

# Identify categoricals heuristically: a column is categorical when it has
# object dtype or when its name is in the known-categorical list below.
likely_cats = {
    'race',
    'gender',
    'age',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
    'payer_code',
    'medical_specialty',
    'A1Cresult',
    'max_glu_serum',
    'change',
    'diabetesMed',
    'insulin',
    'diag_1',
    'diag_2',
    'diag_3',
    'diab_type',
    'diab_control',
    'diab_complication_binary',
    'diab_complication_categories',
}

# Partition the columns in one pass; everything not categorical is numeric.
cat_cols = []
num_cols = []
for col in X.columns:
    if X[col].dtype == 'object' or col in likely_cats:
        cat_cols.append(col)
    else:
        num_cols.append(col)

# ---------------- Split ----------------
# Stratify on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=SEED,
)

# ---------------- Pipeline ----------------
# Stage 1 ("prep"):  numeric columns pass through untouched; categoricals are
#                    one-hot encoded into a sparse matrix.
# Stage 2 ("poly"):  all 2-way products of the stage-1 columns
#                    (interaction_only=True => no squared terms, no bias).
# Stage 3 ("logit"): L1-penalised logistic regression (saga); coefficients the
#                    penalty drives to zero are treated as "not selected".
# NOTE(review): numeric columns reach saga unscaled; the scikit-learn docs say
# saga's fast convergence is only guaranteed for similarly scaled features —
# consider scaling if the fit is slow or hits max_iter.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
pre = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", one_hot, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0,  # keep sparse matrices
)

pairwise_terms = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)

l1_logit = LogisticRegression(
    penalty="l1",
    solver="saga",
    C=C_REG,
    max_iter=MAX_ITER,
    n_jobs=-1,
    random_state=SEED,
)

pipe = Pipeline(steps=[
    ("prep", pre),
    ("poly", pairwise_terms),
    ("logit", l1_logit),
])

# ---------------- Fit ----------------
pipe.fit(X_train, y_train)

# ---------------- Evaluate ----------------
# Score the held-out split: take column 1 of predict_proba as the score for
# the positive class, then derive hard labels at a 0.5 cut-off.
probs = pipe.predict_proba(X_test)[:, 1]
preds = (probs >= 0.5).astype(int)

# Threshold-free ranking metrics, the PR curve points, then thresholded F1.
roc = roc_auc_score(y_test, probs)
ap = average_precision_score(y_test, probs)
prec, rec, _ = precision_recall_curve(y_test, probs)
f1 = f1_score(y_test, preds)

# One line per metric, three decimals each.
print("\n".join([
    f"ROC-AUC: {roc:.3f}",
    f"PR-AUC (AP): {ap:.3f}",
    f"F1 (thr=0.5): {f1:.3f}",
]))

# ---------------- Inspect selected interactions/features ----------------
# Rebuild the column names in the order the ColumnTransformer emits them:
# numeric passthrough columns first, then the one-hot encoded categoricals.
ohe = pipe.named_steps["prep"].named_transformers_.get("cat")
num_names = num_cols
cat_names = list(ohe.get_feature_names_out(cat_cols)) if ohe is not None else []
base_feature_names = np.array(num_names + cat_names)

# Names after PolynomialFeatures (interaction_only=True)
poly = pipe.named_steps["poly"]
# sklearn >= 1.0: get_feature_names_out is available
try:
    poly_names = poly.get_feature_names_out(base_feature_names)
except AttributeError:
    # Fallback for very old versions that only expose get_feature_names.
    # Only AttributeError is caught: a bare `except` would also swallow
    # KeyboardInterrupt and hide genuine naming errors behind a second failure.
    poly_names = poly.get_feature_names(base_feature_names)

# A term counts as "selected" when L1 left its coefficient non-zero.
coef = pipe.named_steps["logit"].coef_.ravel()
nz_idx = np.flatnonzero(coef)  # selected by L1
selected = pd.DataFrame({
    "feature": np.array(poly_names)[nz_idx],
    "coef": coef[nz_idx]
}).sort_values("coef", key=np.abs, ascending=False)  # largest |coef| first

# Save reports
# Create the same directory the CSVs are written to. The directory used to be
# created as ./reports while the files went to ../reports, so the writes
# failed whenever ../reports did not already exist.
reports_dir = Path("../reports")
reports_dir.mkdir(parents=True, exist_ok=True)

# Selected terms, sorted by absolute coefficient.
selected.to_csv(reports_dir / "selected_interactions_l1.csv", index=False)
# Headline hold-out metrics, one row per metric.
pd.DataFrame({
    "metric":["ROC_AUC","PR_AUC","F1_thr_0.5"],
    "value":[roc, ap, f1]
}).to_csv(reports_dir / "model_metrics.csv", index=False)

print("\nTop selected terms (absolute coef):")
print(selected.head(20).to_string(index=False))
print("\nSaved:")
print(" - reports/selected_interactions_l1.csv")
print(" - reports/model_metrics.csv")


# KeyboardInterrupt:  (notebook output residue — the recorded run appears to have been interrupted)