In [None]:
# --- Cell 1: Imports & paths ---
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import balanced_accuracy_score

# diagnostics
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Project-relative locations: interim inputs, model outputs, diagnostic artifacts.
IN = Path("../data/interim")
OUT = Path("../models")
DIA = OUT / "diagnostics"

# Create both output folders up front so later save calls find them.
for _out_dir in (OUT, DIA):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# --- Cell 2: Load data & meta ---
# Training features, and the "party" column of the target file as a Series.
X_train = pd.read_parquet(IN / "X_train.parquet")
y_train = pd.read_parquet(IN / "y_train.parquet")["party"]

# Column roles (numeric vs. categorical) are read from the split metadata file.
with open(IN / "split_meta.json") as f:
    meta = json.load(f)
NUMERIC, CATEGORICAL = meta["numeric"], meta["categorical"]

In [None]:
# --- Cell 3: Build base pipeline ---
# Categorical columns are one-hot encoded (first level dropped, dense output);
# numeric columns are standardised. Output column order is [one-hot | numeric].
categorical_step = ("cat", OneHotEncoder(drop="first", sparse_output=False), CATEGORICAL)
numeric_step = ("num", StandardScaler(), NUMERIC)
preprocessor = ColumnTransformer([categorical_step, numeric_step])

# Baseline model: preprocessing followed by ordinary least squares.
base_pipeline = Pipeline([("pre", preprocessor), ("reg", LinearRegression())])

In [None]:
# --- Cell 4: Fit baseline & threshold tuning ---
# Fit on the full training set, then squash the raw predictions into [0, 1]
# so they can be thresholded like scores.
base_pipeline.fit(X_train, y_train)
y_score_train_base = np.clip(base_pipeline.predict(X_train), 0, 1)

def pick_threshold(y_true, scores, metric="balanced_accuracy"):
    """Grid-search the decision threshold that maximises a classification metric.

    Scores are binarised as ``scores >= thr`` for 201 evenly spaced thresholds
    in [0, 1]. Ties are resolved in favour of the lowest threshold, because a
    candidate only replaces the incumbent when it is strictly better.

    Parameters
    ----------
    y_true : array-like
        True labels, compared against the 0/1 predictions.
    scores : array-like
        Continuous scores; converted with ``np.asarray``.
    metric : str, default "balanced_accuracy"
        ``"balanced_accuracy"`` selects ``balanced_accuracy_score``; any other
        value selects ``f1_score`` with ``zero_division=0``.

    Returns
    -------
    tuple of (float, float)
        The best threshold and the metric value obtained at that threshold.
    """
    scores = np.asarray(scores)

    # Resolve the scorer once instead of re-importing / re-branching on every
    # grid point.
    if metric == "balanced_accuracy":
        score_fn = balanced_accuracy_score
    else:
        from sklearn.metrics import f1_score

        def score_fn(y, y_hat):
            return f1_score(y, y_hat, zero_division=0)

    best_thr, best_val = 0.5, -1.0
    for thr in np.linspace(0.0, 1.0, 201):
        val = score_fn(y_true, (scores >= thr).astype(int))
        if val > best_val:
            best_val, best_thr = val, thr
    return float(best_thr), float(best_val)

# Tune the baseline cut-off on the training scores and report it.
thr_base, bal_train_base = pick_threshold(
    y_train,
    y_score_train_base,
    metric="balanced_accuracy",
)
print(f"[BASE] threshold={thr_base:.3f}, balanced-acc(train)={bal_train_base:.3f}")

In [None]:
# --- Cell 5: Recover train design matrix & feature names for diagnostics ---
# Re-apply the fitted preprocessor to obtain the dense training design matrix.
X_train_design = base_pipeline.named_steps["pre"].transform(X_train)  # numpy array
# Column names follow the ColumnTransformer order: one-hot columns, then numeric.
ohe = base_pipeline.named_steps["pre"].named_transformers_["cat"]
ohe_names = list(ohe.get_feature_names_out(CATEGORICAL))
feat_names = ohe_names + NUMERIC

# Every later diagnostic labels design-matrix columns by position, so fail
# loudly here if names and columns ever disagree.
if len(feat_names) != X_train_design.shape[1]:
    raise ValueError(
        f"Expected {len(feat_names)} design columns, got {X_train_design.shape[1]}"
    )

# Always prepend the intercept column. With the default has_constant="skip",
# add_constant returns the data unchanged when it already contains a constant
# column, which would break the "column 0 is the intercept" assumption used by
# the VIF loop below.
X_sm = sm.add_constant(X_train_design, has_constant="add")
model_sm = sm.OLS(y_train.values, X_sm).fit()

In [None]:
# --- Cell 6: VIF diagnostics ---
# variance_inflation_factor regresses column j on all *other* columns of the
# matrix it is given and does not add an intercept itself. The design matrix
# passed in must therefore still contain the constant (column 0 of X_sm);
# otherwise the auxiliary R^2 is uncentred and VIFs of non-centred columns
# (e.g. one-hot dummies) are inflated. Column 0 is skipped: the intercept has
# no VIF of interest.
vif_data = [
    (feat_names[j - 1], float(variance_inflation_factor(X_sm, j)))
    for j in range(1, X_sm.shape[1])
]

vif_df = pd.DataFrame(vif_data, columns=["feature", "VIF"]).sort_values("VIF", ascending=False)
vif_df.to_csv(DIA / "vif_baseline.csv", index=False)

# Horizontal bar chart of the VIF table, written to the diagnostics folder.
fig_vif, ax_vif = plt.subplots()
ax_vif.barh(vif_df["feature"], vif_df["VIF"])
ax_vif.set_title("VIF (Baseline, Train)")
ax_vif.set_xlabel("VIF")
fig_vif.tight_layout()
fig_vif.savefig(DIA / "vif_baseline.png", dpi=200)
plt.close(fig_vif)

In [None]:
# --- Cell 7: Residual diagnostics (baseline) ---
# Diagnostics must use the *raw* linear predictions. y_score_train_base has
# been clipped to [0, 1] for thresholding, so residuals derived from it are not
# the least-squares residuals and would distort the residual, QQ and leverage
# plots below.
fitted = base_pipeline.predict(X_train)
residuals = y_train.values - fitted

# Scatter of residuals against fitted values, with a dashed zero reference line.
fig_rvf, ax_rvf = plt.subplots()
ax_rvf.scatter(fitted, residuals)
ax_rvf.axhline(0, linestyle="--")
ax_rvf.set_title("Residuals vs Fitted (Baseline, Train)")
ax_rvf.set_xlabel("Fitted")
ax_rvf.set_ylabel("Residuals")
fig_rvf.tight_layout()
fig_rvf.savefig(DIA / "residuals_vs_fitted_baseline.png", dpi=200)
plt.close(fig_rvf)

# Distribution of the residuals in 12 bins.
fig_hist, ax_hist = plt.subplots()
ax_hist.hist(residuals, bins=12)
ax_hist.set_title("Residuals Histogram (Baseline, Train)")
fig_hist.tight_layout()
fig_hist.savefig(DIA / "residuals_hist_baseline.png", dpi=200)
plt.close(fig_hist)

# QQ plot of the residuals against a fitted reference distribution (fit=True),
# drawn with a 45-degree reference line.
fig = sm.qqplot(residuals, line="45", fit=True)
fig.axes[0].set_title("QQ Plot of Residuals (Baseline, Train)")
fig.tight_layout()
fig.savefig(DIA / "qqplot_residuals_baseline.png", dpi=200)
plt.close(fig)

In [None]:
# --- Cell 8: Influence/Cook's distance (baseline) ---
# Per-observation influence measures from the statsmodels OLS fit.
influence = model_sm.get_influence()
leverage = influence.hat_matrix_diag
cooks_d = influence.cooks_distance[0]  # first element holds the distances

# One row per training observation, keyed by the original X_train index and
# ordered from most to least influential.
influence_columns = {
    "index": X_train.index,
    "cooks_d": cooks_d,
    "leverage": leverage,
    "residual": residuals,
}
infl_df = pd.DataFrame(influence_columns).set_index("index")
infl_df = infl_df.sort_values("cooks_d", ascending=False)
infl_df.to_csv(DIA / "influence_baseline.csv")

# Leverage (hat-matrix diagonal) against residuals for the training rows.
fig_lev, ax_lev = plt.subplots()
ax_lev.scatter(leverage, residuals)
ax_lev.set_title("Leverage vs Residuals (Baseline, Train)")
ax_lev.set_xlabel("Leverage")
ax_lev.set_ylabel("Residual")
fig_lev.tight_layout()
fig_lev.savefig(DIA / "leverage_vs_residuals_baseline.png", dpi=200)
plt.close(fig_lev)

In [None]:
# --- Cell 9: Remedial options (auto) ---
# 9a. High multicollinearity: drop features with VIF > 10 and refit
VIF_THRESHOLD = 10.0
# Names of every feature whose VIF exceeds the cut-off, in vif_df row order.
exceeds_vif = vif_df["VIF"].gt(VIF_THRESHOLD)
high_vif_features = vif_df.loc[exceeds_vif, "feature"].tolist()

def drop_features_in_X(X: pd.DataFrame, to_drop_ohe_names: list):
    """Return the one-hot column names that remain after pruning.

    The function does not touch any data: it only filters the notebook-level
    ``ohe_names`` list, preserving its order. Names in ``to_drop_ohe_names``
    that are not one-hot columns (for example numeric feature names) are
    ignored.

    Parameters
    ----------
    X : pd.DataFrame
        Unused; kept so existing calls keep working.
    to_drop_ohe_names : list
        Names of transformed columns to remove.

    Returns
    -------
    list
        The entries of ``ohe_names`` that are not listed for removal.
    """
    # Set membership keeps the filter linear in the number of one-hot columns.
    to_drop = set(to_drop_ohe_names)
    return [name for name in ohe_names if name not in to_drop]

# Registry of candidate variants; each entry holds the fitted pipeline, its
# tuned threshold and the balanced accuracy reached on the training set.
models = {
    "baseline": dict(pipeline=base_pipeline, thr=thr_base, bal_train=bal_train_base),
}

if high_vif_features:
    # Variant that removes the flagged one-hot columns *after* preprocessing
    # and refits the regression on the remaining columns.
    from sklearn.base import BaseEstimator, TransformerMixin, clone

    class ColumnFilter(BaseEstimator, TransformerMixin):
        """Select one-hot columns from the dense ``[one-hot | numeric]`` matrix.

        The kept one-hot columns are emitted first (in ``keep_ohe_names``
        order), followed by every numeric column unchanged.

        Parameters
        ----------
        keep_ohe_names : list of str
            One-hot column names to retain.
        numeric_names : list of str
            Names of the numeric columns; these are always passed through.
        all_ohe_names : list of str, optional
            Full ordered list of one-hot column names in the incoming matrix.
            When omitted, the notebook-level ``ohe_names`` is read at fit time.

        NOTE(review): the class is defined inside the notebook, so loading a
        pickled pipeline that contains it requires this definition to be
        available to the loading process - confirm the consumer handles that.
        """

        def __init__(self, keep_ohe_names, numeric_names, all_ohe_names=None):
            # Only store constructor arguments, so get_params()/clone() work.
            self.keep_ohe_names = keep_ohe_names
            self.numeric_names = numeric_names
            self.all_ohe_names = all_ohe_names

        @property
        def feature_names_(self):
            """Output column names: kept one-hot names, then numeric names."""
            return list(self.keep_ohe_names) + list(self.numeric_names)

        def fit(self, X, y=None):
            """Resolve the kept one-hot names to column positions once.

            Raises
            ------
            ValueError
                If a requested name is not among the known one-hot columns.
            """
            source_names = ohe_names if self.all_ohe_names is None else self.all_ohe_names
            first_position = {}
            for pos, name in enumerate(source_names):
                first_position.setdefault(name, pos)
            missing = [nm for nm in self.keep_ohe_names if nm not in first_position]
            if missing:
                raise ValueError(f"Unknown one-hot columns requested: {missing}")
            # Stored on the instance so transform() needs no notebook globals.
            self.ohe_indices_ = [first_position[nm] for nm in self.keep_ohe_names]
            self.n_ohe_ = len(source_names)
            return self

        def transform(self, X):
            """Return the kept one-hot columns followed by all numeric columns."""
            kept_ohe = X[:, self.ohe_indices_]
            numeric_part = X[:, self.n_ohe_:]
            return np.concatenate([kept_ohe, numeric_part], axis=1)

    keep_ohe = drop_features_in_X(X_train, high_vif_features)

    # Only one-hot columns are filtered. Flagged features that are not one-hot
    # columns stay in the model, so report them separately instead of claiming
    # they were dropped.
    kept_lookup = set(keep_ohe)
    dropped_ohe = [name for name in ohe_names if name not in kept_lookup]
    dropped_lookup = set(dropped_ohe)
    high_vif_not_dropped = [name for name in high_vif_features if name not in dropped_lookup]

    col_filter = ColumnFilter(
        keep_ohe_names=keep_ohe,
        numeric_names=NUMERIC,
        all_ohe_names=ohe_names,
    )

    # clone() gives this pipeline its own unfitted preprocessor. Reusing the
    # `preprocessor` instance would tie this model's preprocessing to every
    # later refit of that shared object.
    vif_pruned_pipeline = Pipeline([
        ("pre", clone(preprocessor)),
        ("filter", col_filter),
        ("reg", LinearRegression())
    ])
    vif_pruned_pipeline.fit(X_train, y_train)
    y_score_train_vif = vif_pruned_pipeline.predict(X_train).clip(0, 1)
    thr_vif, bal_train_vif = pick_threshold(y_train, y_score_train_vif, "balanced_accuracy")
    print(f"[VIF-PRUNED] drop={dropped_ohe}, thr={thr_vif:.3f}, bal-acc(train)={bal_train_vif:.3f}")
    if high_vif_not_dropped:
        print(f"[VIF-PRUNED] high-VIF features kept (not one-hot columns): {high_vif_not_dropped}")

    models["vif_pruned"] = {
        "pipeline": vif_pruned_pipeline,
        "thr": thr_vif,
        "bal_train": bal_train_vif,
        "metadata": {
            "dropped_ohe_features": dropped_ohe,
            "high_vif_not_dropped": high_vif_not_dropped,
        }
    }

# 9b. Influence pruning: remove points with Cook's D > 4/n and refit
from sklearn.base import clone  # in-cell import, as this notebook already does elsewhere

n = len(X_train)
COOKS_THRESHOLD = 4 / n
keep_mask = cooks_d < COOKS_THRESHOLD

if keep_mask.mean() < 0.95:
    # Only apply if more than 5% flagged (to avoid overfitting to tiny changes)
    X_keep = X_train.loc[keep_mask]
    y_keep = y_train.loc[keep_mask]

    # Pipeline.fit refits its steps in place. Passing the shared `preprocessor`
    # object here would re-fit it on X_keep and silently change the
    # preprocessing inside the already-fitted base_pipeline (which is saved
    # later). clone() returns a fresh, unfitted copy with the same parameters.
    infl_pruned_pipeline = Pipeline([
        ("pre", clone(preprocessor)),
        ("reg", LinearRegression())
    ])
    infl_pruned_pipeline.fit(X_keep, y_keep)
    y_score_train_infl = infl_pruned_pipeline.predict(X_train).clip(0, 1)  # evaluate on full train for comparison
    thr_infl, bal_train_infl = pick_threshold(y_train, y_score_train_infl, "balanced_accuracy")
    print(f"[INFL-PRUNED] kept={keep_mask.sum()}/{n}, thr={thr_infl:.3f}, bal-acc(train)={bal_train_infl:.3f}")

    models["influence_pruned"] = {
        "pipeline": infl_pruned_pipeline,
        "thr": thr_infl,
        "bal_train": bal_train_infl,
        "metadata": {"cooks_threshold": COOKS_THRESHOLD, "kept": int(keep_mask.sum()), "n": int(n)}
    }

In [None]:
# --- Cell 10: Pick best train model variant (by balanced accuracy on train) ---
# Choose the variant with the highest training balanced accuracy; on ties the
# earliest-registered variant wins.
best_name, best = max(models.items(), key=lambda entry: entry[1]["bal_train"])
print(f"[SELECT] Best variant: {best_name} (bal-acc(train)={best['bal_train']:.3f})")

In [None]:
# --- Cell 11: Save active model, threshold, and artifacts ---
# Persist the selected pipeline together with its tuned threshold.
joblib.dump(best["pipeline"], OUT / "active_model.pkl")
threshold_record = {
    "threshold": best["thr"],
    "balanced_accuracy_on_train": best["bal_train"],
    "variant": best_name,
}
with open(OUT / "train_threshold.json", "w") as f:
    json.dump(threshold_record, f, indent=2)

# The baseline pipeline is stored as well, for reference.
joblib.dump(base_pipeline, OUT / "linear_regression_pipeline_baseline.pkl")

# Coefficient table for the selected model. When the pipeline contains a
# "filter" step, the feature names are the kept one-hot names plus the numeric
# names; otherwise the full baseline feature list applies.
pipe = best["pipeline"]
coefs = pipe.named_steps["reg"].coef_
if "filter" in pipe.named_steps:
    keep_ohe = pipe.named_steps["filter"].keep_ohe_names
    feat_active = keep_ohe + NUMERIC
else:
    feat_active = feat_names

# Largest absolute coefficient first.
coef_df = pd.DataFrame({"feature": feat_active, "coef": coefs})
coef_df = coef_df.sort_values("coef", key=pd.Series.abs, ascending=False)
coef_df.to_csv(OUT / "linear_regression_coefficients_active.csv", index=False)

# Horizontal bar chart of the selected model's coefficients.
fig_coef, ax_coef = plt.subplots()
ax_coef.barh(coef_df["feature"], coef_df["coef"])
ax_coef.set_title(f"Linear Regression Coefficients ({best_name})")
ax_coef.set_xlabel("Coefficient Value")
fig_coef.tight_layout()
fig_coef.savefig(OUT / "linear_regression_coefficients_active.png", dpi=200)
plt.close(fig_coef)

# Write a JSON summary: chosen variant, per-variant training score, the
# features flagged for high VIF and the Cook's distance cut-off.
training_summary = {
    "selected_variant": best_name,
    "variants": {
        name: {"balanced_accuracy_train": info["bal_train"]}
        for name, info in models.items()
    },
    "high_vif_features": high_vif_features or [],
    "cooks_threshold": COOKS_THRESHOLD,
}
with open(OUT / "training_summary.json", "w") as f:
    json.dump(training_summary, f, indent=2)

print("Training complete. Saved active_model.pkl, threshold, coefficients, and diagnostics.")
