<a href="https://colab.research.google.com/github/apropos0/Scheduling_Inference/blob/main/notebooks/02_models_robustness_figures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 02 — Models + Robustness + Figures

Goal:
- Train baseline classifiers to infer `policy`
- Report holdout + cross-validation
- Run robustness checks (session split, workload split)
- Generate a few simple plots you can reuse in the paper

Input:
- `clean_results.parquet` generated by Notebook 01


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Display configuration: let pandas render up to 200 columns and use a
# 120-character line width when printing DataFrames.
for _option, _value in (("display.max_columns", 200), ("display.width", 120)):
    pd.set_option(_option, _value)


In [None]:
# Load the cleaned results table (the notebook's declared input, produced by
# Notebook 01) and show its size plus a preview of the first rows.
CLEAN_RESULTS_PATH = "clean_results.parquet"
df = pd.read_parquet(CLEAN_RESULTS_PATH)
print("Loaded:", df.shape)
df.head()

## Feature set

We keep the feature set small and interpretable.

In [None]:
# Input columns fed to every classifier in this notebook.
features = [
    "cs_per_sec",
    "mig_per_sec",
    "cycles_per_sec",
    "instr_per_sec",
    "branches_per_sec",
    "ipc",
    "branch_miss_rate",
]

# A row is usable only if every feature, the label (`policy`) and both
# grouping keys used by the robustness splits are present.
required_cols = [*features, "policy", "workload", "session_id"]
model_df = df.dropna(subset=required_cols).copy()

X, y = model_df[features], model_df["policy"]

print("Model rows:", len(model_df))
print("Class balance:\n", y.value_counts())

## Holdout split (quick baseline)

This is a quick check. Cross-validation and robustness checks matter more.

In [None]:
# Stratified 75/25 holdout split with a fixed seed so the split is repeatable.
holdout_parts = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = holdout_parts

# Show the per-class counts on each side of the split.
for split_title, split_labels in (("Train:", y_train), ("\nTest:", y_test)):
    print(split_title)
    print(split_labels.value_counts())

In [None]:
# Baseline 1: standardize the features, then fit a logistic regression.
logreg = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=2000)),
    ]
)
logreg.fit(X_train, y_train)

# Evaluate on the held-out rows.
pred = logreg.predict(X_test)
print("LogReg (holdout)")
print(classification_report(y_test, pred))

# Confusion matrix with rows/columns in the classifier's own class order.
lr_classes = logreg.named_steps["clf"].classes_
cm = confusion_matrix(y_test, pred, labels=lr_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr_classes)
disp.plot()
plt.title("Confusion matrix (LogReg holdout)")
plt.show()

In [None]:
# Baseline 2: random forest with 300 trees and a fixed seed, fitted on the
# same training rows as the logistic regression.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the held-out rows.
pred_rf = rf.predict(X_test)
print("RandomForest (holdout)")
print(classification_report(y_test, pred_rf))

# Confusion matrix with rows/columns in the forest's own class order.
rf_classes = rf.classes_
cm = confusion_matrix(y_test, pred_rf, labels=rf_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_classes)
disp.plot()
plt.title("Confusion matrix (RF holdout)")
plt.show()

## Cross-validation

With limited data, CV gives a better sense of stability than one split.

In [None]:
# 5-fold stratified CV (shuffled, fixed seed) over the full modelling table.
# Both models are scored on the same folds because they share `cv`.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores_lr, scores_rf = (
    cross_val_score(model, X, y, cv=cv, scoring="accuracy")
    for model in (logreg, rf)
)

# Report mean accuracy and its spread across folds for each model.
for cv_tag, cv_scores in (("LogReg CV acc:", scores_lr), ("RF CV acc:", scores_rf)):
    print(cv_tag, float(cv_scores.mean()), "+/-", float(cv_scores.std()))

## Interpretability

- Logistic regression coefficients (after scaling)
- Random forest feature importances


In [None]:
# Coefficient table for the fitted logistic regression: one column per
# feature (on the standardized scale, since the pipeline scales first).
clf = logreg.named_steps["clf"]

# With exactly two classes scikit-learn stores a single coefficient row
# (shape (1, n_features)) that describes the second entry of `classes_`.
# Indexing that one row with both class labels raises a shape-mismatch
# ValueError, so label the row with the class it actually belongs to.
if clf.coef_.shape[0] == len(clf.classes_):
    coef_index = clf.classes_
else:
    coef_index = clf.classes_[1:]

coefs = pd.DataFrame(clf.coef_, columns=features, index=coef_index)
coefs

In [None]:
# Random forest feature importances, labelled by feature name and ordered
# from most to least important.
importances = pd.Series(data=rf.feature_importances_, index=features)
importances = importances.sort_values(ascending=False)
importances

## Robustness 1: session-based generalization

If you have multiple sessions (different days), train on one session and test on another.
This is one of the best checks against "single-session overfitting".


In [None]:
# Session-based generalization: fit on the first session (in sorted order)
# and evaluate on the second one.
sessions = sorted(model_df["session_id"].unique().tolist())
print("Sessions:", sessions)

if len(sessions) < 2:
    print("Only one session found. Collect another session to run session generalization.")
else:
    train_sess, test_sess = sessions[0], sessions[1]
    print("Train session:", train_sess)
    print("Test session:", test_sess)

    train_df = model_df[model_df["session_id"] == train_sess]
    test_df = model_df[model_df["session_id"] == test_sess]

    X_tr, y_tr = train_df[features], train_df["policy"]
    X_te, y_te = test_df[features], test_df["policy"]

    # Fit an unfitted copy of the baseline pipeline instead of refitting
    # `logreg` itself: the holdout model stays intact, so re-running the
    # coefficient cell after this one still describes the holdout fit.
    session_model = clone(logreg).fit(X_tr, y_tr)
    pred_s = session_model.predict(X_te)

    print(classification_report(y_te, pred_s))

    # Use every policy seen in either session as a matrix label. Restricting
    # the labels to the training classes would silently drop test rows whose
    # policy never appeared in the training session.
    session_labels = sorted(set(y_tr) | set(y_te))
    cm = confusion_matrix(y_te, pred_s, labels=session_labels)
    ConfusionMatrixDisplay(cm, display_labels=session_labels).plot()
    plt.title(f"Session split: train={train_sess}, test={test_sess}")
    plt.show()

## Robustness 2: workload-based generalization

Train on one workload and test on another.
This checks whether the model is learning policy behavior rather than workload artifacts.


In [None]:
# Workload-based generalization: fit on the first workload (in sorted order)
# and evaluate on the second one.
workloads = sorted(model_df["workload"].unique().tolist())
print("Workloads:", workloads)

if len(workloads) < 2:
    print("Need at least 2 workloads for workload generalization.")
else:
    train_w, test_w = workloads[0], workloads[1]
    print("Train workload:", train_w)
    print("Test workload:", test_w)

    train_df = model_df[model_df["workload"] == train_w]
    test_df = model_df[model_df["workload"] == test_w]

    X_tr, y_tr = train_df[features], train_df["policy"]
    X_te, y_te = test_df[features], test_df["policy"]

    # Fit an unfitted copy of the baseline pipeline instead of refitting
    # `logreg` itself, so this cell does not change the model that other
    # cells inspect and can be re-run in any order with the same result.
    workload_model = clone(logreg).fit(X_tr, y_tr)
    pred_w = workload_model.predict(X_te)

    print(classification_report(y_te, pred_w))

    # Use every policy seen in either workload as a matrix label. Restricting
    # the labels to the training classes would silently drop test rows whose
    # policy never appeared under the training workload.
    workload_labels = sorted(set(y_tr) | set(y_te))
    cm = confusion_matrix(y_te, pred_w, labels=workload_labels)
    ConfusionMatrixDisplay(cm, display_labels=workload_labels).plot()
    plt.title(f"Workload split: train={train_w}, test={test_w}")
    plt.show()

## Simple distribution plots

These are basic plots you can reuse in the paper.
Keep them readable and minimal.


In [None]:
# One figure per feature, overlaying a histogram for each policy.
plot_features = ["cs_per_sec", "mig_per_sec", "ipc", "branch_miss_rate"]
policies = sorted(model_df["policy"].unique())

for f in plot_features:
    # Compute the bin edges once from the pooled values of all policies.
    # Letting each histogram pick its own 30 bins gives every policy a
    # different bin width, so the overlaid counts cannot be compared.
    bin_edges = np.histogram_bin_edges(model_df[f].to_numpy(dtype=float), bins=30)

    fig, ax = plt.subplots()
    for pol in policies:
        vals = model_df.loc[model_df["policy"] == pol, f].to_numpy(dtype=float)
        ax.hist(vals, bins=bin_edges, alpha=0.5, label=pol)
    ax.set_title(f"{f} by policy")
    ax.set_xlabel(f)
    ax.set_ylabel("count")
    ax.legend()
    plt.show()