<a href="https://colab.research.google.com/github/apropos0/Scheduling_Inference/blob/main/notebooks/02_models_robustness_figures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 02 — Models + Robustness + Figures

Loads processed features if present; otherwise rebuilds from raw CSV (load-or-build).


In [None]:
# Start from a clean checkout: delete any previous copy of the repo directory,
# then clone it again so the `src.*` modules imported below come from a fresh tree.
# NOTE(review): anything stored inside the checkout (e.g. a previously written
# processed parquet, if src.paths points there) is deleted by the `rm -rf` too — confirm.
!rm -rf Scheduling_Inference
!git clone https://github.com/apropos0/Scheduling_Inference.git

In [None]:
# Session identifier; passed to clean_parquet() and raw_csv() to locate this
# session's processed and raw data files.
SESSION_ID = "2025-12-31_A"

In [None]:
import sys
sys.path.append("Scheduling_Inference")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from src.paths import raw_csv, clean_parquet
from src.features import add_features


In [None]:
# Load-or-build: reuse the processed parquet for this session when it exists,
# otherwise derive it from the raw CSV and write it for next time.
p = clean_parquet(SESSION_ID)

if not p.exists():
    # No processed file yet -> rebuild from the raw CSV.
    rp = raw_csv(SESSION_ID)
    if not rp.exists():
        raise FileNotFoundError(
            f"Missing raw CSV: {rp}\n"
            f"Expected: Scheduling_Inference/data/raw/results_{SESSION_ID}.csv"
        )

    raw = pd.read_csv(rp)
    df = add_features(raw)

    # Make sure the output directory exists before writing the parquet file.
    p.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(p, index=False)
    print("Built + wrote processed:", p, "shape:", df.shape)
else:
    df = pd.read_parquet(p)
    print("Loaded processed:", p, "shape:", df.shape)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Columns used as model inputs; the prediction target is the `policy` column.
features = [
    "cs_per_sec",
    "mig_per_sec",
    "cycles_per_sec",
    "instr_per_sec",
    "branches_per_sec",
    "ipc",
    "branch_miss_rate",
]

# Fail early, with a readable message, if the processed frame lacks a needed column
# (dropna(subset=...) would raise a KeyError for this as well).
required_cols = features + ["policy"]
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Processed data is missing required columns: {missing_cols}")

# dropna() removes NaN but not +/-inf, and scikit-learn estimators reject non-finite
# input at fit time. Convert infinities to NaN first so those rows are dropped too.
# NOTE(review): presumably inf can arise in the ratio-style columns when a
# denominator is zero — confirm against src.features.add_features.
model_df = df.copy()
model_df[features] = model_df[features].replace([np.inf, -np.inf], np.nan)
model_df = model_df.dropna(subset=required_cols)

X = model_df[features]
y = model_df["policy"]

print("Rows:", model_df.shape[0])
print("Class balance:\n", y.value_counts())

In [None]:
# Stratified 75/25 holdout split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Standardize the features, then fit a logistic regression on the scaled values.
# The step names ("scaler", "clf") are looked up by name further down.
logreg = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=2000)),
    ]
)
logreg.fit(X_train, y_train)

pred = logreg.predict(X_test)

print("LogReg (holdout)")
print(classification_report(y_test, pred))

# Use the classifier's own class order for both the matrix rows/columns and tick labels.
lr_labels = logreg.named_steps["clf"].classes_
cm = confusion_matrix(y_test, pred, labels=lr_labels)
lr_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lr_labels)
lr_display.plot()
lr_display.ax_.set_title("Confusion matrix (LogReg holdout)")
plt.show()

In [None]:
# Random forest (300 trees, fixed seed) trained on the same holdout split.
rf = RandomForestClassifier(random_state=42, n_estimators=300).fit(X_train, y_train)

pred_rf = rf.predict(X_test)

print("RandomForest (holdout)")
print(classification_report(y_test, pred_rf))

# Confusion matrix laid out in the forest's own class order.
rf_labels = rf.classes_
cm = confusion_matrix(y_test, pred_rf, labels=rf_labels)
rf_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rf_labels)
rf_display.plot()
rf_display.ax_.set_title("Confusion matrix (RF holdout)")
plt.show()

In [None]:
# 5-fold stratified cross-validation (shuffled, fixed seed) over all rows,
# scoring both models by accuracy with the same fold definition.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_lr = cross_val_score(estimator=logreg, X=X, y=y, scoring="accuracy", cv=cv)
scores_rf = cross_val_score(estimator=rf, X=X, y=y, scoring="accuracy", cv=cv)

# Report mean and standard deviation of the fold accuracies for each model.
for model_name, fold_scores in (("LogReg", scores_lr), ("RF", scores_rf)):
    print(
        f"{model_name} CV acc:",
        float(fold_scores.mean()),
        "+/-",
        float(fold_scores.std()),
    )

In [None]:
# Table of the fitted logistic-regression coefficients: one column per feature.
# The coefficients apply to the standardized features, because the classifier
# sits behind the pipeline's StandardScaler step.
clf = logreg.named_steps["clf"]

# With more than two classes, coef_ has one row per class. For a binary target
# scikit-learn stores a single row (for classes_[1]), so indexing by all of
# classes_ would raise a shape-mismatch ValueError. Label that row accordingly.
if clf.coef_.shape[0] == len(clf.classes_):
    coef_row_labels = clf.classes_
else:
    coef_row_labels = clf.classes_[1:]

coef_df = pd.DataFrame(clf.coef_, columns=features, index=coef_row_labels)
coef_df

In [None]:
# Random-forest feature importances, labelled by feature name and ranked
# from most to least important.
rf_importances = pd.Series(data=rf.feature_importances_, index=features)
imp = rf_importances.sort_values(ascending=False)
imp