<a href="https://colab.research.google.com/github/apropos0/Scheduling_Inference/blob/main/notebooks/01_data_and_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 01 — Data + Features

Loads a session CSV, runs feature engineering, and writes a processed Parquet artifact.

Expected raw input path:
- `data/raw/results_<SESSION_ID>.csv`

Output:
- `data/processed/clean_<SESSION_ID>.parquet`


In [None]:
# Start from a clean checkout: delete any previous clone, then fetch the repo.
# NOTE(review): `rm -rf` also removes anything added inside Scheduling_Inference/
# since the last clone (e.g. raw CSVs, if they are uploaded under its data/raw/
# directory — confirm where inputs are expected to live before re-running).
!rm -rf Scheduling_Inference
!git clone https://github.com/apropos0/Scheduling_Inference.git

In [None]:
# Session identifier for this run — change it per run. It is passed to
# raw_csv() and clean_parquet() below to pick the input CSV and output Parquet.
SESSION_ID = "2025-12-31_A"

In [None]:
import sys
# The repository is cloned into ./Scheduling_Inference; adding that directory
# to sys.path lets the `src.*` imports below resolve.
sys.path.append("Scheduling_Inference")

import pandas as pd

from src.paths import raw_csv, clean_parquet
from src.features import add_features


In [None]:
# Ask the project helper for this session's raw CSV path, and stop early with
# an actionable message if nothing exists at that path.
rp = raw_csv(SESSION_ID)
if rp.exists():
    raw = pd.read_csv(rp)
else:
    raise FileNotFoundError(
        f"Missing raw CSV: {rp}\n"
        f"Expected: Scheduling_Inference/data/raw/results_{SESSION_ID}.csv"
    )

# Quick confirmation of what was loaded, followed by a preview of the rows.
print("Loaded:", rp)
print("Shape:", raw.shape)
raw.head()

In [None]:
# Frequency tables for the categorical columns. "policy" and "workload" are
# accessed unconditionally; "session_id" is reported only when it is present.
for heading, column, always in (
    ("Policies:\n", "policy", True),
    ("Workloads:\n", "workload", True),
    ("Sessions:\n", "session_id", False),
):
    if always or column in raw.columns:
        print(heading, raw[column].value_counts(), "\n")

# Transposed summary (one row per input column), limited to the first 20 rows.
raw.describe(include="all").T.head(20)

In [None]:
# Run the project's feature engineering on the raw frame; the resulting `df`
# is what the following cells inspect and write out.
df = add_features(raw)
df.head()  # preview the first rows of the engineered frame

In [None]:
# Sanity-check a handful of columns in the engineered frame: summary
# statistics first, then the number of missing values per column.
cols = ["cs_per_sec", "mig_per_sec", "ipc", "branch_miss_rate"]
selected = df[cols]
print(selected.describe())
print("\nMissing values:\n", selected.isna().sum())

In [None]:
# Resolve the output path for this session and make sure its parent directory
# exists before writing (no error if it is already there).
outp = clean_parquet(SESSION_ID)
outp.parent.mkdir(parents=True, exist_ok=True)
# index=False: the DataFrame index is not written to the Parquet file.
df.to_parquet(outp, index=False)
print("Wrote:", outp)