In [2]:

import os, time, json, sys, warnings, gc
warnings.filterwarnings("ignore", message=".*glibc.*")
warnings.filterwarnings("ignore")
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, roc_auc_score, average_precision_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedShuffleSplit

# ---- SETTINGS: flip these for speed/quality trade-offs ----
# FAST is the master switch: it gates the stratified subsample, the boosting
# iteration / forest size of the models, the plot grid width, and how many
# features the importance plot shows.
FAST = True             # True = much faster; False = full
FAST_FRAC = 0.25        # use 25% of train/test (stratified) when FAST
# NOTE(review): MAKE_HEAVY_PLOTS is not referenced anywhere in the visible code.
MAKE_HEAVY_PLOTS = not FAST  # skip heavy plots in FAST
TOPK_IMPORTANCE = 12 if FAST else 20  # number of bars in each feature-importance plot

def status(msg):
    """Print ``msg`` to stdout prefixed with the current wall-clock time as [HH:MM:SS]."""
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}")

# ---- Paths ----
# OUTDIR holds the two input files and also receives generated output:
# figures go under FIGDIR and the JSON run log is written to OUTDIR later on.
OUTDIR = "Data"
FIGDIR = os.path.join(OUTDIR, "figs")
os.makedirs(FIGDIR, exist_ok=True)  # also creates OUTDIR if it is missing
TRAIN_PATH = os.path.join(OUTDIR, "Train.txt")
TEST_PATH  = os.path.join(OUTDIR, "Test.txt")

# ---- Load ----
status("Loading data…")
# 43 column names applied positionally to both files; "attack" is the label
# column that is later split off as the target.
COLS = ["duration","protocoltype","service","flag","srcbytes","dstbytes","land","wrongfragment","urgent","hot",
        "numfailedlogins","loggedin","numcompromised","rootshell","suattempted","numroot","numfilecreations",
        "numshells","numaccessfiles","numoutboundcmds","ishostlogin","isguestlogin","count","srvcount","serrorrate",
        "srvserrorrate","rerrorrate","srvrerrorrate","samesrvrate","diffsrvrate","srvdiffhostrate","dsthostcount",
        "dsthostsrvcount","dsthostsamesrvrate","dsthostdiffsrvrate","dsthostsamesrcportrate","dsthostsrvdiffhostrate",
        "dsthostserrorrate","dsthostsrvserrorrate","dsthostrerrorrate","dsthostsrvrerrorrate","attack","lastflag"]
# Feature columns removed from both frames before modelling.
DROP_COLS = ["land","urgent","numfailedlogins","numoutboundcmds"]

# names= without header= makes pandas treat the first line of each file as data.
# NOTE(review): assumes Train.txt / Test.txt have no header row — confirm.
train_df = pd.read_csv(TRAIN_PATH, sep=",", names=COLS)
test_df  = pd.read_csv(TEST_PATH,  sep=",", names=COLS)
status(f"Loaded Train: {train_df.shape} | Test: {test_df.shape}")

# ---- Prep ----
def to_binary(df):
    """Return a copy of ``df`` whose "attack" column is collapsed to two classes.

    Rows whose label equals "normal" keep that label; every other label is
    replaced by the single string "attack". The input frame is not modified.
    """
    result = df.copy()
    is_normal = result["attack"] == "normal"
    result["attack"] = np.where(is_normal, "normal", "attack")
    return result

# Collapse both splits to the binary normal/attack target.
train_df, test_df = to_binary(train_df), to_binary(test_df)

# Remove the unwanted feature columns (only those actually present).
train_df = train_df.drop(columns=[c for c in DROP_COLS if c in train_df])
test_df = test_df.drop(columns=[c for c in DROP_COLS if c in test_df])

# Label encoders fit on combined to avoid unseen categories crash
status("Encoding categorical columns…")
LE_protocol, LE_service, LE_flag, LE_attack = (LabelEncoder() for _ in range(4))

# Train + test values of each categorical column, stacked row-wise.
proto_all, serv_all, flag_all, attack_all = (
    pd.concat([train_df[col], test_df[col]], axis=0)
    for col in ("protocoltype", "service", "flag", "attack")
)

for encoder, values in (
    (LE_protocol, proto_all),
    (LE_service, serv_all),
    (LE_flag, flag_all),
    (LE_attack, attack_all),
):
    encoder.fit(values)

# Replace the string columns with their integer codes in both frames.
FEATURE_ENCODERS = (
    ("protocoltype", LE_protocol),
    ("service", LE_service),
    ("flag", LE_flag),
)
for df in (train_df, test_df):
    for col, encoder in FEATURE_ENCODERS:
        df[col] = encoder.transform(df[col])
    df["attack"] = LE_attack.transform(df["attack"]).astype(int)

# Everything except the target is a feature.
# NOTE(review): "lastflag" stays in the feature matrix. If it is a
# difficulty/score column derived from the labels it would leak target
# information — confirm whether it belongs in DROP_COLS.
X_train, y_train = train_df.drop(columns=["attack"]), train_df["attack"]
X_test, y_test = test_df.drop(columns=["attack"]), test_df["attack"]

# ---- FAST subsample (stratified) ----
if FAST:
    status(f"FAST mode ON → subsampling {int(FAST_FRAC*100)}% stratified…")

    def strat_subsample(X, y, frac):
        """Return a class-stratified random subset of roughly ``frac`` of the rows.

        ``frac >= 1.0`` returns ``X`` and ``y`` unchanged. The shuffle is seeded
        (random_state=42), so repeated runs select the same rows.
        """
        if frac >= 1.0:
            return X, y
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=1-frac, random_state=42)
        # The "train" side of the single split is the part we keep.
        keep_idx, _discarded = next(splitter.split(X, y))
        return X.iloc[keep_idx], y.iloc[keep_idx]

    (X_train, y_train), (X_test, y_test) = (
        strat_subsample(X_train, y_train, FAST_FRAC),
        strat_subsample(X_test, y_test, FAST_FRAC),
    )
    status(f"New Train: {X_train.shape} | New Test: {X_test.shape}")

# ---- Scale ----
status("Scaling features…")
# Statistics come from the training split only and are reused for the test split.
scaler = StandardScaler()
scaler.fit(X_train)
Xtr, Xte = scaler.transform(X_train), scaler.transform(X_test)

# ---- Models (trimmed & fast) ----
status("Setting up models…")
# Insertion order is the order in which the models are trained and reported.
models = {}
models["HistGradBoost (FAST)"] = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=150 if FAST else 300,
    random_state=42,
)
models["RandomForest (small)"] = RandomForestClassifier(
    n_estimators=200 if FAST else 400,
    min_samples_leaf=2,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)
models["Logistic Reg."] = LogisticRegression(max_iter=400, n_jobs=-1)

# Optional XGBoost with early stopping (kept small)
def fit_xgb_early_stopping(model, X, y, train_frac=0.85, rounds=20):
    """Fit an XGBoost-style estimator with early stopping on a held-out tail.

    This logic used to run as bare module-level statements that read the loop
    variables ``name`` and ``model`` before any loop had bound them, so a fresh
    kernel failed with NameError. Wrapping it in a function makes the file
    importable and lets callers opt in explicitly.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` (and ``set_params`` / ``get_params`` when its
        ``fit`` does not accept ``early_stopping_rounds``).
    X : array-like
        Feature rows; sliced positionally with ``X[:split]`` / ``X[split:]``.
    y : array-like or pandas Series
        Targets aligned with ``X``; converted with ``np.asarray`` before slicing.
    train_frac : float, default 0.85
        Leading fraction of rows used for fitting; the rest is the validation set.
    rounds : int, default 20
        Early-stopping patience.

    Returns
    -------
    The fitted ``model`` (same object that was passed in).
    """
    import inspect  # local import: only needed when an XGBoost model is used

    split = int(train_frac * len(X))
    y_arr = np.asarray(y)
    X_fit, X_val = X[:split], X[split:]
    y_fit, y_val = y_arr[:split], y_arr[split:]
    eval_set = [(X_val, y_val)]

    # Older XGBoost takes early_stopping_rounds as a fit() keyword; newer
    # releases expect it as an estimator parameter instead.
    fit_accepts_rounds = "early_stopping_rounds" in inspect.signature(model.fit).parameters
    try:
        if fit_accepts_rounds:
            model.fit(
                X_fit, y_fit,
                eval_set=eval_set,
                early_stopping_rounds=rounds,
                verbose=False,
            )
        else:
            model.set_params(early_stopping_rounds=rounds)
            model.fit(X_fit, y_fit, eval_set=eval_set, verbose=False)
    except TypeError:
        # Last-resort fallback: no early stopping, fewer trees to stay FAST.
        if not fit_accepts_rounds:
            # Undo the parameter set above; without an eval_set it cannot be honoured.
            model.set_params(early_stopping_rounds=None)
        # n_estimators may be missing or None; fall back to 250 before capping.
        current = model.get_params().get("n_estimators") or 250
        model.set_params(n_estimators=min(current, 200))
        model.fit(X_fit, y_fit, verbose=False)
    return model

# ---- Train/Eval helpers ----
def eval_scores(y_true, y_pred, y_score):
    """Compute the evaluation metrics for one model.

    Returns a 6-tuple ``(accuracy, precision, recall, f1, roc_auc, avg_precision)``.
    Precision, recall and F1 use ``average="weighted"``. The two score-based
    metrics are ``np.nan`` when ``y_score`` is None.
    """
    weighted = {"average": "weighted"}
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **weighted)
    recall = recall_score(y_true, y_pred, **weighted)
    f1_weighted = f1_score(y_true, y_pred, **weighted)

    if y_score is None:
        auc, avg_prec = np.nan, np.nan
    else:
        auc = roc_auc_score(y_true, y_score)
        avg_prec = average_precision_score(y_true, y_score)

    return accuracy, precision, recall, f1_weighted, auc, avg_prec

# ---- Run ----
import inspect  # only used by the optional XGBoost branch below

RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
# Everything recorded here is dumped to JSON after the loop.
run_log = {"run_tag": RUN_TAG, "env": {"python": sys.version}, "FAST": FAST, "models": {}}
# cms:    model name -> (tn, fp, fn, tp)
# curves: model name -> (fpr, tpr, roc_auc), only for models that expose scores
cms, curves = {}, {}

status("Training & evaluating models…")
for name, model in models.items():
    status(f"→ Fitting: {name} …")
    t0 = time.time()
    if "XGBoost" in name:
        # early stopping with a small valid set from train to keep it fast
        split = int(0.85 * len(Xtr))
        Xtr_fit, ytr_fit = Xtr[:split], y_train.iloc[:split]
        Xval,    yval    = Xtr[split:], y_train.iloc[split:]
        if "early_stopping_rounds" in inspect.signature(model.fit).parameters:
            # Older XGBoost: early stopping is a fit() keyword.
            model.fit(Xtr_fit, ytr_fit, eval_set=[(Xval, yval)], verbose=False, early_stopping_rounds=20)
        else:
            # Newer XGBoost: early stopping is an estimator parameter.
            model.set_params(early_stopping_rounds=20)
            model.fit(Xtr_fit, ytr_fit, eval_set=[(Xval, yval)], verbose=False)
    else:
        model.fit(Xtr, y_train)
    fit_s = time.time() - t0
    status(f"   fit done in {fit_s:.2f}s — scoring…")

    y_pred = model.predict(Xte)
    # Prefer class-1 probabilities; fall back to the raw decision margin.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(Xte)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(Xte)
    else:
        y_score = None

    acc, pre, rec, f1, roc, ap = eval_scores(y_test, y_pred, y_score)
    # Pin the label order so the matrix is always 2x2, even when a class is
    # absent from both y_test and y_pred; otherwise the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    cms[name] = (tn, fp, fn, tp)

    if y_score is not None:
        fpr, tpr, _ = roc_curve(y_test, y_score)
        curves[name] = (fpr, tpr, roc)

    run_log["models"][name] = {
        "fit_seconds": round(fit_s, 2),
        "metrics": {
            "accuracy": round(acc, 4),
            "precision_weighted": round(pre, 4),
            "recall_weighted": round(rec, 4),
            "f1_weighted": round(f1, 4),
            # NaN becomes None so the log stays valid JSON.
            "roc_auc": round(float(roc), 4) if not np.isnan(roc) else None,
            "avg_precision": round(float(ap), 4) if not np.isnan(ap) else None,
        },
        "confusion_matrix": {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)},
    }
    # NaN already formats as "nan" with :.3f, so no special case is needed.
    status(f"   {name} — Acc:{acc:.3f} | F1_w:{f1:.3f} | AUC:{roc:.3f}")

# ---- Save log ----
log_path = os.path.join(OUTDIR, f"run_log_{RUN_TAG}.json")
with open(log_path, "w") as f:
    f.write(json.dumps(run_log, indent=2))
status(f"Saved log → {log_path}")

# ---- Quick leaderboard print ----
# Metric keys in the order they appear as leaderboard columns.
LEADERBOARD_METRICS = ("accuracy", "precision_weighted", "recall_weighted", "f1_weighted", "roc_auc")
rows = []
for name, rec in run_log["models"].items():
    m = rec["metrics"]
    rows.append([name, *(m[key] for key in LEADERBOARD_METRICS), rec["fit_seconds"]])
tbl = pd.DataFrame(rows, columns=["Model","Acc","Prec_w","Rec_w","F1_w","ROC_AUC","Fit(s)"])
# Best weighted F1 first.
tbl = tbl.sort_values("F1_w", ascending=False)
status("Leaderboard (Test):")
print(tbl.to_string(index=False))

# ---- Minimal plots in FAST; full only if MAKE_HEAVY_PLOTS ----
# Dark theme applied globally to every figure created from here on.
_PANEL, _INK, _MUTED, _GRID = "#0b1020", "#e9ecf1", "#8b93a6", "#2a3346"
plt.rcParams.update({
    # backgrounds
    "figure.facecolor": _PANEL,
    "axes.facecolor": _PANEL,
    # text
    "axes.labelcolor": _INK,
    "text.color": _INK,
    # ticks
    "xtick.color": _MUTED,
    "ytick.color": _MUTED,
    # grid
    "axes.grid": True,
    "grid.color": _GRID,
    "grid.alpha": 0.35,
})

def savefig(path):
    """Apply a tight layout to the current figure, write it to ``path``, then close it.

    Operates on pyplot's current figure, so call it right after drawing.
    The image is saved at 160 dpi with a tight bounding box.
    """
    plt.tight_layout()
    plt.savefig(path, dpi=160, bbox_inches="tight")
    plt.close()  # release the figure so figures do not accumulate across plots

# Confusion matrices (small grid)
status("Plotting confusion matrices…")
n = len(cms)
ncols = 2 if FAST else 3
nrows = -(-n // ncols)  # ceiling division: enough rows for every model
plt.figure(figsize=(6*ncols, 3.7*nrows))
class_ticks = ["Attack(0)","Normal(1)"]
for i, (name, counts) in enumerate(cms.items(), start=1):
    plt.subplot(nrows, ncols, i)
    # counts is (tn, fp, fn, tp): rows are actual class, columns predicted class.
    cm = np.array(counts).reshape(2, 2)
    im = plt.imshow(cm, cmap="magma")
    for r in range(2):
        for c in range(2):
            val = cm[r, c]
            plt.text(c, r, f"{val:,}", ha="center", va="center", color="#e9ecf1")
    plt.xticks([0,1], class_ticks, rotation=15)
    plt.yticks([0,1], class_ticks)
    plt.title(f"{name}")
    plt.colorbar(im, fraction=0.046, pad=0.04)
savefig(os.path.join(FIGDIR, "confusion_matrices_fast.png"))

# ROC (only if scores exist)
if curves:
    status("Plotting ROC curves…")
    plt.figure(figsize=(6.5,5.5))
    # One line per model that produced scores; AUC is shown in the legend.
    for name, curve in curves.items():
        fpr, tpr, aucv = curve
        legend_text = f"{name} (AUC={aucv:.3f})"
        plt.plot(fpr, tpr, label=legend_text, linewidth=2)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0,1],[0,1],"--", color="#8b93a6")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC — Test")
    plt.legend(frameon=False, loc="lower right")
    savefig(os.path.join(FIGDIR, "roc_curves_fast.png"))

# Feature importance (only for tree models with attr)
status("Plotting feature importance…")
def plot_feature_importance(model, name, feature_names, topk=TOPK_IMPORTANCE):
    """Save a horizontal bar chart of the ``topk`` largest feature importances.

    Does nothing when ``model`` has no ``feature_importances_`` attribute.
    The figure is written to FIGDIR as ``featimp_<name>.png`` with spaces in
    ``name`` replaced by underscores.
    """
    if not hasattr(model, "feature_importances_"):
        return

    importances = model.feature_importances_
    # Indices of the topk features, most important first.
    top_idx = np.argsort(importances)[::-1][:topk]
    top_names = np.array(feature_names)[top_idx]
    top_values = importances[top_idx]

    plt.figure(figsize=(7,0.4*topk+2))
    plt.barh(top_names, top_values, color="#7c83ff", edgecolor="black")
    # Flip the axis so the most important feature sits at the top.
    plt.gca().invert_yaxis()
    plt.xlabel("Importance")
    plt.title(f"{name} — Top {topk}")
    savefig(os.path.join(FIGDIR, f"featimp_{name.replace(' ','_')}.png"))

# Try every model; plot_feature_importance skips those without a
# ``feature_importances_`` attribute, so only some models produce a figure.
for name, mdl in models.items():
    plot_feature_importance(mdl, name, X_train.columns, topk=TOPK_IMPORTANCE)

status(f"Done. Figures in {FIGDIR}")
# The trailing semicolon keeps the notebook from echoing gc.collect()'s return value.
gc.collect();


[20:23:54] Loading data…
[20:23:54] Loaded Train: (125973, 43) | Test: (22544, 43)
[20:23:54] Encoding categorical columns…
[20:23:54] FAST mode ON → subsampling 25% stratified…
[20:23:54] New Train: (31493, 38) | New Test: (5636, 38)
[20:23:54] Scaling features…
[20:23:54] Setting up models…
[20:23:55] Training & evaluating models…
[20:23:55] → Fitting: HistGradBoost (FAST) …
[20:23:55]    fit done in 0.54s — scoring…
[20:23:55]    HistGradBoost (FAST) — Acc:0.843 | F1_w:0.843 | AUC:0.980
[20:23:55] → Fitting: RandomForest (small) …
[20:23:56]    fit done in 0.53s — scoring…
[20:23:56]    RandomForest (small) — Acc:0.816 | F1_w:0.815 | AUC:0.983
[20:23:56] → Fitting: Logistic Reg. …
[20:23:57]    fit done in 1.33s — scoring…
[20:23:57]    Logistic Reg. — Acc:0.836 | F1_w:0.836 | AUC:0.924
[20:23:57] Saved log → Data/run_log_20251027_202355.json
[20:23:57] Leaderboard (Test):
                Model     Acc  Prec_w   Rec_w    F1_w  ROC_AUC  Fit(s)
 HistGradBoost (FAST)  0.8426  0.8731  0

In [3]:
# ===== Better Confusion Matrices (counts + row % with nicer colors) =====
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def _fmt_cm_annot(cm):
    """Return string annotations like '9,443\n97.2%' for each cell."""
    cm = cm.astype(float)
    row_sum = cm.sum(axis=1, keepdims=True)
    row_sum[row_sum == 0] = 1.0
    pct = cm / row_sum * 100.0
    annot = np.empty_like(cm, dtype=object)
    for r in range(cm.shape[0]):
        for c in range(cm.shape[1]):
            annot[r, c] = f"{int(cm[r, c]):,}\n{pct[r, c]:.1f}%"
    return annot, pct

def plot_confusion_matrix_pretty(cm_counts, labels=("Attack(0)", "Normal(1)"), title="Confusion Matrix", save_path=None):
    """Draw one confusion matrix as a heatmap annotated with counts and row %.

    ``cm_counts`` is laid out as ``[[tn, fp], [fn, tp]]``. Cell colour follows
    the row-normalised percentage (0-100). When ``save_path`` is given the
    figure is written there and closed; otherwise it is shown.
    """
    counts = np.array(cm_counts, dtype=float)
    cell_text, row_pct = _fmt_cm_annot(counts)

    plt.figure(figsize=(5.8, 4.6))
    heatmap_kwargs = {
        "annot": cell_text,          # count + percentage in every cell
        "fmt": "",                   # annotations are already formatted strings
        "cmap": "crest",
        "vmin": 0,
        "vmax": 100,
        "cbar_kws": {"label": "Row %"},
        "linewidths": 0.5,
        "linecolor": "white",
    }
    ax = sns.heatmap(row_pct, **heatmap_kwargs)

    ax.set_xticklabels(labels, rotation=15)
    ax.set_yticklabels(labels, rotation=0)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(title, pad=10)
    plt.tight_layout()

    if not save_path:
        plt.show()
        return
    plt.savefig(save_path, dpi=160, bbox_inches="tight")
    plt.close()

# --- Use the counts we already stored in `cms` ---
# Each entry is `cms[name] = (tn, fp, fn, tp)` as filled by the training loop.
status("Plotting improved confusion matrices with percentages…")
n = len(cms)
ncols = 2 if FAST else 3
nrows = -(-n // ncols)  # ceiling division: enough rows for every model
plt.figure(figsize=(6*ncols, 4.6*nrows))

class_names = ["Attack(0)","Normal(1)"]
tick_centres = [0.5,1.5]  # heatmap cells are centred on half-integers
pretty_path = os.path.join(FIGDIR, "confusion_matrices_pretty.png")

for i, (name, counts) in enumerate(cms.items(), start=1):
    plt.subplot(nrows, ncols, i)
    annot, pct = _fmt_cm_annot(np.array(counts).reshape(2, 2))
    sns.heatmap(
        pct,
        annot=annot,
        fmt="",
        cmap="crest",
        vmin=0,
        vmax=100,
        cbar_kws={"label": "Row %"},
        linewidths=0.5,
        linecolor="white",
    )
    plt.xticks(tick_centres, class_names, rotation=15)
    plt.yticks(tick_centres, class_names, rotation=0)
    plt.title(name, pad=8)

plt.tight_layout()
savefig(pretty_path)
status("Saved → " + pretty_path)


[20:26:28] Plotting improved confusion matrices with percentages…
[20:26:28] Saved → Data/figs/confusion_matrices_pretty.png
