In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and print the full path of every file found,
# so the dataset file names are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # os.walk yields bare file names; join with the directory to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cybersecurity-dataset/CIC.csv
/kaggle/input/cybersecurity-dataset/unswnb15.csv
/kaggle/input/cybersecurity-dataset/IOT-23.csv


In [None]:
from __future__ import annotations
from dataclasses import dataclass
from typing import Optional, List, Tuple

import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt

from torch_geometric.data import HeteroData
from torch_geometric.nn import Linear, RGCNConv

# HGT optional
# Probe for HGTConv once at import time and record the outcome in _HGT_AVAILABLE;
# HGTFlowClassifier checks this flag and refuses to construct when it is False.
try:
    from torch_geometric.nn import HGTConv  # noqa
    _HGT_AVAILABLE = True
except Exception:
    # Any failure while importing (not only ImportError) is treated as "HGT unavailable".
    _HGT_AVAILABLE = False


# =============================
# Utils
# =============================
def set_seed(seed: int = 42) -> None:
    """
    Seed every random number generator used in this notebook with the same value.

    Seeds, in order: Python's `random`, NumPy's global RNG, PyTorch's CPU RNG and
    PyTorch's RNGs for all CUDA devices.

    Args:
        seed: The integer seed handed to each generator.
    """
    import random as _py_random

    seeders = (
        _py_random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


def ensure_dir(path: str) -> None:
    """
    Create directory `path` (including missing parents); do nothing if it already exists.

    Args:
        path: Directory to create.
    """
    os.makedirs(name=path, exist_ok=True)


def save_text(path: str, text: str) -> None:
    """
    Write `text` to `path` as UTF-8, replacing any existing file content.

    Args:
        path: Destination file path.
        text: Content to write.
    """
    with open(path, mode="w", encoding="utf-8") as sink:
        sink.write(text)


def signed_log1p_np(x: np.ndarray) -> np.ndarray:
    """
    Element-wise sign(x) * log(1 + |x|).

    Compresses large magnitudes while keeping the sign, so it is defined for
    negative inputs (unlike a plain log1p).
    """
    magnitude = np.log1p(np.abs(x))
    return np.sign(x) * magnitude


def sanitize_numeric_frame(
    df: pd.DataFrame,
    cols: Optional[List[str]] = None,
    *,
    transform: str = "none",  # "none" | "log1p" | "signed_log1p"
    clip_lower: Optional[float] = None,  # e.g. 0.0 if you know features are non-negative
) -> pd.DataFrame:
    """
    Build a float32 feature frame in which every NaN/inf has been replaced by 0.

    Steps, in order: select columns, replace +/-inf and NaN with 0, optionally clip
    from below, apply the requested transform, then zero out any non-finite values
    the transform produced (e.g. log1p of a value <= -1).

    Args:
        df: Source frame.
        cols: Columns to use; when None, all numeric-dtype columns of `df` are used.
        transform: "none", "log1p" or "signed_log1p".
        clip_lower: If given, values below this bound are raised to it before the transform.

    Returns:
        A new DataFrame with the selected column names and a fresh default index.

    Raises:
        ValueError: If `transform` is not one of the supported names.
    """
    selected = df[cols] if cols is not None else df.select_dtypes(include=["number"])
    frame = selected.copy()

    frame = frame.replace([np.inf, -np.inf], np.nan)
    frame = frame.fillna(0.0)
    if clip_lower is not None:
        frame = frame.clip(lower=clip_lower)

    values = frame.values.astype(np.float64, copy=False)

    if transform == "none":
        pass
    elif transform == "log1p":
        # log1p is only finite for x > -1; callers with non-negative features pass clip_lower=0.0
        values = np.log1p(values)
    elif transform == "signed_log1p":
        values = signed_log1p_np(values)
    else:
        raise ValueError("transform must be one of: none, log1p, signed_log1p")

    cleaned = np.nan_to_num(values, nan=0.0, posinf=0.0, neginf=0.0)
    return pd.DataFrame(cleaned.astype(np.float32), columns=frame.columns)


def debug_tensor_stats(name: str, x: torch.Tensor) -> str:
    """
    Summarise a tensor in one line: shape, fraction of finite entries, min and max.

    Min and max are computed after replacing NaN/+inf/-inf with 0, so a tensor
    containing non-finite values still yields printable bounds.

    Args:
        name: Label placed at the start of the summary.
        x: Tensor to inspect (moved to CPU and detached; the input is not modified).

    Returns:
        A string of the form
        "<name>: shape=(...) finite_frac=F min=M max=X".
        For an empty tensor all three statistics are reported as nan instead of raising.
    """
    x_cpu = x.detach().cpu()

    if x_cpu.numel() == 0:
        # min()/max() have no identity on empty tensors and would raise; this is a
        # diagnostic helper, so report nan rather than abort the run.
        frac = mn = mx = float("nan")
    else:
        frac = float(torch.isfinite(x_cpu).float().mean().item())
        # Sanitise once and reuse for both bounds.
        cleaned = torch.nan_to_num(x_cpu, nan=0.0, posinf=0.0, neginf=0.0)
        mn = float(cleaned.min().item())
        mx = float(cleaned.max().item())

    return f"{name}: shape={tuple(x_cpu.shape)} finite_frac={frac:.6f} min={mn:.4f} max={mx:.4f}"


# =============================
# Confusion matrix figures
# =============================
def save_confusion_matrix(
    cm: np.ndarray,
    labels: List[str],
    out_base: str,
    title: str,
    normalize: Optional[str] = None,  # None | "true" (row) | "pred" (col)
    dpi: int = 300,
):
    """
    Render a confusion matrix as an annotated heat map and save it as PNG and PDF.

    Args:
        cm: Confusion matrix; rows are indexed as true classes and columns as
            predicted classes in the plot. Expected to be len(labels) x len(labels).
        labels: Class names used for both axes.
        out_base: Output path without extension; ".png" and ".pdf" are appended.
        title: Figure title.
        normalize: None for raw counts, "true" to divide each row by its sum,
            "pred" to divide each column by its sum. Zero sums are treated as 1.
        dpi: Resolution of the PNG file.

    Raises:
        ValueError: If `normalize` is not None, "true" or "pred".
    """
    counts = np.asarray(cm, dtype=float)
    raw_mode = normalize is None

    if raw_mode:
        shown = counts
        cell_text = counts.astype(int).astype(str)
    else:
        if normalize == "pred":
            totals = counts.sum(axis=0, keepdims=True)
        elif normalize == "true":
            totals = counts.sum(axis=1, keepdims=True)
        else:
            raise ValueError("normalize must be None, 'true', or 'pred'")
        # Avoid 0/0 for classes that never occur along the normalised axis.
        totals[totals == 0] = 1.0
        shown = counts / totals
        cell_text = np.vectorize(lambda x: f"{x:.2f}")(shown)

    n_classes = len(labels)
    # Grow the canvas with the number of classes, but never below a readable minimum.
    width = max(6.5, 0.38 * n_classes)
    height = max(5.5, 0.38 * n_classes)

    fig = plt.figure(figsize=(width, height))
    ax = fig.add_subplot(111)

    if raw_mode:
        im = ax.imshow(shown, interpolation="nearest", cmap="Blues")
        thresh = shown.max() * 0.6
    else:
        im = ax.imshow(shown, interpolation="nearest",
                       cmap="Blues", vmin=0, vmax=1)
        thresh = 0.6

    ax.set_title(title, fontsize=14)
    ax.set_xlabel("Predicted", fontsize=12)
    ax.set_ylabel("True", fontsize=12)

    tick_positions = np.arange(n_classes)
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(labels, fontsize=10, rotation=90)
    ax.set_yticklabels(labels, fontsize=10)

    # Cells darker than the threshold get white text so the annotation stays legible.
    for row, col in np.ndindex(n_classes, n_classes):
        text_color = "white" if shown[row, col] > thresh else "black"
        ax.text(
            col, row, cell_text[row, col],
            ha="center", va="center",
            fontsize=9,
            color=text_color,
        )

    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()

    fig.savefig(out_base + ".png", dpi=dpi, bbox_inches="tight")
    fig.savefig(out_base + ".pdf", bbox_inches="tight")
    # Release the figure; this function is called many times per run.
    plt.close(fig)


# =============================
# Baselines
# =============================
@dataclass
class BaselineResult:
    """Evaluation artefacts of one tabular baseline, as produced by `run_baselines`."""
    # Model identifier, e.g. "LogisticRegression"; used in output file names.
    name: str
    # Text returned by sklearn's classification_report for the test split.
    report: str
    # Confusion matrix for the test split, one row/column per entry of the label list.
    cm: np.ndarray


def run_baselines(
    X: pd.DataFrame,
    y_enc: pd.Series,
    labels: List[str],
    seed: int = 42,
) -> List[BaselineResult]:
    """
    Train and evaluate three tabular baselines on a stratified 70/30 split.

    Only numeric columns of `X` are used; +/-inf and NaN are replaced by 0.
    The models, in order, are a scaled logistic regression, a random forest and
    a scaled two-layer MLP.

    Args:
        X: Feature frame (non-numeric columns are ignored).
        y_enc: Integer-encoded targets aligned with the rows of `X`.
        labels: Class names; position i names class id i.
        seed: Random state for the split, the forest and the MLP.

    Returns:
        One BaselineResult per model, in the order listed above.
    """
    features = X.select_dtypes(include=["number"]).copy()
    features = features.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    targets = y_enc.values

    X_train, X_test, y_train, y_test = train_test_split(
        features.values,
        targets,
        test_size=0.30,
        random_state=seed,
        stratify=targets,
    )

    candidates = [
        (
            "LogisticRegression",
            Pipeline([
                ("scaler", StandardScaler()),
                ("clf", LogisticRegression(max_iter=3000, n_jobs=-1)),
            ]),
        ),
        (
            "RandomForest",
            RandomForestClassifier(n_estimators=300, random_state=seed, n_jobs=-1),
        ),
        (
            "MLP",
            Pipeline([
                ("scaler", StandardScaler()),
                ("clf", MLPClassifier(hidden_layer_sizes=(256, 128), activation="relu", max_iter=50, random_state=seed)),
            ]),
        ),
    ]

    # Fix the confusion-matrix axes to the full label list so absent classes keep their slot.
    class_ids = np.arange(len(labels))

    outcomes = []
    for model_name, estimator in candidates:
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)
        report = classification_report(y_test, predicted, target_names=labels, zero_division=0)
        matrix = confusion_matrix(y_test, predicted, labels=class_ids)
        outcomes.append(BaselineResult(model_name, report, matrix))
    return outcomes


# =============================
# Graph utilities
# =============================
def make_stratified_masks_from_y(
    y: np.ndarray,
    train: float = 0.7,
    val: float = 0.15,
    seed: int = 42,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Split sample indices into stratified train/val/test sets and return boolean masks.

    The split is done in two stages: first train vs. the remainder, then the
    remainder into val vs. test. The test share is whatever is left after
    `train` and `val`.

    Args:
        y: Class label per sample; used for stratification.
        train: Fraction of samples for training.
        val: Fraction of samples for validation.
        seed: Random state used for both stages.

    Returns:
        (train_mask, val_mask, test_mask), each a bool tensor of length len(y).
    """
    n = len(y)
    holdout_frac = 1.0 - train

    idx_train, idx_rest, _, y_rest = train_test_split(
        np.arange(n), y, test_size=holdout_frac, random_state=seed, stratify=y
    )

    # Express the test share relative to the held-out remainder, not the full set.
    test_share_of_rest = 1.0 - (val / holdout_frac)
    idx_val, idx_test = train_test_split(
        idx_rest, test_size=test_share_of_rest, random_state=seed, stratify=y_rest
    )

    masks = []
    for members in (idx_train, idx_val, idx_test):
        flag = torch.zeros(n, dtype=torch.bool)
        flag[members] = True
        masks.append(flag)
    return tuple(masks)


def add_reverse_edges_hetero(data: HeteroData, rev_prefix: str = "rev_") -> HeteroData:
    """
    Add a reversed copy of every edge type so messages can flow in both directions.

    For each (src, rel, dst) the edge type (dst, rev_prefix + rel, src) is created with
    the two rows of edge_index swapped. Relations that already start with
    `rev_prefix`, or whose reverse already exists, are left alone. Only
    `edge_index` is mirrored.

    Args:
        data: Graph to extend; modified in place.
        rev_prefix: Prefix used to name reverse relations.

    Returns:
        The same `data` object.
    """
    # Snapshot the edge types: new ones are added while iterating.
    for edge_type in list(data.edge_types):
        src, rel, dst = edge_type
        mirrored = (dst, f"{rev_prefix}{rel}", src)
        if rel.startswith(rev_prefix) or mirrored in data.edge_types:
            continue
        data[mirrored].edge_index = data[edge_type].edge_index.flip(0)
    return data


def torch_confusion_matrix(y_true: torch.Tensor, y_pred: torch.Tensor, num_classes: int) -> torch.Tensor:
    """
    Count (true class, predicted class) pairs into a num_classes x num_classes matrix.

    Implemented with a single `torch.bincount` over the flattened pair index
    `true * num_classes + pred` instead of a Python loop, so the cost no longer
    scales with one indexed write per sample.

    Args:
        y_true: Integer class ids (any shape; flattened).
        y_pred: Integer class ids with the same number of elements as `y_true`.
        num_classes: Size of each axis of the result.

    Returns:
        Long tensor of shape (num_classes, num_classes) on `y_true`'s device;
        entry [t, p] is the number of samples with true class t predicted as p.

    Raises:
        ValueError: If the two inputs do not have the same number of elements.
        IndexError: If any class id lies outside [0, num_classes).
    """
    true_flat = y_true.reshape(-1).long()
    pred_flat = y_pred.reshape(-1).long()

    if true_flat.numel() != pred_flat.numel():
        raise ValueError(
            f"y_true and y_pred must have the same number of elements "
            f"(got {true_flat.numel()} and {pred_flat.numel()})"
        )

    if true_flat.numel() > 0:
        lowest = min(int(true_flat.min().item()), int(pred_flat.min().item()))
        highest = max(int(true_flat.max().item()), int(pred_flat.max().item()))
        # Out-of-range ids would otherwise be folded into the wrong cell by the flat index.
        if lowest < 0 or highest >= num_classes:
            raise IndexError(
                f"class ids must lie in [0, {num_classes}); found range [{lowest}, {highest}]"
            )

    pair_index = true_flat * num_classes + pred_flat
    counts = torch.bincount(pair_index, minlength=num_classes * num_classes)
    return counts.reshape(num_classes, num_classes)


def macro_f1_from_cm_np(cm: np.ndarray) -> float:
    """
    Macro-averaged F1 computed from a confusion matrix (rows = true, columns = predicted).

    Every class contributes equally to the mean, including classes with no true or
    predicted samples, whose F1 evaluates to 0.

    Args:
        cm: Square confusion matrix.

    Returns:
        The unweighted mean of the per-class F1 scores.
    """
    counts = cm.astype(float)
    hits = np.diag(counts)

    # Lower-bound the denominators so empty rows/columns give 0 instead of NaN.
    predicted_totals = np.clip(counts.sum(axis=0), 1, None)
    actual_totals = np.clip(counts.sum(axis=1), 1, None)

    precision = hits / predicted_totals
    recall = hits / actual_totals

    harmonic_denominator = np.clip(precision + recall, 1e-12, None)
    per_class_f1 = 2 * precision * recall / harmonic_denominator
    return float(np.mean(per_class_f1))


def standardize_flow_x_inplace(data: HeteroData) -> None:
    """
    Z-score the "flow" node features using statistics of the training rows only.

    The scaler is fitted on rows selected by `data["flow"].train_mask` and applied
    to all rows; any non-finite result is replaced by 0. `data["flow"].x` is
    overwritten with a new float32 CPU tensor.

    Args:
        data: Graph whose "flow" store has `x` and `train_mask`; modified in place.
    """
    features = data["flow"].x.detach().cpu().numpy()
    is_train = data["flow"].train_mask.detach().cpu().numpy().astype(bool)

    # Fit on the training subset only so val/test statistics do not leak into the scaling.
    scaler = StandardScaler()
    scaler.fit(features[is_train])

    scaled = np.nan_to_num(scaler.transform(features), nan=0.0, posinf=0.0, neginf=0.0)
    data["flow"].x = torch.tensor(scaled.astype(np.float32), dtype=torch.float32)


def compute_class_weights_from_train(data: HeteroData) -> torch.Tensor:
    """
    Inverse-frequency class weights from the training split, normalised to mean 1.

    The weight vector always has one entry per class id in
    `0 .. data["flow"].y.max()`, which is how the models in this file size their
    output layer. Previously the length followed the largest class id present in
    the *training* rows, so a class missing from the end of that range produced a
    vector that was too short for `F.cross_entropy`.

    Args:
        data: Graph whose "flow" store has integer labels `y` and a boolean `train_mask`.

    Returns:
        float32 tensor of shape (num_classes,). Classes with no training samples
        are treated as having a count of 1.
    """
    y = data["flow"].y.detach().cpu().numpy()
    train_mask = data["flow"].train_mask.detach().cpu().numpy().astype(bool)
    y_tr = y[train_mask]

    # Size by all labels, not just the training ones, so every class id has a slot.
    num_classes = int(y.max()) + 1 if y.size else 0
    counts = np.bincount(y_tr, minlength=num_classes)
    # Avoid division by zero for classes absent from the training split.
    counts = np.clip(counts, 1, None)

    w = 1.0 / counts
    # Rescale so the weights average to 1.
    w = w * (len(counts) / w.sum())
    return torch.tensor(w, dtype=torch.float32)


# =============================
# HetGNN
# =============================
@torch.no_grad()
def sample_fixed_neighbors(edge_index: torch.Tensor, num_src: int, k: int = 1) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Pick up to `k` destination neighbours for every source node.

    Edges are sorted by source id and the first `k` destinations of each source in
    that sorted order are taken; no random sampling is performed.

    Args:
        edge_index: Long tensor of shape (2, E); row 0 holds source ids, row 1 destination ids.
        num_src: Number of source nodes (rows of the result).
        k: Maximum number of neighbours kept per source.

    Returns:
        (neighbors, valid): `neighbors` is a (num_src, k) long tensor padded with -1,
        `valid` is a (num_src, k) bool tensor marking the filled slots.
    """
    device = edge_index.device

    order = torch.argsort(edge_index[0])
    sorted_src = edge_index[0][order]
    sorted_dst = edge_index[1][order]

    # Out-degree per source node; nodes without edges keep degree 0.
    present, present_degree = torch.unique_consecutive(sorted_src, return_counts=True)
    degree = torch.zeros(num_src, device=device, dtype=torch.long)
    degree[present] = present_degree

    # Offset of each source's first edge within the sorted edge list.
    first_edge = degree.cumsum(0) - degree

    neighbors = torch.full((num_src, k), -1, device=device, dtype=torch.long)
    valid = degree.unsqueeze(1) > torch.arange(k, device=device).unsqueeze(0)

    for slot in range(k):
        rows = degree > slot
        neighbors[rows, slot] = sorted_dst[first_edge[rows] + slot]
    return neighbors, valid


@torch.no_grad()
def build_flow_neighbor_cache(data: HeteroData, k: int = 1, src_type: str = "flow"):
    """
    Precompute fixed neighbour tables for every edge type that starts at `src_type`.

    Args:
        data: Heterogeneous graph.
        k: Maximum number of neighbours kept per source node and edge type.
        src_type: Node type whose outgoing edge types are processed.

    Returns:
        Dict mapping each matching edge type to the (neighbors, valid) pair returned
        by `sample_fixed_neighbors`.
    """
    num_src = data[src_type].num_nodes
    return {
        edge_type: sample_fixed_neighbors(data[edge_type].edge_index, num_src=num_src, k=k)
        for edge_type in data.edge_types
        if edge_type[0] == src_type
    }


class NodeEncoder(nn.Module):
    """
    Maps nodes of one type to `hidden`-dimensional vectors.

    Node types with features get a linear projection; featureless node types get a
    learned embedding table indexed by node id. The table is capped at
    `max_id_embed` rows, and ids are folded into it with a modulo, so ids beyond
    the cap share rows.

    Args:
        in_dim: Feature width, or None for a featureless node type.
        num_nodes: Number of nodes of this type (used to size the embedding table).
        hidden: Output width.
        max_id_embed: Upper bound on the number of embedding rows.
    """

    def __init__(self, in_dim: Optional[int], num_nodes: int, hidden: int, max_id_embed: int = 200_000):
        super().__init__()
        self.has_x = in_dim is not None
        # Exactly one of `lin` / `emb` is a module; the other stays None.
        self.lin = None
        self.emb = None
        self.emb_size = None
        if self.has_x:
            self.lin = nn.Linear(in_dim, hidden)
        else:
            self.emb_size = int(min(num_nodes, max_id_embed))
            self.emb = nn.Embedding(self.emb_size, hidden)

    def forward(self, x_or_ids: torch.Tensor) -> torch.Tensor:
        """Encode a feature tensor (has_x) or a tensor of integer node ids (otherwise)."""
        if not self.has_x:
            # Wrap ids into the (possibly capped) table.
            return self.emb(x_or_ids % self.emb_size)
        return self.lin(x_or_ids)


class HetGNN(nn.Module):
    """
    Attention-based classifier for "flow" nodes of a heterogeneous graph.

    For every edge type whose source is "flow", up to `k` neighbours per flow are
    looked up once at construction time, encoded with a per-node-type NodeEncoder,
    passed through a per-relation bidirectional GRU and pooled with masked attention.
    The flow's own embedding and the per-relation summaries are then combined by a
    second attention layer and fed to an MLP classifier.

    Notes:
        * The neighbour tables are built from the `data` given to `__init__`;
          `forward` reuses them and only reads features/labels from its argument.
        * The GRU outputs 2 * (hidden // 2) features while the attention layers
          expect `hidden`, so `hidden` needs to be even.

    Args:
        data: Graph used to read metadata, feature widths and neighbour tables.
        hidden: Embedding width.
        out: Number of classes; inferred as max label + 1 when None.
        k: Number of neighbours kept per flow and relation.
        dropout: Dropout probability for embeddings and inside the classifier.
    """

    def __init__(self, data: HeteroData, hidden: int = 128, out: Optional[int] = None, k: int = 1, dropout: float = 0.2):
        super().__init__()
        self.metadata = data.metadata()
        node_types, edge_types = self.metadata
        if out is None:
            out = int(data["flow"].y.max().item() + 1)

        self.dropout = dropout
        # One encoder per node type: linear if the type has features, embedding otherwise.
        self.enc = nn.ModuleDict()
        for nt in node_types:
            in_dim = data[nt].x.size(-1) if ("x" in data[nt]) else None
            self.enc[nt] = NodeEncoder(in_dim, data[nt].num_nodes, hidden)

        # Only relations leaving "flow" are aggregated; reverse relations are ignored here.
        self.groups = [et for et in edge_types if et[0] == "flow"]
        # Plain dict of tensors (not registered buffers); moved to the right device in forward().
        self.cache = build_flow_neighbor_cache(data, k=k, src_type="flow")

        self.gru = nn.ModuleDict()
        self.nei_att = nn.ModuleDict()
        for et in self.groups:
            name = self._gname(et)
            # Bidirectional with hidden // 2 units per direction, so the output width is `hidden`.
            self.gru[name] = nn.GRU(hidden, hidden // 2, batch_first=True, bidirectional=True)
            self.nei_att[name] = nn.Linear(hidden, 1)

        # Scores each candidate vector from [candidate ; self] concatenation.
        self.mix_att = nn.Linear(2 * hidden, 1)
        self.leakyrelu = nn.LeakyReLU(0.2)

        self.classifier = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, out),
        )

    def _gname(self, et) -> str:
        """Turn an edge-type tuple into a string usable as a ModuleDict key."""
        return f"{et[0]}__{et[1]}__{et[2]}"

    def forward(self, data: HeteroData) -> torch.Tensor:
        """Return class logits for every "flow" node, shape (num_flow_nodes, out)."""
        device = data["flow"].y.device
        N = data["flow"].num_nodes

        if "x" in data["flow"]:
            f_self = self.enc["flow"](data["flow"].x)
        else:
            ids = torch.arange(N, device=device)
            f_self = self.enc["flow"](ids)

        f_self = F.dropout(f_self, p=self.dropout, training=self.training)
        # First entry is the flow's own embedding; one neighbour summary per relation follows.
        F_list = [f_self]

        for et in self.groups:
            name = self._gname(et)
            dst_type = et[2]

            neigh_ids, neigh_mask = self.cache[et]
            neigh_ids = neigh_ids.to(device)
            neigh_mask = neigh_mask.to(device)
            # Padding slots hold -1; clamp so they are valid indices, then mask them out below.
            safe_ids = neigh_ids.clamp(min=0)

            if "x" in data[dst_type]:
                x = data[dst_type].x[safe_ids]
                h0 = self.enc[dst_type](x)
            else:
                h0 = self.enc[dst_type](safe_ids)

            # Zero the embeddings of padding slots before the GRU.
            h0 = h0 * neigh_mask.unsqueeze(-1)
            gru_out, _ = self.gru[name](h0)

            scores = self.nei_att[name](gru_out).squeeze(-1)
            scores = scores.masked_fill(~neigh_mask, -1e9)
            alpha = F.softmax(scores, dim=1)
            # Re-mask and renormalise so rows with no valid neighbour get all-zero weights
            # (softmax alone would spread weight uniformly over the padding).
            alpha = alpha * neigh_mask.float()
            alpha = alpha / alpha.sum(dim=1, keepdim=True).clamp(min=1e-12)

            f_nei = (alpha.unsqueeze(-1) * gru_out).sum(dim=1)
            f_nei = F.dropout(f_nei, p=self.dropout, training=self.training)
            F_list.append(f_nei)

        # (N, 1 + num_relations, hidden)
        F_stack = torch.stack(F_list, dim=1)
        f_self_rep = f_self.unsqueeze(1).expand(-1, F_stack.size(1), -1)
        mix_in = torch.cat([F_stack, f_self_rep], dim=-1)
        mix_scores = self.leakyrelu(self.mix_att(mix_in)).squeeze(-1)
        mix_alpha = F.softmax(mix_scores, dim=1)
        # Attention-weighted sum over the self embedding and the relation summaries.
        E = (mix_alpha.unsqueeze(-1) * F_stack).sum(dim=1)

        return self.classifier(E)


# =============================
# HGT / R-GCN
# =============================
class HGTFlowClassifier(nn.Module):
    """
    Flow-node classifier built from stacked HGTConv layers.

    Node types with features are projected to `hidden` with a linear layer; node
    types without features use a full embedding table (one row per node). After
    each HGTConv layer ReLU and dropout are applied to every node type, and the
    final "flow" representations are mapped to class logits.

    Args:
        data: Graph used to read metadata, feature widths and node counts.
        hidden: Hidden width; must be divisible by `heads`.
        out: Number of classes; inferred as max label + 1 when None.
        layers: Number of HGTConv layers.
        heads: Attention heads per layer.
        dropout: Dropout probability applied after each layer.

    Raises:
        RuntimeError: If HGTConv could not be imported, or cannot be constructed
            with the (in, out, metadata=, heads=) signature used here.
    """

    def __init__(self, data: HeteroData, hidden=64, out=None, layers=2, heads=2, dropout=0.2):
        super().__init__()
        if not _HGT_AVAILABLE:
            raise RuntimeError("HGTConv is not available in this environment.")
        assert hidden % heads == 0, "hidden must be divisible by heads"

        self.metadata = data.metadata()
        node_types, _ = self.metadata
        if out is None:
            out = int(data["flow"].y.max().item() + 1)

        self.dropout = dropout
        # Each node type lands in exactly one of these two dicts.
        self.proj = nn.ModuleDict()
        self.emb = nn.ModuleDict()

        for n in node_types:
            if "x" in data[n]:
                self.proj[n] = Linear(data[n].x.size(-1), hidden)
            else:
                self.emb[n] = nn.Embedding(data[n].num_nodes, hidden)

        # Throwaway instance: constructed only to surface a signature mismatch as a
        # clear RuntimeError before the real layers are built.
        try:
            _ = HGTConv(hidden, hidden, metadata=self.metadata, heads=heads)
        except TypeError as e:
            raise RuntimeError("HGTConv failed to initialize (likely torch/pyg mismatch).") from e

        self.convs = nn.ModuleList([
            HGTConv(hidden, hidden, metadata=self.metadata, heads=heads)
            for _ in range(layers)
        ])
        self.cls = Linear(hidden, out)

    def forward(self, data: HeteroData) -> torch.Tensor:
        """Return class logits for every "flow" node."""
        x_dict = {}
        for n in data.node_types:
            if n in self.proj:
                x = self.proj[n](data[n].x)
            else:
                # Featureless type: the whole embedding table is the node representation.
                x = self.emb[n].weight
            x_dict[n] = x

        for conv in self.convs:
            x_dict = conv(x_dict, data.edge_index_dict)
            x_dict = {k: F.relu(v) for k, v in x_dict.items()}
            x_dict = {k: F.dropout(v, p=self.dropout, training=self.training) for k, v in x_dict.items()}
        return self.cls(x_dict["flow"])


class RGCNFlowClassifier(nn.Module):
    """
    Flow-node classifier built from stacked RGCNConv layers on a homogeneous view.

    At construction the heterogeneous graph is converted with `to_homogeneous()`
    and its edge index, edge types and node types are stored as buffers. `forward`
    message-passes over those stored buffers, not over the edges of the `data`
    object it receives; `data` is only used to read node features.

    Args:
        data: Graph used to build the homogeneous structure and read feature widths.
        hidden: Hidden width.
        out: Number of classes; inferred as max label + 1 when None.
        layers: Number of RGCNConv layers.
        num_bases: Number of bases for RGCNConv's basis decomposition.
        dropout: Dropout probability applied after each layer.
    """

    def __init__(self, data: HeteroData, hidden=64, out=None, layers=2, num_bases=8, dropout=0.2):
        super().__init__()
        self.metadata = data.metadata()
        node_types, edge_types = self.metadata

        if out is None:
            out = int(data["flow"].y.max().item() + 1)

        self.node_types = list(node_types)
        # Position in the metadata list is used as the integer node-type id
        # NOTE(review): assumes to_homogeneous() numbers node types in the same order - confirm.
        self.node_type_to_id = {n: i for i, n in enumerate(self.node_types)}
        self.flow_type_id = self.node_type_to_id["flow"]

        homo = data.to_homogeneous()
        # Buffers follow the module across .to(device) and are saved in state_dict.
        self.register_buffer("edge_index", homo.edge_index)
        self.register_buffer("edge_type", homo.edge_type)
        self.register_buffer("node_type", homo.node_type)

        # Per node type: positions of its nodes inside the homogeneous node list.
        for n, tid in self.node_type_to_id.items():
            idx = (self.node_type == tid).nonzero(as_tuple=False).view(-1)
            self.register_buffer(f"idx__{n}", idx)
        self.register_buffer("flow_idx", (self.node_type == self.flow_type_id).nonzero(as_tuple=False).view(-1))

        self.num_nodes = int(homo.num_nodes)
        self.num_relations = len(edge_types)
        self.hidden = hidden
        self.dropout = dropout

        # Each node type lands in exactly one of these two dicts.
        self.proj = nn.ModuleDict()
        self.emb = nn.ModuleDict()
        for n in self.node_types:
            if "x" in data[n]:
                self.proj[n] = Linear(data[n].x.size(-1), hidden)
            else:
                self.emb[n] = nn.Embedding(data[n].num_nodes, hidden)

        self.convs = nn.ModuleList([
            RGCNConv(hidden, hidden, num_relations=self.num_relations, num_bases=num_bases)
            for _ in range(layers)
        ])
        self.cls = Linear(hidden, out)

    def forward(self, data: HeteroData) -> torch.Tensor:
        """Return class logits for every "flow" node."""
        # Uninitialised scratch matrix; every row is overwritten below because the
        # per-type index buffers together cover all node-type ids.
        x = torch.empty((self.num_nodes, self.hidden), device=self.edge_index.device)

        for n in self.node_types:
            idx = getattr(self, f"idx__{n}")
            if n in self.proj:
                x[idx] = self.proj[n](data[n].x)
            else:
                x[idx] = self.emb[n].weight

        for conv in self.convs:
            x = conv(x, self.edge_index, self.edge_type)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)

        # Classify all nodes, then keep only the rows that belong to "flow".
        logits_all = self.cls(x)
        return logits_all[self.flow_idx]


# =============================
# Graph train/eval
# =============================
def train_epoch_fullbatch(model: nn.Module, data: HeteroData, opt: torch.optim.Optimizer, class_w: Optional[torch.Tensor]) -> float:
    """
    Run one full-batch optimisation step on the training "flow" nodes.

    Args:
        model: Network returning one row of logits per "flow" node.
        data: Graph with `y` and `train_mask` on its "flow" store.
        opt: Optimiser over the model's parameters.
        class_w: Optional per-class weights for the cross-entropy loss.

    Returns:
        The training loss of this step as a Python float.
    """
    model.train()
    opt.zero_grad()

    logits = model(data)
    flow = data["flow"]
    in_train = flow.train_mask

    # The forward pass covers all flows; only the training rows contribute to the loss.
    loss = F.cross_entropy(logits[in_train], flow.y[in_train], weight=class_w)
    loss.backward()
    opt.step()
    return float(loss.item())


@torch.no_grad()
def eval_fullbatch(model: nn.Module, data: HeteroData, split: str = "val"):
    """
    Evaluate the model on the validation or test "flow" nodes.

    Args:
        model: Network returning one row of logits per "flow" node.
        data: Graph with `y`, `val_mask` and `test_mask` on its "flow" store.
        split: "val" selects the validation mask; any other value selects the test mask.

    Returns:
        (accuracy, macro_f1, confusion_matrix, y_true, y_pred); the last three are
        NumPy arrays restricted to the selected split.
    """
    model.eval()
    logits = model(data)

    flow = data["flow"]
    all_labels = flow.y
    selected = flow.val_mask if split == "val" else flow.test_mask

    y_true = all_labels[selected]
    pred = logits[selected].argmax(dim=-1)

    # Size the matrix by all labels so it has the same shape for every split.
    num_classes = int(all_labels.max().item() + 1)
    cm = torch_confusion_matrix(y_true, pred, num_classes=num_classes).detach().cpu().numpy()

    acc = float((pred == y_true).float().mean().item())
    f1 = macro_f1_from_cm_np(cm)
    return acc, f1, cm, y_true.detach().cpu().numpy(), pred.detach().cpu().numpy()


def fit_graph(
    model: nn.Module,
    data: HeteroData,
    *,
    epochs: int = 50,
    lr: float = 2e-3,
    wd: float = 1e-4,
    standardize_flow: bool = True,
    class_weighted_loss: bool = True,
    device: Optional[torch.device] = None,
    name: str = "graph",
):
    """
    Train a graph model full-batch, keep the best-validation weights, and test it.

    The checkpoint with the highest validation macro-F1 is restored before the
    final evaluation on the test split. Progress is printed for the first epoch
    and roughly five times over the run.

    Args:
        model: Network returning one row of logits per "flow" node.
        data: Graph with features, labels and train/val/test masks on "flow".
            Its flow features are overwritten when `standardize_flow` is True.
        epochs: Number of full-batch steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        standardize_flow: Z-score flow features with training statistics first.
        class_weighted_loss: Weight the loss by inverse training class frequency.
        device: Target device; CUDA if available, else CPU, when None.
        name: Tag used in the progress lines.

    Returns:
        (test_acc, test_macro_f1, test_confusion_matrix, y_true, y_pred).
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Both helpers read the data as NumPy, so they run before the move to `device`.
    if standardize_flow:
        standardize_flow_x_inplace(data)
    class_w = compute_class_weights_from_train(data) if class_weighted_loss else None

    data = data.to(device)
    model = model.to(device)
    if class_w is not None:
        class_w = class_w.to(device)

    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    log_every = max(1, epochs // 5)
    best_val, best_state = -1.0, None

    for ep in range(1, epochs + 1):
        loss = train_epoch_fullbatch(model, data, opt, class_w)
        val_acc, val_f1, *_ = eval_fullbatch(model, data, split="val")

        if val_f1 > best_val:
            best_val = val_f1
            # Snapshot on CPU so the checkpoint does not hold extra device memory.
            best_state = {
                key: tensor.detach().cpu().clone()
                for key, tensor in model.state_dict().items()
            }

        if ep == 1 or ep % log_every == 0:
            print(f"[{name}] ep={ep:03d} loss={loss:.4f} val_acc={val_acc:.4f} val_macroF1={val_f1:.4f}")

    if best_state is not None:
        model.load_state_dict(best_state)

    return eval_fullbatch(model, data, split="test")


# =============================
# Dataset loaders/builders
# =============================
def load_iot23_csv(path: str = "IOT-23.csv") -> pd.DataFrame:
    """
    Read an IoT-23 CSV and, if needed, split its fused trailing column.

    Empty strings, "NA", "NaN" and "-" are read as missing values. When the file
    has the single combined column "tunnel_parents   label   detailed-label" and
    no separate "label" column, that column is split on whitespace into
    "tunnel_parents", "label" and "detailed_label" and then removed.

    Args:
        path: CSV file to read.

    Returns:
        The loaded frame.
    """
    df = pd.read_csv(path, na_values=["", "NA", "NaN", "-"])

    comb = "tunnel_parents   label   detailed-label"
    needs_split = comb in df.columns and "label" not in df.columns
    if not needs_split:
        return df

    # At most two splits, so anything after the second gap stays in detailed_label.
    pieces = df[comb].astype(str).str.split(r"\s+", n=2, expand=True)
    df[["tunnel_parents", "label", "detailed_label"]] = pieces
    return df.drop(columns=[comb])


def build_iot23_binary(df_iot: pd.DataFrame, seed: int = 42) -> Tuple[pd.DataFrame, pd.Series, HeteroData, List[str]]:
    """
    Turn an IoT-23 frame into tabular features, binary targets and a heterogeneous graph.

    Rows without a label or without a parseable timestamp are dropped. Every
    remaining row becomes a "flow" node connected to its originator/responder IP,
    originator/responder port, protocol, service and 5-minute time window; reverse
    edges and stratified train/val/test masks are added.

    Args:
        df_iot: Frame as returned by `load_iot23_csv`.
        seed: Random state for the train/val/test split.

    Returns:
        (X, y, data, labels): log1p-transformed flow features, integer targets
        (0 = Benign, 1 = Malicious), the graph, and the class names.
    """
    labels = ["Benign", "Malicious"]
    flow_feature_cols = ["duration", "orig_bytes", "resp_bytes", "orig_pkts", "resp_pkts", "missed_bytes"]

    df = df_iot.copy()
    df = df.loc[df["label"].notna()].reset_index(drop=True)

    for port_col in ("id.orig_p", "id.resp_p"):
        df[port_col] = df[port_col].astype(int)
    df["proto"] = df["proto"].astype(str)
    df["service"] = df["service"].fillna("unknown").astype(str)
    df["conn_state"] = df["conn_state"].astype(str)

    df["ts_dt"] = pd.to_datetime(df["ts"], unit="s", errors="coerce")
    df = df.dropna(subset=["ts_dt"]).reset_index(drop=True)
    df["time_window"] = df["ts_dt"].dt.floor("5min").astype(str)

    y = df["label"].map({"Benign": 0, "Malicious": 1}).astype(int)
    X = sanitize_numeric_frame(df, cols=flow_feature_cols, transform="log1p", clip_lower=0.0)

    def _id_lookup(*columns: str) -> dict:
        # Value -> contiguous id in order of first appearance; several columns can
        # share one id space (e.g. originator and responder IPs).
        pooled = pd.concat([df[c].astype(str) for c in columns], ignore_index=True)
        return {value: pos for pos, value in enumerate(pooled.astype(str).unique())}

    def _encode(column: str, lookup: dict) -> np.ndarray:
        return df[column].astype(str).map(lookup).astype(np.int64).values

    # Insertion order fixes the node-type order of the graph.
    node_lookup = {
        "ip": _id_lookup("id.orig_h", "id.resp_h"),
        "port": _id_lookup("id.orig_p", "id.resp_p"),
        "proto": _id_lookup("proto"),
        "service": _id_lookup("service"),
        "time": _id_lookup("time_window"),
    }
    # (relation name, destination node type, source column); order fixes the edge-type order.
    relations = [
        ("to_ip_orig", "ip", "id.orig_h"),
        ("to_ip_resp", "ip", "id.resp_h"),
        ("to_port_orig", "port", "id.orig_p"),
        ("to_port_resp", "port", "id.resp_p"),
        ("to_proto", "proto", "proto"),
        ("to_service", "service", "service"),
        ("to_time", "time", "time_window"),
    ]
    targets_by_relation = [
        (rel, node_type, _encode(column, node_lookup[node_type]))
        for rel, node_type, column in relations
    ]

    flow_ids = np.arange(len(df), dtype=np.int64)

    data = HeteroData()
    data["flow"].x = torch.tensor(X.values, dtype=torch.float32)
    data["flow"].y = torch.tensor(y.values.astype(np.int64), dtype=torch.long)

    for node_type, lookup in node_lookup.items():
        data[node_type].num_nodes = len(lookup)

    for rel, node_type, dst_ids in targets_by_relation:
        data["flow", rel, node_type].edge_index = torch.tensor(
            np.vstack([flow_ids, dst_ids]), dtype=torch.long
        )

    data = add_reverse_edges_hetero(data)
    masks = make_stratified_masks_from_y(y.values.astype(int), seed=seed)
    data["flow"].train_mask, data["flow"].val_mask, data["flow"].test_mask = masks
    return X, pd.Series(y.values.astype(int)), data, labels


def load_cic_csv(path: str = "CIC.csv") -> pd.DataFrame:
    """
    Read the CIC CSV in a single pass (`low_memory=False`).

    Args:
        path: CSV file to read.

    Returns:
        The loaded frame with column names exactly as in the file.
    """
    frame = pd.read_csv(path, low_memory=False)
    return frame


def build_cic_label(df_cic: pd.DataFrame, seed: int = 42) -> Tuple[pd.DataFrame, pd.Series, HeteroData, List[str]]:
    """
    Turn a CIC frame into tabular features, multi-class targets and a heterogeneous graph.

    Column names are stripped of surrounding whitespace. Rows missing any of the
    entity columns or the label are dropped. Every remaining row becomes a "flow"
    node connected to its source/destination IP, source/destination port and
    protocol; reverse edges and stratified train/val/test masks are added.

    Args:
        df_cic: Frame as returned by `load_cic_csv`.
        seed: Random state for the train/val/test split.

    Returns:
        (X, y, data, labels): signed-log1p flow features (all numeric columns
        except the entity columns), integer targets, the graph, and the class
        names in encoder order.

    Raises:
        KeyError: If the label column or a required entity column is missing.
        ValueError: If no numeric feature column remains.
    """
    df = df_cic.copy()
    df.columns = [col.strip() for col in df.columns]

    label_col = next((name for name in ("label", "Label") if name in df.columns), None)
    if label_col is None:
        raise KeyError("CIC: couldn't find label/Label column.")

    entity_cols = ["Source IP", "Destination IP", "Source Port", "Destination Port", "Protocol"]
    required = entity_cols + [label_col]
    absent = [col for col in required if col not in df.columns]
    if absent:
        raise KeyError(f"CIC missing column: {absent[0]}")

    df = df.dropna(subset=required).reset_index(drop=True)

    encoder = LabelEncoder()
    y_enc = encoder.fit_transform(df[label_col].astype(str).values).astype(np.int64)
    labels = list(encoder.classes_)

    # Entity columns become graph structure, so they are kept out of the flow features.
    excluded = set(required)
    num_cols = [col for col in df.select_dtypes(include=["number"]).columns if col not in excluded]
    if not num_cols:
        raise ValueError("CIC: no numeric flow features found after excluding entity columns.")

    X = sanitize_numeric_frame(df, cols=num_cols, transform="signed_log1p", clip_lower=None)

    def _id_lookup(*columns: str) -> dict:
        # Value -> contiguous id in order of first appearance; several columns can
        # share one id space (e.g. source and destination IPs).
        pooled = pd.concat([df[c].astype(str) for c in columns], ignore_index=True)
        return {value: pos for pos, value in enumerate(pooled.astype(str).unique())}

    def _encode(column: str, lookup: dict) -> np.ndarray:
        return df[column].astype(str).map(lookup).astype(np.int64).values

    # Insertion order fixes the node-type order of the graph.
    node_lookup = {
        "ip": _id_lookup("Source IP", "Destination IP"),
        "port": _id_lookup("Source Port", "Destination Port"),
        "proto": _id_lookup("Protocol"),
    }
    # (relation name, destination node type, source column); order fixes the edge-type order.
    relations = [
        ("src_ip", "ip", "Source IP"),
        ("dst_ip", "ip", "Destination IP"),
        ("src_port", "port", "Source Port"),
        ("dst_port", "port", "Destination Port"),
        ("uses_proto", "proto", "Protocol"),
    ]
    targets_by_relation = [
        (rel, node_type, _encode(column, node_lookup[node_type]))
        for rel, node_type, column in relations
    ]

    flow_ids = np.arange(len(df), dtype=np.int64)

    data = HeteroData()
    data["flow"].x = torch.tensor(X.values, dtype=torch.float32)
    data["flow"].y = torch.tensor(y_enc, dtype=torch.long)

    for node_type, lookup in node_lookup.items():
        data[node_type].num_nodes = len(lookup)

    for rel, node_type, dst_ids in targets_by_relation:
        data["flow", rel, node_type].edge_index = torch.tensor(
            np.vstack([flow_ids, dst_ids]), dtype=torch.long
        )

    data = add_reverse_edges_hetero(data)
    masks = make_stratified_masks_from_y(y_enc, seed=seed)
    data["flow"].train_mask, data["flow"].val_mask, data["flow"].test_mask = masks
    return X, pd.Series(y_enc), data, labels


def load_unsw_csv(path: str = "unswnb15.csv") -> pd.DataFrame:
    """
    Read the UNSW-NB15 CSV with pandas' default settings.

    Args:
        path: CSV file to read.

    Returns:
        The loaded frame.
    """
    frame = pd.read_csv(path)
    return frame


def build_unsw_attackcat(df_unsw: pd.DataFrame, min_samples: int = 1000, seed: int = 42) -> Tuple[pd.DataFrame, pd.Series, HeteroData, List[str]]:
    """
    Turn a UNSW-NB15 frame into tabular features, attack-category targets and a graph.

    Attack categories with fewer than `min_samples` rows are discarded. Two feature
    views are produced from the remaining rows:

    * the returned tabular frame: every column except "proto", "service", "state",
      "label" and "attack_cat" (untransformed), and
    * the graph's flow features: a fixed preferred column list when all of its
      columns are present, otherwise all numeric columns except the excluded ones,
      signed-log1p transformed.

    Every row becomes a "flow" node connected to its protocol, service and state;
    reverse edges and stratified train/val/test masks are added.

    Args:
        df_unsw: Frame as returned by `load_unsw_csv`.
        min_samples: Minimum number of rows a category needs to be kept.
        seed: Random state for the train/val/test split.

    Returns:
        (X_tab, y, data, labels): tabular features, integer targets, the graph, and
        the class names in encoder order.

    Raises:
        ValueError: If no numeric column is available for the flow features.
    """
    df = df_unsw.copy()

    # Tabular view for the baselines: categorical entity columns and the binary label removed.
    removable = [col for col in ("proto", "service", "state", "label") if col in df.columns]
    tab = df.drop(columns=removable)

    y_raw = tab["attack_cat"].astype(str)
    X_tab = tab.drop(columns=["attack_cat"])

    # Keep only categories that are frequent enough.
    category_sizes = y_raw.value_counts()
    kept_categories = category_sizes[category_sizes >= min_samples].index
    mask = y_raw.isin(kept_categories)

    X_tab = X_tab.loc[mask].reset_index(drop=True)
    y_raw = y_raw.loc[mask].reset_index(drop=True)

    encoder = LabelEncoder()
    y_enc = encoder.fit_transform(y_raw.values).astype(np.int64)
    labels = list(encoder.classes_)

    # Graph view (flow -> proto/service/state) over the same rows.
    df_g = df.loc[mask].reset_index(drop=True)
    df_g["flow_id"] = np.arange(len(df_g), dtype=np.int64)
    flow_ids = df_g["flow_id"].values

    # (relation name, destination node type, source column); order fixes node/edge-type order.
    relations = [
        ("uses_proto", "proto", "proto"),
        ("uses_service", "service", "service"),
        ("has_state", "state", "state"),
    ]
    node_lookup = {
        node_type: {value: pos for pos, value in enumerate(df_g[column].astype(str).unique())}
        for _, node_type, column in relations
    }
    targets_by_relation = [
        (rel, node_type, df_g[column].astype(str).map(node_lookup[node_type]).astype(np.int64).values)
        for rel, node_type, column in relations
    ]

    preferred = ["dur", "spkts", "dpkts", "sbytes", "dbytes", "rate", "sload", "dload", "tcprtt", "synack", "ackdat"]
    if all(col in df_g.columns for col in preferred):
        feat_cols = preferred
    else:
        exclude = {"label", "attack_cat", "proto", "service", "state", "flow_id"}
        feat_cols = [col for col in df_g.select_dtypes(include=["number"]).columns if col not in exclude]
        if len(feat_cols) == 0:
            raise ValueError("UNSW: no numeric features found for flow.x.")

    X_flow = sanitize_numeric_frame(df_g, cols=feat_cols, transform="signed_log1p", clip_lower=None)

    data = HeteroData()
    data["flow"].x = torch.tensor(X_flow.values, dtype=torch.float32)
    data["flow"].y = torch.tensor(y_enc, dtype=torch.long)

    for node_type, lookup in node_lookup.items():
        data[node_type].num_nodes = len(lookup)

    for rel, node_type, dst_ids in targets_by_relation:
        data["flow", rel, node_type].edge_index = torch.tensor(
            np.vstack([flow_ids, dst_ids]), dtype=torch.long
        )

    data = add_reverse_edges_hetero(data)
    masks = make_stratified_masks_from_y(y_enc, seed=seed)
    data["flow"].train_mask, data["flow"].val_mask, data["flow"].test_mask = masks
    return X_tab, pd.Series(y_enc), data, labels


# =============================
# Runner
# =============================
def run_one_dataset(
    ds_name: str,
    X: pd.DataFrame,
    y_enc: pd.Series,
    data: HeteroData,
    class_labels: List[str],
    out_reports: str,
    out_cms: str,
    epochs: int = 50,
    seed: int = 42,
) -> None:
    """Evaluate the tabular baselines and the graph models on one dataset.

    Prints the class distribution and a summary of ``data["flow"].x``, runs
    ``run_baselines`` on ``(X, y_enc)``, then trains HetGNN, R-GCN and (when
    ``HGTConv`` could be imported) HGT through ``fit_graph``. For every model
    a text classification report plus a raw and a row-normalized confusion
    matrix are written to disk.

    Args:
        ds_name: Dataset tag; used in log lines and as the file-name prefix.
        X: Tabular features handed to ``run_baselines``.
        y_enc: Encoded labels handed to ``run_baselines``.
        data: Heterogeneous graph; must contain a ``"flow"`` node type with ``x``.
        class_labels: Class names used for reports and confusion-matrix axes.
        out_reports: Directory receiving the ``*__report.txt`` files.
        out_cms: Directory receiving the confusion-matrix figures.
        epochs: Number of training epochs passed to ``fit_graph``.
        seed: Seed forwarded to ``run_baselines`` (was hard-coded to 42).

    Notes:
        An HGT failure is deliberately non-fatal: the exception is printed and
        recorded in ``<ds_name>__HGT__SKIPPED.txt``. Failures of the other
        models propagate to the caller.
    """
    # Make the function usable on its own: create the target folders if the
    # caller has not done so already. Empty strings mean "current directory".
    for target_dir in (out_reports, out_cms):
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    print(f"\n===== {ds_name} =====")
    vc = pd.Series(y_enc.values).value_counts().sort_index()
    print("Class distribution:", vc.to_dict())

    # quick sanity on flow.x
    print(debug_tensor_stats(f"{ds_name}/flow.x", data["flow"].x))

    def _save_artifacts(model_name: str, report: str, cm: np.ndarray) -> None:
        # Single place that writes the report and both confusion-matrix
        # variants, shared by the baselines and the graph models.
        save_text(os.path.join(out_reports, f"{ds_name}__{model_name}__report.txt"), report)
        title = f"{ds_name} - {model_name}"
        for suffix, normalize in (("cm_raw", None), ("cm_norm", "true")):
            save_confusion_matrix(cm, class_labels,
                                  os.path.join(out_cms, f"{ds_name}__{model_name}__{suffix}"),
                                  title,
                                  normalize=normalize)

    def _record_hgt_skip(message: str) -> None:
        # Leave a marker file so a missing HGT report is explained on disk.
        print(message)
        save_text(os.path.join(out_reports, f"{ds_name}__HGT__SKIPPED.txt"), message)

    # ---- Baselines ----
    # Each result is expected to expose .name, .report and .cm.
    for r in run_baselines(X, y_enc, labels=class_labels, seed=seed):
        _save_artifacts(r.name, r.report, r.cm)

    # ---- Graph models ----
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _train_and_save(model: nn.Module, display_name: str, file_tag: str) -> None:
        # Train one graph model, print its test metrics and persist its artifacts.
        acc, f1, cm, y_true, y_pred = fit_graph(
            model, data, epochs=epochs, device=device, name=f"{ds_name} {display_name}",
            standardize_flow=True, class_weighted_loss=True
        )
        print(f"{ds_name} {display_name} test_acc={acc:.4f} test_macroF1={f1:.4f}")
        rep = classification_report(y_true, y_pred, target_names=class_labels, zero_division=0)
        _save_artifacts(file_tag, rep, cm)

    # Each model is constructed immediately before it is trained, so parameter
    # initialisation happens in the same order as a hand-written sequence.
    _train_and_save(HetGNN(data, hidden=128, k=1, dropout=0.2), "HetGNN", "HetGNN")
    _train_and_save(RGCNFlowClassifier(data, hidden=64, layers=2, num_bases=8, dropout=0.2), "R-GCN", "RGCN")

    # HGT (optional)
    if _HGT_AVAILABLE:
        try:
            _train_and_save(HGTFlowClassifier(data, hidden=64, layers=2, heads=2, dropout=0.2), "HGT", "HGT")
        except Exception as e:
            # Best-effort: HGT problems must not abort the remaining datasets.
            _record_hgt_skip(f"HGT skipped for {ds_name} due to: {repr(e)}")
    else:
        _record_hgt_skip(f"HGTConv not available -> HGT skipped for {ds_name}.")


def main(
    data_dir: str = "/kaggle/input/cybersecurity-dataset",
    out_dir: str = "outputs",
    epochs: int = 100,
) -> None:
    """Run the full benchmark on the IoT-23, CIC and UNSW-NB15 CSV files.

    Args:
        data_dir: Folder containing ``IOT-23.csv``, ``CIC.csv`` and
            ``unswnb15.csv``. Defaults to the Kaggle input location.
        out_dir: Root folder for results; reports go to ``<out_dir>/reports``
            and figures to ``<out_dir>/confusion_matrices``.
        epochs: Training epochs passed to ``run_one_dataset`` for every dataset.

    Calling ``main()`` without arguments reproduces the previous hard-coded
    configuration exactly.
    """
    # One name for the seed used by set_seed and by every dataset builder.
    seed = 42
    set_seed(seed)

    out_reports = os.path.join(out_dir, "reports")
    out_cms = os.path.join(out_dir, "confusion_matrices")
    ensure_dir(out_reports)
    ensure_dir(out_cms)

    # ---- Load local CSVs ----
    # All three files are loaded before any processing, in this fixed order.
    df_iot = load_iot23_csv(os.path.join(data_dir, "IOT-23.csv"))
    df_cic = load_cic_csv(os.path.join(data_dir, "CIC.csv"))
    df_unsw = load_unsw_csv(os.path.join(data_dir, "unswnb15.csv"))

    # ---- IoT-23 (binary) ----
    X_iot, y_iot, data_iot, labels_iot = build_iot23_binary(df_iot, seed=seed)
    run_one_dataset("IoT23_binary", X_iot, y_iot, data_iot, labels_iot, out_reports, out_cms, epochs=epochs)

    # ---- CIC (label) ----
    X_cic, y_cic, data_cic, labels_cic = build_cic_label(df_cic, seed=seed)
    run_one_dataset("CIC_label", X_cic, y_cic, data_cic, labels_cic, out_reports, out_cms, epochs=epochs)

    # ---- UNSW (attack_cat multi-class) ----
    X_unsw, y_unsw, data_unsw, labels_unsw = build_unsw_attackcat(df_unsw, min_samples=1000, seed=seed)
    run_one_dataset("UNSW_attack_cat", X_unsw, y_unsw, data_unsw, labels_unsw, out_reports, out_cms, epochs=epochs)

    print(f"\nDone. Check {out_dir}/")


# Execute the whole pipeline; main() writes reports and confusion matrices under outputs/.
main()



===== IoT23_binary =====
Class distribution: {0: 1923, 1: 21222}
IoT23_binary/flow.x: shape=(23145, 6) finite_frac=1.000000 min=0.0000 max=18.1403
[IoT23_binary HetGNN] ep=001 loss=0.6826 val_acc=0.9899 val_macroF1=0.9652
[IoT23_binary HetGNN] ep=020 loss=0.0163 val_acc=0.9945 val_macroF1=0.9825
[IoT23_binary HetGNN] ep=040 loss=0.0015 val_acc=1.0000 val_macroF1=1.0000
[IoT23_binary HetGNN] ep=060 loss=0.0008 val_acc=1.0000 val_macroF1=1.0000
[IoT23_binary HetGNN] ep=080 loss=0.0008 val_acc=1.0000 val_macroF1=1.0000
[IoT23_binary HetGNN] ep=100 loss=0.0008 val_acc=1.0000 val_macroF1=1.0000
IoT23_binary HetGNN test_acc=0.9994 test_macroF1=0.9981
[IoT23_binary R-GCN] ep=001 loss=1.4278 val_acc=0.9170 val_macroF1=0.4784
[IoT23_binary R-GCN] ep=020 loss=0.0342 val_acc=0.9977 val_macroF1=0.9925
[IoT23_binary R-GCN] ep=040 loss=0.0072 val_acc=0.9997 val_macroF1=0.9991
[IoT23_binary R-GCN] ep=060 loss=0.0031 val_acc=1.0000 val_macroF1=1.0000
[IoT23_binary R-GCN] ep=080 loss=0.0011 val_acc=1.



[UNSW_attack_cat HetGNN] ep=001 loss=2.2037 val_acc=0.0879 val_macroF1=0.0681
[UNSW_attack_cat HetGNN] ep=020 loss=1.5455 val_acc=0.5074 val_macroF1=0.2737
[UNSW_attack_cat HetGNN] ep=040 loss=1.3391 val_acc=0.5724 val_macroF1=0.3219
[UNSW_attack_cat HetGNN] ep=060 loss=1.2671 val_acc=0.6002 val_macroF1=0.3526
[UNSW_attack_cat HetGNN] ep=080 loss=1.2289 val_acc=0.6149 val_macroF1=0.3816
[UNSW_attack_cat HetGNN] ep=100 loss=1.1981 val_acc=0.6207 val_macroF1=0.3886
UNSW_attack_cat HetGNN test_acc=0.6209 test_macroF1=0.3896
[UNSW_attack_cat R-GCN] ep=001 loss=3.2023 val_acc=0.1672 val_macroF1=0.1073
[UNSW_attack_cat R-GCN] ep=020 loss=1.5800 val_acc=0.5263 val_macroF1=0.3021
[UNSW_attack_cat R-GCN] ep=040 loss=1.4488 val_acc=0.5744 val_macroF1=0.3176
[UNSW_attack_cat R-GCN] ep=060 loss=1.3857 val_acc=0.5959 val_macroF1=0.3264
[UNSW_attack_cat R-GCN] ep=080 loss=1.3538 val_acc=0.5846 val_macroF1=0.3269
[UNSW_attack_cat R-GCN] ep=100 loss=1.3302 val_acc=0.6028 val_macroF1=0.3338
UNSW_attack

In [4]:
# Recursively archive the generated reports directory into /kaggle/working/reports.zip for download.
!zip -r /kaggle/working/reports.zip /kaggle/working/outputs/reports

  adding: kaggle/working/outputs/reports/ (stored 0%)
  adding: kaggle/working/outputs/reports/CIC_label__MLP__report.txt (deflated 68%)
  adding: kaggle/working/outputs/reports/UNSW_attack_cat__LogisticRegression__report.txt (deflated 62%)
  adding: kaggle/working/outputs/reports/IoT23_binary__LogisticRegression__report.txt (deflated 55%)
  adding: kaggle/working/outputs/reports/CIC_label__HGT__report.txt (deflated 67%)
  adding: kaggle/working/outputs/reports/CIC_label__LogisticRegression__report.txt (deflated 68%)
  adding: kaggle/working/outputs/reports/UNSW_attack_cat__HGT__report.txt (deflated 61%)
  adding: kaggle/working/outputs/reports/IoT23_binary__RandomForest__report.txt (deflated 56%)
  adding: kaggle/working/outputs/reports/UNSW_attack_cat__HetGNN__report.txt (deflated 61%)
  adding: kaggle/working/outputs/reports/IoT23_binary__RGCN__report.txt (deflated 60%)
  adding: kaggle/working/outputs/reports/IoT23_binary__HetGNN__report.txt (deflated 62%)
  adding: kaggle/working/

In [5]:
# Recursively archive the confusion-matrix figures into /kaggle/working/cm.zip for download.
!zip -r /kaggle/working/cm.zip /kaggle/working/outputs/confusion_matrices

  adding: kaggle/working/outputs/confusion_matrices/ (stored 0%)
  adding: kaggle/working/outputs/confusion_matrices/IoT23_binary__MLP__cm_norm.pdf (deflated 32%)
  adding: kaggle/working/outputs/confusion_matrices/UNSW_attack_cat__HGT__cm_norm.png (deflated 9%)
  adding: kaggle/working/outputs/confusion_matrices/IoT23_binary__RandomForest__cm_raw.pdf (deflated 31%)
  adding: kaggle/working/outputs/confusion_matrices/IoT23_binary__RandomForest__cm_raw.png (deflated 23%)
  adding: kaggle/working/outputs/confusion_matrices/UNSW_attack_cat__MLP__cm_norm.png (deflated 10%)
  adding: kaggle/working/outputs/confusion_matrices/IoT23_binary__RGCN__cm_norm.png (deflated 24%)
  adding: kaggle/working/outputs/confusion_matrices/UNSW_attack_cat__RandomForest__cm_norm.pdf (deflated 31%)
  adding: kaggle/working/outputs/confusion_matrices/UNSW_attack_cat__RGCN__cm_raw.png (deflated 10%)
  adding: kaggle/working/outputs/confusion_matrices/UNSW_attack_cat__RandomForest__cm_raw.pdf (deflated 30%)
  add