# 12/22　CaT-GNN

https://arxiv.org/pdf/2402.14708

In [None]:
# A. Restart the runtime once so the environment is clean (UI: Runtime -> Restart runtime).

# B. Remove leftovers of a broken install (the "~orch" entries pip warned about).
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch

# C. Uninstall the existing torch stack and packages that may conflict with it.
!pip uninstall -y torch torchvision torchaudio torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric fastai timm

# D. List the versions pip can install (paste the output here).
!pip index versions torch
!pip index versions torchvision

[0mFound existing installation: torch_scatter 2.1.2+pt22cpu
Uninstalling torch_scatter-2.1.2+pt22cpu:
  Successfully uninstalled torch_scatter-2.1.2+pt22cpu
Found existing installation: torch_sparse 0.6.18+pt22cpu
Uninstalling torch_sparse-0.6.18+pt22cpu:
  Successfully uninstalled torch_sparse-0.6.18+pt22cpu
Found existing installation: torch_cluster 1.6.3+pt22cpu
Uninstalling torch_cluster-1.6.3+pt22cpu:
  Successfully uninstalled torch_cluster-1.6.3+pt22cpu
Found existing installation: torch_spline_conv 1.2.2+pt22cpu
Uninstalling torch_spline_conv-1.2.2+pt22cpu:
  Successfully uninstalled torch_spline_conv-1.2.2+pt22cpu
Found existing installation: torch-geometric 2.7.0
Uninstalling torch-geometric-2.7.0:
  Successfully uninstalled torch-geometric-2.7.0
[0mtorch (2.9.1)
Available versions: 2.9.1, 2.9.0, 2.8.0, 2.7.1, 2.7.0, 2.6.0, 2.5.1, 2.5.0, 2.4.1, 2.4.0, 2.3.1, 2.3.0, 2.2.2, 2.2.1, 2.2.0
[0mtorchvision (0.24.1)
Available versions: 0.24.1, 0.24.0, 0.23.0, 0.22.1, 0.22.0, 0.21.

In [None]:
# 1) Restart the runtime so the state is clean (done from the UI).
# Remove leftovers of a broken install, if any.
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch*
!rm -rf /usr/local/lib/python3.12/dist-packages/~orch

# Uninstall the existing torch-related packages.
!pip uninstall -y torch torchvision torchaudio torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric


# 4) Purge the pip cache (optional but recommended).
!pip cache purge

# 5) Install the CPU builds of PyTorch / torchvision / torchaudio.
# The CPU build is the reliable choice here.
!pip install -q "torch==2.9.1+cpu" "torchvision==0.24.1+cpu" "torchaudio==2.9.1" --extra-index-url https://download.pytorch.org/whl/cpu


# 6) PyG companion wheels for CPU (must match the installed torch version).
# This URL is for a torch version string of 2.9.1+cpu.
# NOTE(review): the recorded output shows "Building wheel for torch-scatter (setup.py)",
# which suggests no prebuilt wheel was found at this index — confirm the wheel page name.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.9.1+cpu.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.9.1+cpu.html
!pip install -q torch-cluster -f https://data.pyg.org/whl/torch-2.9.1+cpu.html
!pip install -q torch-spline-conv -f https://data.pyg.org/whl/torch-2.9.1+cpu.html

# torch-geometric itself, plus scikit-learn for the metrics.
!pip install -q torch-geometric
!pip install -q scikit-learn


Found existing installation: torch 2.9.0+cpu
Uninstalling torch-2.9.0+cpu:
  Successfully uninstalled torch-2.9.0+cpu
Found existing installation: torchvision 0.24.0+cpu
Uninstalling torchvision-0.24.0+cpu:
  Successfully uninstalled torchvision-0.24.0+cpu
Found existing installation: torchaudio 2.9.0+cpu
Uninstalling torchaudio-2.9.0+cpu:
  Successfully uninstalled torchaudio-2.9.0+cpu
[0mFiles removed: 0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.4/184.4 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m495.6/495.6 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone

  Building wheel for torch-scatter (setup.py) ... 

In [None]:
import torch, torch_geometric
# Report the installed library versions and whether a CUDA device is visible.
for label, value in (
    ("torch:", torch.__version__),
    ("cuda available:", torch.cuda.is_available()),
    ("torch_geometric:", torch_geometric.__version__),
):
    print(label, value)

ModuleNotFoundError: No module named 'torch'

In [None]:
# Colab / Python セルに貼って実行
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score
import numpy as np
import random

# ---------------------------
# ユーティリティ
# ---------------------------
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed passed to every generator.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG once, then use the GPU when one is visible and the CPU otherwise.
set_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ---------------------------
# サンプル合成データ（検証用）
# 実データがある場合はここを置き換えてください
# ---------------------------
def make_synthetic_graph(num_nodes=1000, feat_dim=16, edge_prob=0.01):
    """Build a random directed graph for smoke-testing the model.

    Args:
        num_nodes: Number of nodes.
        feat_dim: Width of the standard-normal node feature vectors.
        edge_prob: Probability that any ordered pair (i, j), i != j, is an edge.

    Returns:
        A `Data` object with `x` [num_nodes, feat_dim], `edge_index` [2, E],
        binary labels `y` (about 5% positives) and float timestamps `t` in 0..9.
    """
    x = torch.randn(num_nodes, feat_dim)
    # Integer timestamps drawn from {0, ..., 9}, stored as floats.
    t = torch.randint(0, 10, (num_nodes,)).float()
    # Sample every ordered pair in one shot instead of a num_nodes**2 Python loop.
    # Edges now come from torch's RNG (still covered by set_seed), so the exact
    # edge set for a given seed differs from the loop-based version.
    # The dense mask costs O(num_nodes**2) memory, which is fine for test-sized graphs.
    edge_mask = torch.rand(num_nodes, num_nodes) < edge_prob
    edge_mask &= ~torch.eye(num_nodes, dtype=torch.bool)  # no self-loops
    # nonzero() lists (i, j) pairs in row-major order, the same ordering the loop produced.
    edge_index = edge_mask.nonzero(as_tuple=False).t().contiguous()
    # Labels: a small fraction of positive (fraud) nodes.
    y = (torch.rand(num_nodes) < 0.05).long()
    return Data(x=x, edge_index=edge_index, y=y, t=t)

# Build the synthetic graph with the default sizes and move it to the selected device.
data = make_synthetic_graph()
data = data.to(device)

# ---------------------------
# CT-GAT ブロック（Temporal GAT）
# - GATConv をベースに、時間差を簡易的に組み込む
# - attention weights を取得して重要度スコアに使う
# ---------------------------
class CTGATLayer(nn.Module):
    """GAT layer whose input features are shifted by a learned embedding of node time.

    Args:
        in_dim: Input feature width.
        out_dim: Requested output width; each head gets `out_dim // heads` channels,
            so with `concat=True` the real output width is `(out_dim // heads) * heads`.
        heads: Number of attention heads.
        concat: Passed through to `GATConv`.
        dropout: Dropout rate for both the GAT attention and the layer output.
    """

    def __init__(self, in_dim, out_dim, heads=4, concat=True, dropout=0.2):
        super().__init__()
        self.gat = GATConv(in_dim, out_dim // heads, heads=heads, concat=concat, dropout=dropout)
        # Simple time embedding: scalar timestamp -> vector of the input width.
        self.time_proj = nn.Linear(1, in_dim)
        self.dropout = nn.Dropout(dropout)

    def _add_time(self, x, t):
        """Return `x` plus the projected per-node timestamp `t` ([N] -> [N, in_dim])."""
        return x + self.time_proj(t.view(-1, 1))

    def forward(self, x, edge_index, t):
        """Apply the GAT to the time-augmented features, then dropout.

        Attention weights are not returned here; use `get_attention` for them.
        """
        out = self.gat(self._add_time(x, t), edge_index)
        return self.dropout(out)

    def get_attention(self, x, edge_index, t=None):
        """Per-node importance: mean incoming attention, averaged over heads.

        Args:
            x: Node features of width `in_dim`.
            edge_index: [2, E] edge list.
            t: Optional per-node timestamps. When given, the same time embedding
                as in `forward` is added first, so the scores describe the input
                the layer actually attends over. Omitting it keeps the previous
                behaviour of scoring `x` as-is.

        Returns:
            A gradient-free tensor of shape [N].
        """
        # The result is only used as a ranking signal, so skip building an autograd graph.
        with torch.no_grad():
            x_in = x if t is None else self._add_time(x, t)
            # Uses GATConv's `return_attention_weights` option (PyG >= 2.0 assumed);
            # `attn` is indexed as [num_edges, heads] below.
            _, (edge_index_out, attn) = self.gat(x_in, edge_index, return_attention_weights=True)
            num_nodes = x.size(0)
            tgt = edge_index_out[1]  # target node of every returned edge
            # Sum attention and count edges per target node.
            agg = torch.zeros(num_nodes, attn.size(1), device=x.device, dtype=attn.dtype)
            agg = agg.index_add(0, tgt, attn)
            counts = torch.zeros(num_nodes, 1, device=x.device, dtype=attn.dtype)
            counts = counts.index_add(0, tgt, torch.ones(tgt.numel(), 1, device=x.device, dtype=attn.dtype))
            # Nodes with no incoming edge keep importance 0 instead of dividing by zero.
            counts = counts.clamp(min=1.0)
            # NOTE(review): GATConv presumably softmax-normalises attention over each
            # target's incoming edges, which would make this mean roughly 1/in-degree
            # in eval mode — confirm this is the intended importance signal.
            importance = (agg / counts).mean(dim=1)  # [N]
        return importance

# ---------------------------
# Causal-Inspector と Causal-Intervener を含む CaT-GNN モデル
# ---------------------------
class CaT_GNN(nn.Module):
    """CT-GAT stack followed by a causal inspector / intervener and an MLP head.

    Args:
        in_dim: Input feature width.
        hid_dim: Hidden width of every CT-GAT layer and of the MLP input.
        heads: Attention heads per CT-GAT layer.
        n_layers: Number of CT-GAT layers.
        env_ratio: Fraction of lowest-importance nodes treated as environment nodes.
        mix_k: Number of top-importance (causal) nodes mixed into each environment node.
        mix_mode: 'learn' (weights from a linear layer + softmax) or
            'importance' (weights proportional to importance scores).

    Raises:
        ValueError: If `mix_mode` is not one of the two supported values.
    """

    def __init__(self, in_dim, hid_dim=128, heads=4, n_layers=2, env_ratio=0.3, mix_k=3, mix_mode='learn'):
        super().__init__()
        # Fail at construction instead of with an AttributeError on `mix_layer` in forward().
        if mix_mode not in ('learn', 'importance'):
            raise ValueError(f"mix_mode must be 'learn' or 'importance', got {mix_mode!r}")
        self.layers = nn.ModuleList()
        self.layers.append(CTGATLayer(in_dim, hid_dim, heads=heads))
        for _ in range(n_layers-1):
            self.layers.append(CTGATLayer(hid_dim, hid_dim, heads=heads))
        self.mlp = nn.Sequential(
            nn.Linear(hid_dim, hid_dim//2),
            nn.ReLU(),
            nn.Linear(hid_dim//2, 1)
        )
        self.env_ratio = env_ratio  # share of nodes treated as environment nodes
        self.mix_k = mix_k          # number of causal nodes used in the mixup
        self.mix_mode = mix_mode    # 'learn' or 'importance'
        # Learnable mixup weights: one logit for the node itself plus one per causal node.
        if mix_mode == 'learn':
            self.mix_layer = nn.Linear(hid_dim*(mix_k+1), mix_k+1)

    def _causal_mixup(self, x, env_idx, topk, imp_norm):
        """Return a copy of `x` whose `env_idx` rows are mixed with the `topk` rows."""
        x_env = x[env_idx]        # [E, dim]
        causal_feats = x[topk]    # [k, dim]
        if self.mix_mode == 'importance':
            # Weights proportional to the causal nodes' importance.
            # NOTE(review): w_c is normalised to sum to ~1, so w_env is ~0 and the
            # environment node's own embedding is almost entirely replaced — confirm intent.
            w_c = imp_norm[topk]
            w_c = w_c / (w_c.sum() + 1e-9)
            w_env = 1.0 - w_c.sum()
            mixed = w_env * x_env + (w_c.view(-1, 1) * causal_feats).sum(dim=0)
        else:
            # Row e of `vec` is [x_env[e], causal_1, ..., causal_k] flattened — the same
            # layout a per-node loop would feed to mix_layer, built for all nodes at once.
            causal_flat = causal_feats.reshape(1, -1).expand(x_env.size(0), -1)
            vec = torch.cat([x_env, causal_flat], dim=1)      # [E, (k+1)*dim]
            alpha = F.softmax(self.mix_layer(vec), dim=1)     # [E, k+1]
            mixed = alpha[:, :1] * x_env + alpha[:, 1:] @ causal_feats
        x_aug = x.clone()
        x_aug[env_idx] = mixed
        return x_aug

    def forward(self, data):
        """Return `(probs, imp_norm)`: per-node sigmoid probabilities and min-max normalised importance."""
        x, edge_index, t = data.x, data.edge_index, data.t
        # 1) CT-GAT stack -> node embeddings. Keep the tensor that enters the last layer:
        #    its attention must be computed on features of the width that layer expects
        #    (raw data.x only has that width when there is a single layer).
        last_in = x
        for layer in self.layers:
            last_in = x
            x = layer(x, edge_index, t)
        # 2) Causal-Inspector: importance scores from the last layer's attention.
        importance = self.layers[-1].get_attention(last_in, edge_index)  # [N]
        imp_norm = (importance - importance.min()) / (importance.max() - importance.min() + 1e-9)
        # Environment nodes are the lowest `env_ratio` fraction by importance (at least one).
        N = x.size(0)
        k_env = max(1, int(self.env_ratio * N))
        sorted_idx = torch.argsort(imp_norm)  # ascending
        env_idx = sorted_idx[:k_env]
        causal_idx = sorted_idx[k_env:]
        # 3) Causal-Intervener: mix the top-k causal embeddings into every environment node.
        #    Skipped when no node is left on the causal side.
        if len(causal_idx) == 0:
            x_aug = x.clone()
        else:
            topk = torch.argsort(imp_norm, descending=True)[:self.mix_k]
            x_aug = self._causal_mixup(x, env_idx, topk, imp_norm)
        # 4) Final prediction from the augmented embeddings.
        logits = self.mlp(x_aug).view(-1)
        probs = torch.sigmoid(logits)
        return probs, imp_norm

# ---------------------------
# 損失・評価関数
# ---------------------------
def compute_metrics(y_true, y_prob, threshold=0.5):
    """Compute ROC-AUC, F1 and average precision for binary predictions.

    Args:
        y_true: Ground-truth labels.
        y_prob: Predicted probabilities as a NumPy array (`.astype` is called on it).
        threshold: Cut-off applied to `y_prob` to obtain hard labels for F1.

    Returns:
        Dict with keys 'auc', 'f1' and 'ap'. When `y_true` holds a single class,
        'auc' falls back to 0.5 and 'ap' to 0.0.
    """
    hard_preds = (y_prob >= threshold).astype(int)
    if len(np.unique(y_true)) > 1:
        auc = roc_auc_score(y_true, y_prob)
    else:
        auc = 0.5
    f1 = f1_score(y_true, hard_preds, zero_division=0)
    if len(np.unique(y_true)) > 1:
        ap = average_precision_score(y_true, y_prob)
    else:
        ap = 0.0
    return dict(auc=auc, f1=f1, ap=ap)

# ---------------------------
# 学習ループ（簡易）
# ---------------------------
def train_model(model, data, epochs=50, lr=3e-3, weight_decay=1e-5):
    """Train `model` full-batch with Adam + BCE and restore the best-AUC weights.

    Args:
        model: Module whose forward returns `(probs, importance)`.
        data: Graph object with labels in `data.y`.
        epochs: Number of full-batch epochs.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        The model, moved to the module-level `device`, with the weights of the
        epoch that reached the highest AUC loaded back in.

    Note:
        Metrics are computed on the same nodes used for training, so they
        measure fit rather than generalisation.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCELoss()
    best_auc = 0.0
    best_state = None
    y = data.y.cpu().numpy()
    for epoch in range(1, epochs+1):
        model.train()
        optimizer.zero_grad()
        probs, imp = model(data)
        loss = criterion(probs, data.y.float())
        loss.backward()
        optimizer.step()
        # Evaluate in eval mode (dropout off).
        model.eval()
        with torch.no_grad():
            probs_eval, _ = model(data)
            metrics = compute_metrics(y, probs_eval.cpu().numpy())
        if metrics['auc'] > best_auc:
            best_auc = metrics['auc']
            # clone() is required: on CPU, `.cpu()` returns the very same tensor, so
            # without a copy the snapshot would keep changing with later optimizer steps.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        if epoch % 10 == 0 or epoch==1:
            print(f"Epoch {epoch:03d} loss={loss.item():.4f} auc={metrics['auc']:.4f} f1={metrics['f1']:.4f} ap={metrics['ap']:.4f}")
    # Restore the best snapshot, if any epoch improved on the initial AUC of 0.0.
    if best_state is not None:
        model.load_state_dict({k: v.to(device) for k,v in best_state.items()})
    return model

# ---------------------------
# 実行
# ---------------------------
# Build the model with the feature width of `data` and train it.
model = CaT_GNN(in_dim=data.x.size(1), hid_dim=128, heads=4, n_layers=2, env_ratio=0.2, mix_k=3, mix_mode='learn')
model = train_model(model, data, epochs=60, lr=3e-3)
# Final evaluation, on the same graph used for training.
model.eval()
with torch.no_grad():
    probs, imp = model(data)
    metrics = compute_metrics(data.y.cpu().numpy(), probs.cpu().numpy())
print("Final metrics:", metrics)


ModuleNotFoundError: No module named 'torch'

# 全体像

ログ

 ↓ 時間窓集約

ユーザ × リソース グラフ（t=1）

 ↓ GCN

ノード埋め込み h_i^t

 ↓ Pooling

グラフ埋め込み g_t

 ↓

graph_embeddings = [g_1, g_2, ..., g_T]

 ↓

HMM



# データ前処理、特徴量処理

In [None]:
import kagglehub

# Download the latest version of the Kaggle dataset through kagglehub.
# `path` is the directory the files were downloaded to (printed below).
path = kagglehub.dataset_download("aryan208/cybersecurity-threat-detection-logs")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/aryan208/cybersecurity-threat-detection-logs?dataset_version_number=1...


100%|██████████| 95.7M/95.7M [00:02<00:00, 49.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/aryan208/cybersecurity-threat-detection-logs/versions/1


In [None]:
# Preprocessing: install the required libraries (versions are not pinned here).
!pip install pandas numpy torch torch-geometric scikit-learn

Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [None]:
import os

# Dataset directory reported by the kagglehub download cell.
base_path = "/root/.cache/kagglehub/datasets/aryan208/cybersecurity-threat-detection-logs/versions/1"

# Print the full path of every file found below base_path.
for dirpath, _subdirs, filenames in os.walk(base_path):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

/root/.cache/kagglehub/datasets/aryan208/cybersecurity-threat-detection-logs/versions/1/cybersecurity_threat_detection_logs.csv


In [None]:
import pandas as pd

# Join only the file name onto base_path. Joining an absolute path would make
# os.path.join discard base_path, so changing base_path would have no effect.
df = pd.read_csv(os.path.join(base_path, "cybersecurity_threat_detection_logs.csv"))

# Only the last expression of a cell is rendered, so df.head() shows nothing here.
df.head()
df.info()
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000000 entries, 0 to 5999999
Data columns (total 10 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   timestamp          object
 1   source_ip          object
 2   dest_ip            object
 3   protocol           object
 4   action             object
 5   threat_label       object
 6   log_type           object
 7   bytes_transferred  int64 
 8   user_agent         object
 9   request_path       object
dtypes: int64(1), object(9)
memory usage: 457.8+ MB


Index(['timestamp', 'source_ip', 'dest_ip', 'protocol', 'action',
       'threat_label', 'log_type', 'bytes_transferred', 'user_agent',
       'request_path'],
      dtype='object')

In [None]:
# Normalise the time column name, parse it to datetime and sort the log chronologically.
# NOTE(review): df.info() above lists a "timestamp" column and no "time" column, so the
# rename appears to be a no-op for this dataset.
df = df.rename(columns={
    "time": "timestamp",   # adjust if needed
})

df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values("timestamp")

In [None]:
# Aggregate the log per time window (the observation unit for the HMM).
# NOTE(review): the recorded outputs further down show 365 windows, which looks like
# a daily window rather than "1h" — confirm which value produced those results.
WINDOW = "1h"

# Floor every timestamp to the start of its window.
df['time_window'] = df['timestamp'].dt.floor(WINDOW)

# One row per (window, source, destination): total bytes and number of log lines.
agg = (
    df.groupby(['time_window', 'source_ip', 'dest_ip'])
      .agg(
          bytes_sum=('bytes_transferred', 'sum'),
          access_count=('bytes_transferred', 'count')
      )
      .reset_index()
)

agg.head()

Unnamed: 0,time_window,source_ip,dest_ip,bytes_sum,access_count
0,2024-01-01,103.172.167.96,192.168.1.1,25015,1
1,2024-01-01,103.172.167.96,192.168.1.102,38357,1
2,2024-01-01,103.172.167.96,192.168.1.103,10858,1
3,2024-01-01,103.172.167.96,192.168.1.119,28199,1
4,2024-01-01,103.172.167.96,192.168.1.128,6035,1


In [None]:
# Encode users (source IPs) and resources (destination IPs) as integer ids.
from sklearn.preprocessing import LabelEncoder

# Separate encoders, so user ids and resource ids each start at 0 and overlap numerically.
le_user = LabelEncoder()
le_res = LabelEncoder()

# NOTE(review): the active graph-building cell below builds its own per-window node map
# and does not read user_id / res_id — confirm whether these columns are still needed.
agg['user_id'] = le_user.fit_transform(agg['source_ip'])
agg['res_id']  = le_res.fit_transform(agg['dest_ip'])

In [None]:
# Convert every time window into a PyTorch Geometric graph.
from torch_geometric.data import Data

graphs = []   # one Data object per time window
times = []    # the window key for graphs[i]

for t, g in agg.groupby("time_window"):
    # Node ids are local to this window: every IP seen in the window (as source or
    # destination) gets the next free index.
    # NOTE(review): the same index therefore refers to different IPs in different
    # windows, while GCNEncoder uses one shared embedding table — confirm this is intended.
    nodes = pd.unique(g[['source_ip', 'dest_ip']].values.ravel())
    node_map = {n: i for i, n in enumerate(nodes)}

    # [E, 2] list of (source, destination) pairs, transposed to the [2, E] layout.
    edge_index = torch.tensor(
        [[node_map[u], node_map[v]] for u, v in zip(g['source_ip'], g['dest_ip'])],
        dtype=torch.long
    ).t().contiguous()

    # Edge features: total bytes and access count per (source, destination) pair.
    edge_attr = torch.tensor(
        g[['bytes_sum', 'access_count']].values,
        dtype=torch.float
    )

    data_t = Data(
        edge_index=edge_index,
        edge_attr=edge_attr,
        num_nodes=len(nodes)
    )

    graphs.append(data_t)
    times.append(t)

# Sanity check: one window key per graph. The chronological order of `times`
# relies on groupby iterating the window keys in sorted order.
assert len(graphs) == len(times)

In [None]:
# #PyTorch Geometric 用グラフ構造に変換
# import numpy as np
# import torch

# edge_index = torch.from_numpy(
#     np.vstack([
#         agg['user_id'].values,
#         agg['res_id'].values
#     ])
# ).long()


# edge_attr = torch.from_numpy(
#     agg[['bytes_sum', 'access_count']].values
# ).float()

# data = Data(
#     edge_index=edge_index,
#     edge_attr=edge_attr,
#     num_nodes=max(agg['user_id'].max(), agg['res_id'].max()) + 1
# )

# data

# #時間窓ごとにグラフを分割（HMM入力用）
# graphs = {}

# for t, g in agg.groupby("time_window"):
#     edge_index = torch.from_numpy(
#         np.vstack([g['user_id'].values, g['res_id'].values])
#     ).long()

#     edge_attr = torch.from_numpy(
#         g[['bytes_sum', 'access_count']].values
#     ).float()

#     graphs[t] = Data(
#         edge_index=edge_index,
#         edge_attr=edge_attr,
#         num_nodes=data.num_nodes
#     )

Data(edge_index=[2, 5482788], edge_attr=[5482788, 2], num_nodes=354)

# GCN によるノード埋め込み（PyTorch Geometric）

In [None]:
import torch
from torch_geometric.nn import GCNConv

class GCNEncoder(torch.nn.Module):
    """Two-layer GCN over a learned per-node embedding table.

    Args:
        num_nodes: Number of rows in the embedding table.
        hidden_dim: Width of the embeddings and of both GCN layers.
    """

    def __init__(self, num_nodes, hidden_dim=64):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_nodes, hidden_dim)
        self.conv1 = GCNConv(hidden_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self, data):
        """Return node embeddings of shape [num_nodes, hidden_dim].

        The whole embedding table is used as input, so the number of output rows
        is fixed by the constructor's `num_nodes`; only `data.edge_index` is read.
        """
        edge_index = data.edge_index
        hidden = self.conv1(self.embedding.weight, edge_index).relu()
        return self.conv2(hidden, edge_index)


# グラフ全体埋め込み（Graph-level Representation）

In [None]:
#②-1 Mean Pooling（最も安定）
from torch_geometric.nn import global_mean_pool

def graph_embedding(node_emb):
    """Collapse a node-embedding matrix into a single vector by averaging over dim 0."""
    pooled = torch.mean(node_emb, dim=0)
    return pooled


In [None]:
# (2)-2 Attention pooling (more advanced alternative to mean pooling).
from torch_geometric.nn import AttentionalAggregation
import torch

# NOTE(review): the cell that built the real full-graph `data` is commented out above, so
# on a fresh run `data` is presumably the synthetic graph from the CaT-GNN cell — confirm.
model = GCNEncoder(num_nodes=data.num_nodes, hidden_dim=64)
node_emb = model(data)

# The gate network maps each 64-d node embedding to one attention logit.
att_pool = AttentionalAggregation(
    gate_nn=torch.nn.Linear(64, 1)
)

# All nodes belong to graph 0, so pooling yields a single graph-level vector.
batch = torch.zeros(node_emb.size(0), dtype=torch.long)
g_t = att_pool(node_emb, batch)

print(g_t.shape)


torch.Size([1, 64])


# graph_embeddings

In [None]:
# One pooled embedding per time-window graph.
# NOTE(review): no training step for `model` or `att_pool` appears in the cells above,
# so these embeddings presumably come from randomly initialised weights — confirm.
graph_embeddings = []

model.eval()
with torch.no_grad():
    for data_t in graphs:   # one graph per time window
        node_emb = model(data_t)  # (N_t, d)

        # Assign every node to graph 0 so the pooling returns one vector.
        batch = torch.zeros(
            node_emb.size(0),
            dtype=torch.long,
            device=node_emb.device
        )

        g_t = att_pool(node_emb, batch)  # (1, d)
        graph_embeddings.append(g_t.squeeze(0).cpu().numpy())

print(len(graph_embeddings), graph_embeddings[0].shape)


365 (64,)


# GNN 出力 → HMM（hmmlearn）

In [None]:
# Install hmmlearn, which provides the GaussianHMM used below (version not pinned).
!pip install hmmlearn

Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3


In [None]:
import numpy as np
from hmmlearn.hmm import GaussianHMM

# Stack the per-window graph embeddings into the HMM observation matrix.
X = np.stack(graph_embeddings)  # (T, d)

# 3-state Gaussian HMM with a full covariance matrix per state; fixed seed.
hmm = GaussianHMM(
    n_components=3,
    covariance_type="full",
    n_iter=200,
    random_state=42
)

# Fit on the whole sequence, then decode the state sequence for the same data.
hmm.fit(X)
states = hmm.predict(X)

In [None]:
# Decoded hidden state for every row of X (one per time window).
print(states)

[1 0 1 2 0 1 0 1 0 1 2 2 2 2 2 2 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 0 1
 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 2 2 2 1 0 1 0 1 0 1 0 1 0 1 0 1 0
 1 0 1 0 1 2 2 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 2 2
 0 1 2 0 1 0 1 0 1 0 1 0 1 0 1 2 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
 2 2 2 2 2 2 2 0 1 2 2 0 1 0 1 0 1 0 1 0 1 0 1 2 0 1 2 2 1 0 1 0 1 0 1 0 1
 0 1 0 1 0 1 2 2 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 2 2 2 0 1 0 1 2 2 2
 2 2 1 0 1 0 1 2 2 0 1 0 1 0 1 0 1 2 2 2 1 0 1 0 1 0 1 0 1 2 2 1 0 1 0 1 0
 1 1 0 1 2 1 2 0 1 2 2 2 1 0 1 0 1 2 0 1 2 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
 2 2 2 2 1 0 1 0 1 0 1 2 1 2 0 1 0 1 0 0 1 2 2 2 0 1 0 1 0 2 0 1 0 1 0 1 2
 2 2 2 2 0 1 0 1 0 1 2 2 2 0 1 2 0 1 0 1 0 1 0 1 0 1 0 1 0 1 2 2]


In [None]:
# (2)-1 Dwell time per state: how many windows were assigned to each hidden state.
import pandas as pd

state_series = pd.Series(states)

print(state_series.value_counts())

1    152
0    140
2     73
Name: count, dtype: int64


In [None]:
# (2)-2 State transition matrix (the most important diagnostic).
print(hmm.transmat_)


[[0.00714289 0.98571425 0.00714286]
 [0.78947368 0.0131579  0.19736842]
 [0.26388883 0.15277784 0.58333333]]


In [None]:
# (2)-3 Graph-level feature per state (to give the states a meaning):
# mean L2 norm of the graph embeddings assigned to each hidden state.
import numpy as np

# Take the number of states from the fitted model instead of repeating the literal 3.
for k in range(hmm.n_components):
    idx = np.where(states == k)[0]
    print(f"State {k}: mean ||g_t|| =",
          np.mean(np.linalg.norm(X[idx], axis=1)))


State 0: mean ||g_t|| = 6.299156
State 1: mean ||g_t|| = 6.2389765
State 2: mean ||g_t|| = 6.2719097


In [None]:
# (4)-1 Extract the time windows decoded as the anomalous state (example: state == 2).
# NOTE(review): the per-state embedding norms printed above are nearly identical, so
# picking state 2 as "anomalous" is an assumption to confirm.
anomaly_times = [times[t] for t in np.where(states == 2)[0]]
print(anomaly_times)

[Timestamp('2024-01-04 00:00:00'), Timestamp('2024-01-11 00:00:00'), Timestamp('2024-01-12 00:00:00'), Timestamp('2024-01-13 00:00:00'), Timestamp('2024-01-14 00:00:00'), Timestamp('2024-01-15 00:00:00'), Timestamp('2024-01-16 00:00:00'), Timestamp('2024-02-27 00:00:00'), Timestamp('2024-02-28 00:00:00'), Timestamp('2024-02-29 00:00:00'), Timestamp('2024-03-20 00:00:00'), Timestamp('2024-03-21 00:00:00'), Timestamp('2024-04-19 00:00:00'), Timestamp('2024-04-20 00:00:00'), Timestamp('2024-04-23 00:00:00'), Timestamp('2024-05-06 00:00:00'), Timestamp('2024-05-28 00:00:00'), Timestamp('2024-05-29 00:00:00'), Timestamp('2024-05-30 00:00:00'), Timestamp('2024-05-31 00:00:00'), Timestamp('2024-06-01 00:00:00'), Timestamp('2024-06-02 00:00:00'), Timestamp('2024-06-03 00:00:00'), Timestamp('2024-06-06 00:00:00'), Timestamp('2024-06-07 00:00:00'), Timestamp('2024-06-20 00:00:00'), Timestamp('2024-06-23 00:00:00'), Timestamp('2024-06-24 00:00:00'), Timestamp('2024-07-10 00:00:00'), Timestamp('20

# ベンチマーク

In [None]:
# 1 if the window contains at least one non-benign log line, otherwise 0.
# Taking .max() of the raw string labels returns the lexicographically largest label
# (every window came out as 'suspicious'), not a binary flag.
# NOTE(review): assumes the benign class is spelled "benign" in `threat_label` — TODO confirm
# against df['threat_label'].unique().
BENIGN_LABEL = "benign"
y_true = (
    df["threat_label"].ne(BENIGN_LABEL)
      .groupby(df["time_window"])
      .max()
      .loc[times]
      .astype(int)
      .values
)


['suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicious' 'suspicious' 'suspicious' 'suspicious' 'suspicious'
 'suspicio

## HMM-only

In [None]:
# HMM-only baseline: summary statistics of each time window, with no graph structure.
# A single groupby replaces filtering the whole `agg` frame once per window.
window_stats = (
    agg.groupby('time_window')
       .agg(
           bytes_total=('bytes_sum', 'sum'),
           access_total=('access_count', 'sum'),
           n_sources=('source_ip', 'nunique'),
           n_dests=('dest_ip', 'nunique'),
       )
       # Same row order as `times`; a window missing from `agg` yields an all-zero row,
       # which is what filtering an empty frame would have produced.
       .reindex(times, fill_value=0)
)

# Rows: [total bytes, total accesses, distinct sources, distinct destinations].
features = window_stats.values.tolist()

X_hmm = np.array(features)  # (T, d)

In [None]:
from hmmlearn.hmm import GaussianHMM

# Same HMM configuration as the GNN+HMM pipeline, fitted on the raw window statistics.
hmm_only = GaussianHMM(
    n_components=3,
    covariance_type="full",
    n_iter=200,
    random_state=42
)

hmm_only.fit(X_hmm)

states_hmm = hmm_only.predict(X_hmm)
# NOTE(review): per the hmmlearn docs, score_samples returns (log_prob, posteriors), so [0]
# is presumably one log-likelihood for the whole sequence, not a per-window score — confirm.
loglik_hmm = hmm_only.score_samples(X_hmm)[0]




## GNN-only（時間なし・構造のみ）

In [None]:
import torch
from torch_geometric.nn import GCNConv

class GCNEncoder(torch.nn.Module):
    """Learned node-embedding table refined by two GCN layers.

    This repeats the GCNEncoder definition from the earlier embedding section;
    running this cell rebinds the name.

    Args:
        num_nodes: Size of the embedding table.
        hidden_dim: Embedding and GCN layer width.
    """

    def __init__(self, num_nodes, hidden_dim=64):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_nodes, hidden_dim)
        self.conv1 = GCNConv(hidden_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self, data):
        """Run both GCN layers over the full embedding table using `data.edge_index`."""
        edges = data.edge_index
        first = self.conv1(self.embedding.weight, edges)
        activated = torch.relu(first)
        return self.conv2(activated, edges)

In [None]:
# Graph embeddings via mean pooling.
from torch_geometric.nn import global_mean_pool

# NOTE(review): this creates a fresh encoder and no training step follows in this cell,
# so the embeddings presumably come from randomly initialised weights — confirm.
model = GCNEncoder(num_nodes=data.num_nodes, hidden_dim=64)

graph_embeddings = []

model.eval()
with torch.no_grad():
    for d in graphs:   # only the graphs are iterated; the window timestamps are not used
        node_emb = model(d)
        # Single-graph batch vector so global_mean_pool returns one row.
        batch = torch.zeros(node_emb.size(0), dtype=torch.long)
        g_emb = global_mean_pool(node_emb, batch)
        graph_embeddings.append(g_emb.squeeze(0))


# list -> Tensor
graph_embeddings = torch.stack(graph_embeddings)  # (T, d)

# Tensor -> numpy (for the HMM)
X_gnn = graph_embeddings.numpy()


In [None]:
# Distance-based anomaly score: L2 distance of each window's embedding
# from the mean embedding over all windows.
center = X_gnn.mean(axis=0)
score_gnn = np.linalg.norm(X_gnn - center, axis=1)

print(center)
print(score_gnn)

[ 0.0626632  -0.45314085  1.1826874  -0.29683053 -0.7799158  -0.40817636
  1.400655    0.56546646  0.7829114  -0.38766617 -0.22717665 -1.350349
  0.14194407 -1.8211545   0.15707104 -0.02916639  0.41207504 -0.00380835
 -1.8934249  -0.01651706  0.99174637  1.6753844   0.82970005 -0.05753732
 -1.2248442   1.9541584  -0.57943296  0.55176485 -0.08248364  0.6468075
 -1.9211559   0.00267815 -1.6548649   1.2441369   1.3403865   0.26201084
  0.827159   -0.96655476 -0.17738171  1.0208024  -0.22396646  0.8368466
 -1.3960623   0.39841127 -1.2554045  -1.0415889   0.71536136 -0.2646621
 -1.2645024   0.9809791  -1.9708035   0.07174475 -0.35369718 -0.4743427
  0.13070817 -1.2983385   1.5321132   1.3450783  -2.6692235   0.904907
 -1.9256772  -0.1683672   1.7575938  -0.15285511]
[0.6390015  0.5170152  0.47815874 0.64287055 0.6356545  0.46365407
 0.5752285  0.55194    0.70990396 0.68600875 0.6230674  0.50173646
 0.64828026 0.6158026  0.6435692  0.6641283  0.4418879  0.5409468
 0.48976758 0.5045158  0.625

## LSTM（時系列のみ・構造なし）

In [None]:
import torch.nn as nn

# Standardise every feature (zero mean, unit variance) before feeding the LSTM.
# The raw window statistics are byte totals and counts on very different, very large
# scales, which is why the recorded reconstruction errors are around 4e16.
# Done in float64 NumPy first to avoid float32 rounding on the large byte totals;
# a constant feature maps to 0 instead of dividing by zero.
X_hmm_scaled = (X_hmm - X_hmm.mean(axis=0)) / np.maximum(X_hmm.std(axis=0), 1e-8)
X_lstm = torch.tensor(X_hmm_scaled, dtype=torch.float32)  # (T, d)
X_lstm = X_lstm.unsqueeze(0)  # (1, T, d)

class LSTMAE(nn.Module):
    """LSTM autoencoder for a (batch, time, feature) sequence.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Width of the encoder and decoder LSTM states.

    The decoder LSTM is followed by a linear read-out. An LSTM's own output is
    `o * tanh(c)` and therefore confined to (-1, 1), so it cannot reconstruct
    targets outside that range by itself.
    """

    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        # Unbounded projection back to the feature space.
        self.output = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Return a reconstruction with the same shape as `x`."""
        z, _ = self.encoder(x)
        out, _ = self.decoder(z)
        return self.output(out)


In [None]:
# Autoencoder sized to the number of window statistics; Adam + MSE reconstruction loss.
model_lstm = LSTMAE(input_dim=X_hmm.shape[1])
opt = torch.optim.Adam(model_lstm.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# 100 full-sequence gradient steps; the whole series is a single batch of one sequence.
for epoch in range(100):
    opt.zero_grad()
    recon = model_lstm(X_lstm)
    loss = loss_fn(recon, X_lstm)
    loss.backward()
    opt.step()


In [None]:
# Anomaly score per time window: mean squared reconstruction error over the features.
with torch.no_grad():
    recon = model_lstm(X_lstm)
    # Average over the feature axis (dim=2), then drop the batch axis -> shape (T,).
    score_lstm = torch.mean((recon - X_lstm) ** 2, dim=2).squeeze().numpy()

print(score_lstm)

[4.2881220e+16 4.2275934e+16 4.1085365e+16 4.1986028e+16 4.2687366e+16
 4.3764372e+16 4.2926377e+16 4.1566036e+16 4.1744101e+16 4.1364164e+16
 4.2540968e+16 4.2185628e+16 4.1679810e+16 4.2279894e+16 4.1667045e+16
 4.2299690e+16 4.1719208e+16 4.2235158e+16 4.1705103e+16 4.1334538e+16
 4.1628468e+16 4.2270261e+16 4.1451606e+16 4.3263519e+16 4.1877426e+16
 4.2262899e+16 4.2443524e+16 4.3745221e+16 4.2203534e+16 4.3581553e+16
 4.1897543e+16 4.1744844e+16 4.2635874e+16 4.2853066e+16 4.1549977e+16
 4.3372173e+16 4.1805232e+16 4.2541887e+16 4.2240544e+16 4.2999052e+16
 4.1924408e+16 4.2434960e+16 4.2680512e+16 4.2041171e+16 4.1097924e+16
 4.1419256e+16 4.2250126e+16 4.2636621e+16 4.2023519e+16 4.1747645e+16
 4.1665525e+16 4.3515423e+16 4.1688640e+16 4.2339311e+16 4.2458638e+16
 4.1155094e+16 4.3044252e+16 4.3214247e+16 4.1935687e+16 4.1731543e+16
 4.2213911e+16 4.0757195e+16 4.2137782e+16 4.2596833e+16 4.1740700e+16
 4.1590827e+16 4.1855960e+16 4.3797353e+16 4.4014885e+16 4.1782421e+16
 4.275