# Template Model Inference & Thresholding

In [None]:
import json
import math
from pathlib import Path
from typing import Dict, List

import yaml
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset


In [None]:
# Locate the repository root: if the working directory is itself named
# `notebooks`, the root is its parent; otherwise the working directory is used.
CWD = Path.cwd().resolve()
if CWD.name == 'notebooks':
    REPO_ROOT = CWD.parent
else:
    REPO_ROOT = CWD

# Project configuration, read by the cells that follow.
config_path = REPO_ROOT / 'configs/train_openstack.yaml'
cfg = yaml.safe_load(config_path.read_text())

# Use CUDA when PyTorch reports it as available, else fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Device: {device}")


In [None]:
def read_sequences(parquet_path: Path):
    """Load integer sequences from the ``templates`` column of a parquet file.

    Rows that are null/empty or contain a single element are dropped; every
    element of the remaining rows is converted with ``int``.

    Args:
        parquet_path: Location of the parquet file to read.

    Returns:
        A list of lists of ints, one inner list per retained row, in file order.
    """
    templates_column = pq.read_table(parquet_path, columns=['templates']).column(0)
    return [
        [int(token) for token in row]
        for row in templates_column.to_pylist()
        if row and len(row) > 1
    ]

class TemplateSequenceDataset(Dataset):
    """Map-style dataset that serves an in-memory collection of sequences by index."""

    def __init__(self, sequences):
        # Stored by reference; no copy or validation is performed.
        self.sequences = sequences

    def __len__(self):
        """Return how many sequences are stored."""
        count = len(self.sequences)
        return count

    def __getitem__(self, idx):
        """Return the sequence at position ``idx`` exactly as stored."""
        item = self.sequences[idx]
        return item

class TemplateBatchCollator:
    """Collate variable-length id sequences into padded next-token batches.

    Each sequence is truncated to ``max_length`` items and split into an input
    (all items but the last) and a target (all items but the first). Shorter
    rows are right-padded with ``pad_id``.

    Args:
        pad_id: Value used to fill padded positions of inputs and targets.
        max_length: Maximum number of items kept from each sequence.
    """

    def __init__(self, pad_id: int, max_length: int):
        self.pad_id = pad_id
        self.max_length = max_length

    def __call__(self, batch):
        """Build the padded tensors for one batch.

        Args:
            batch: Non-empty list of integer sequences.

        Returns:
            Dict with ``input_ids``, ``target_ids`` and ``attention_mask``, each a
            ``torch.long`` tensor of shape (kept sequences, longest kept length - 1).
            ``attention_mask`` is 1 on real input positions and 0 on padding.
        """
        # Sequences with fewer than two items have no (input, target) pair.
        filtered = [seq[: self.max_length] for seq in batch if len(seq) > 1]
        if not filtered:
            # Best-effort fallback so a batch is still produced: keep the first
            # sequence even though it yields no usable positions.
            filtered = [batch[0][: self.max_length]]
        longest = max(len(seq) for seq in filtered)
        # Clamp at zero: an empty fallback sequence would otherwise ask
        # torch.full for a negative dimension and raise.
        input_len = max(longest - 1, 0)
        bs = len(filtered)
        input_ids = torch.full((bs, input_len), self.pad_id, dtype=torch.long)
        target_ids = torch.full((bs, input_len), self.pad_id, dtype=torch.long)
        attention_mask = torch.zeros((bs, input_len), dtype=torch.long)
        for i, seq in enumerate(filtered):
            src = seq[:-1]  # tokens the model sees
            tgt = seq[1:]   # tokens the model must predict (shifted by one)
            input_ids[i, : len(src)] = torch.tensor(src, dtype=torch.long)
            target_ids[i, : len(tgt)] = torch.tensor(tgt, dtype=torch.long)
            attention_mask[i, : len(src)] = 1
        return {
            'input_ids': input_ids,
            'target_ids': target_ids,
            'attention_mask': attention_mask,
        }


In [None]:
class TemplateTransformer(nn.Module):
    """Transformer encoder stack run under a causal mask, emitting per-position logits.

    Token embeddings are summed with a learned positional parameter, passed
    through ``n_layers`` batch-first encoder layers (GELU activation), layer
    normalised, passed through dropout and projected to ``vocab_size`` logits.

    Args:
        vocab_size: Number of embedding rows and of output logits.
        pad_id: Index used as the embedding's ``padding_idx``.
        d_model: Embedding / hidden width.
        n_layers: Number of encoder layers.
        n_heads: Attention heads per layer.
        ffn_dim: Feed-forward width inside each encoder layer.
        dropout: Dropout probability (encoder layers and output dropout).
        max_length: Longest supported sequence; sizes the positional
            parameter and the causal mask buffer.
    """

    def __init__(self, vocab_size: int, pad_id: int, d_model: int, n_layers: int, n_heads: int,
                 ffn_dim: int, dropout: float, max_length: int):
        super().__init__()
        self.pad_id = pad_id
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.positional = nn.Parameter(torch.zeros(1, max_length, d_model))
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=ffn_dim,
            dropout=dropout,
            batch_first=True,
            activation='gelu',
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(d_model)
        self.output = nn.Linear(d_model, vocab_size)
        # True strictly above the diagonal: position i may not attend to j > i.
        # Non-persistent, so it is not part of the state dict.
        future_positions = torch.triu(torch.ones(max_length, max_length), diagonal=1).bool()
        self.register_buffer('causal_mask', future_positions, persistent=False)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        """Compute logits for every input position.

        Args:
            input_ids: Integer token ids; dimension 1 is the sequence axis.
            attention_mask: Same layout as ``input_ids``; entries equal to 0
                are treated as padding keys.

        Returns:
            Logits whose last dimension has size ``vocab_size``.
        """
        length = input_ids.size(1)
        hidden = self.embedding(input_ids) + self.positional[:, :length, :]
        blocked = self.causal_mask[:length, :length]
        # Additive float mask: 0 where attention is allowed, -inf where blocked.
        additive_mask = torch.zeros_like(blocked, dtype=torch.float).masked_fill(blocked, float('-inf'))
        padding_keys = attention_mask.eq(0)
        hidden = self.encoder(hidden, mask=additive_mask, src_key_padding_mask=padding_keys)
        hidden = self.norm(hidden)
        hidden = self.dropout(hidden)
        return self.output(hidden)


In [None]:
# Load vocab and datasets
vocab_file = REPO_ROOT / cfg['data']['vocab_path']
vocab = json.loads(vocab_file.read_text())
base_vocab_size = len(vocab.get('id_to_template', []))

# The padding id is either placed right after the real templates (the
# 'vocab_size' strategy) or taken from the config as an explicit integer.
pad_strategy = cfg['data'].get('pad_token_strategy', 'vocab_size')
if pad_strategy != 'vocab_size':
    pad_id = int(pad_strategy)
    vocab_size = max(base_vocab_size + 1, pad_id + 1)
else:
    pad_id = base_vocab_size
    vocab_size = base_vocab_size + 1

max_seq_len = cfg['data'].get('max_sequence_length', 100)
collator = TemplateBatchCollator(pad_id=pad_id, max_length=max_seq_len)


def _eval_loader(sequences):
    """Wrap sequences in an unshuffled DataLoader using the eval batch size."""
    return DataLoader(
        TemplateSequenceDataset(sequences),
        batch_size=cfg['training']['eval_batch_size'],
        shuffle=False,
        collate_fn=collator,
    )


val_sequences = read_sequences((REPO_ROOT / cfg['data']['val_file']).resolve())
val_loader = _eval_loader(val_sequences)

test_sequences = read_sequences((REPO_ROOT / cfg['data']['test_file']).resolve())
test_loader = _eval_loader(test_sequences)


In [None]:
# Load fine-tuned model
model_cfg = cfg['model']
# Architecture hyper-parameters are taken verbatim from the config.
arch_kwargs = {
    key: model_cfg[key]
    for key in ('d_model', 'n_layers', 'n_heads', 'ffn_dim', 'dropout')
}
model = TemplateTransformer(
    vocab_size=vocab_size,
    pad_id=pad_id,
    max_length=max_seq_len,
    **arch_kwargs,
)
model = model.to(device)

ckpt_path = (REPO_ROOT / cfg['checkpointing']['output_dir'] / 'best.pt').resolve()
if ckpt_path.exists():
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state = torch.load(ckpt_path, map_location=device)
else:
    raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
model.load_state_dict(state['model_state_dict'])
model.eval()
# Cross-entropy that skips padded target positions.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)


In [None]:
def sequence_losses(model, loader):
    """Compute the mean per-token negative log-likelihood of every sequence.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to the module-level ``device``; target positions equal
    to the module-level ``pad_id`` are excluded from each sequence's average.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning logits.
        loader: Iterable of dicts with ``input_ids``, ``target_ids`` and
            ``attention_mask`` tensors.

    Returns:
        A flat list of floats, one per sequence, in loader order.
    """
    per_sequence = []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_ids, target_ids, attention_mask = (
                batch[name].to(device)
                for name in ('input_ids', 'target_ids', 'attention_mask')
            )
            log_probs = F.log_softmax(model(input_ids, attention_mask), dim=-1)
            n_classes = log_probs.size(-1)
            flat_nll = F.nll_loss(
                log_probs.view(-1, n_classes),
                target_ids.view(-1),
                ignore_index=pad_id,
                reduction='none',
            )
            token_nll = flat_nll.view(target_ids.size())
            valid = target_ids.ne(pad_id)
            totals = (token_nll * valid).sum(dim=1)
            # Guard against division by zero for rows with no real targets.
            counts = valid.sum(dim=1).clamp_min(1)
            per_sequence.extend((totals / counts).cpu().tolist())
    return per_sequence

# Score both splits first, then report the mean and spread of each.
val_losses = sequence_losses(model, val_loader)
test_losses = sequence_losses(model, test_loader)

val_mean, val_std = np.mean(val_losses), np.std(val_losses)
print(f"Validation loss mean {val_mean:.4f} | std {val_std:.4f}")

test_mean, test_std = np.mean(test_losses), np.std(test_losses)
print(f"Test loss mean {test_mean:.4f} | std {test_std:.4f}")


In [None]:
# The cut-off is the 95th percentile of the per-sequence validation losses.
threshold = float(np.percentile(val_losses, 95))
print(f"Suggested anomaly threshold (95th percentile): {threshold:.4f}")

# Persist the loss statistics and the threshold as JSON.
metrics_root = cfg['logging'].get('metrics_dir', 'artifacts/metrics/openstack')
metrics_dir = (REPO_ROOT / metrics_root).resolve()
metrics_dir.mkdir(parents=True, exist_ok=True)
metrics_path = metrics_dir / 'openstack_threshold.json'
summary = {
    'val_loss_mean': float(np.mean(val_losses)),
    'val_loss_std': float(np.std(val_losses)),
    'test_loss_mean': float(np.mean(test_losses)),
    'test_loss_std': float(np.std(test_losses)),
    'threshold': threshold
}
metrics_path.write_text(json.dumps(summary, indent=2))
print(f"Saved threshold summary to {metrics_path}")


In [None]:
def score_sequences(parquet_path: Path) -> pd.DataFrame:
    """Score every sequence in a parquet file against the anomaly threshold.

    Sequences are loaded with ``read_sequences``, batched with the module-level
    ``collator`` and scored by the module-level ``model``.

    Args:
        parquet_path: Parquet file to score.

    Returns:
        A DataFrame with ``sequence_index``, ``avg_loss`` and ``is_anomaly``
        (True when the loss is at or above the module-level ``threshold``).
        ``sequence_index`` counts the sequences kept by ``read_sequences``,
        which drops empty and single-item rows, so it need not equal the
        row number in the parquet file.
    """
    dataset = TemplateSequenceDataset(read_sequences(parquet_path))
    loader = DataLoader(
        dataset,
        batch_size=cfg['training']['eval_batch_size'],
        shuffle=False,
        collate_fn=collator,
    )
    scores = sequence_losses(model, loader)
    columns = {
        'sequence_index': range(len(scores)),
        'avg_loss': scores,
        'is_anomaly': [score >= threshold for score in scores],
    }
    return pd.DataFrame(columns)

# Example usage (disabled by default)
# scored = score_sequences((REPO_ROOT / 'artifacts/openstack_finetune/val.parquet').resolve())
# scored.head()
