# 00 · Prepare Data for LogBERT Pipeline

This notebook downloads public log datasets, applies regex normalization, mines templates with Drain3, and materializes tokenized Parquet splits ready for training and evaluation.

## Notebook Goals
- Install pinned dependencies for the workstation environment.
- Fetch HDFS and OpenStack log corpora with checksum verification and automatic mirror fallback.
- Apply regex-based normalization rules from `configs/data.yaml` and preview before/after examples.
- Mine log templates in streaming mode with Drain3 and persist template transitions as Parquet.
- Build Hugging Face datasets with BERT-compatible tokenization, time-based splits (80/10/10), and truncation stats.
- Save artifacts (tokenizer, processed Parquet, metadata) for downstream notebooks.

## 1. Environment Setup

In [1]:
import os, sys, subprocess
from pathlib import Path

if os.environ.get('SKIP_REQUIREMENTS', '0') == '1':
    print('SKIP_REQUIREMENTS=1 -> skipping pip install from requirements.txt')
else:
    req_path = Path('requirements.txt')
    if req_path.exists():
        print('[setup] Installing dependencies from requirements.txt ...')
        completed = subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', str(req_path)])
        if completed.returncode != 0:
            raise RuntimeError('pip installation failed; inspect output above.')
    else:
        print('requirements.txt not found; skipping installation')

requirements.txt not found; skipping installation


In [2]:
import json
import math
import tarfile
import hashlib
import shutil
import gc
import re
from collections import defaultdict
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, Iterable, List, Optional, Tuple

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import yaml
import requests
import pyarrow.parquet as pq


In [3]:
# Optional: Install polars for faster data loading
# !pip install polars

import polars as pl

### Load Configuration and Prepare Folders

In [4]:
!ls

00_prepare_data.ipynb	     03_anomaly_detection.ipynb        README.md
01_pretrain_hdfs.ipynb	     03_synthetic_log_inference.ipynb  README_NEW.md
02_finetune_openstack.ipynb  configs


In [5]:
def load_yaml(path: Path) -> Dict:
    with path.open('r') as fh:
        return yaml.safe_load(fh)

CWD = Path.cwd().resolve()
REPO_ROOT = CWD.parent if CWD.name == 'notebooks' else CWD

print(f"[bootstrap] Repository root: {REPO_ROOT}")

data_config = load_yaml(REPO_ROOT / 'configs' / 'data.yaml')
ARTIFACTS_ROOT = (REPO_ROOT / data_config.get('artifacts_root', 'artifacts')).resolve()
ARTIFACTS_ROOT.mkdir(parents=True, exist_ok=True)
print(f"[bootstrap] Artifacts root: {ARTIFACTS_ROOT}")


[bootstrap] Repository root: /home/tpi/distil_shahreyar
[bootstrap] Artifacts root: /home/tpi/distil_shahreyar/artifacts


In [6]:
# %% [markdown]
# Patch A: Repair drain3.ini (avoid multiline JSON) + validate

# %%
import json, configparser
from pathlib import Path

# Resolve repo roots (works from repo root OR notebooks/)
CWD = Path.cwd().resolve()
REPO_ROOT = CWD.parent if CWD.name == "notebooks" else CWD

# We'll write the ini in BOTH locations so whichever the code uses will be valid
INI_TARGETS = [
    REPO_ROOT / "configs" / "drain3.ini",
    REPO_ROOT / "notebooks" / "configs" / "drain3.ini",
]

for p in INI_TARGETS:
    p.parent.mkdir(parents=True, exist_ok=True)

# Build a safe, single-line JSON for mask_patterns
mask_patterns = [
    {"name": "REQ",  "pattern": r"\breq-[0-9a-fA-F\-]{8,}\b", "replace_with": "<OS_REQ>"},
    {"name": "UUID", "pattern": r"\b[0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}\b", "replace_with": "<UUID>"},
    {"name": "IPV4", "pattern": r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)(?:\.(?!$)|$)){4}\b", "replace_with": "<IP>"},
    {"name": "IPV6", "pattern": r"\b(?:[A-Fa-f0-9]{0,4}:){2,7}[A-Fa-f0-9]{0,4}\b", "replace_with": "<IP6>"},
    {"name": "HEX",  "pattern": r"\b0x[0-9a-fA-F]+\b|\b[0-9a-fA-F]{8,}\b", "replace_with": "<HEX>"},
    {"name": "NUM",  "pattern": r"(?<![A-Za-z])[-+]?\d+(?:\.\d+)?(?![A-Za-z])", "replace_with": "<NUM>"},
]
mask_patterns_json = json.dumps(mask_patterns, separators=(",", ":"))

ini_text = f"""[DRAIN]
sim_th = 0.4
depth = 4
max_children = 100
max_clusters = 100000
extra_delimiters = ["/","_","=","&","?","-",".",":",","]

[PROFILING]
enabled = false

[MASKING]
mask_prefix = <
mask_suffix = >
mask_patterns = {mask_patterns_json}
"""

# Write files
for ini_path in INI_TARGETS:
    ini_path.write_text(ini_text, encoding="utf-8")
    print(f"[drain3.ini] Wrote: {ini_path}")

# Validate with configparser + json
def validate_ini(path: Path):
    cp = configparser.ConfigParser()
    cp.read(path)
    raw = cp.get("MASKING", "mask_patterns", fallback="[]")
    try:
        parsed = json.loads(raw)
        assert isinstance(parsed, list) and len(parsed) >= 1
        print(f"[validate] OK: {path}  ({len(parsed)} mask patterns)")
        return True
    except Exception as e:
        print(f"[validate] FAILED: {path}  -> {e}")
        return False

_ = [validate_ini(p) for p in INI_TARGETS]

[drain3.ini] Wrote: /home/tpi/distil_shahreyar/configs/drain3.ini
[drain3.ini] Wrote: /home/tpi/distil_shahreyar/notebooks/configs/drain3.ini
[validate] OK: /home/tpi/distil_shahreyar/configs/drain3.ini  (6 mask patterns)
[validate] OK: /home/tpi/distil_shahreyar/notebooks/configs/drain3.ini  (6 mask patterns)


## 2. Download Public Datasets

In [7]:
# %% [markdown]
# ## Normalization rules (robust & ordered)
# Replace PII-like and high-cardinality tokens with placeholders to improve generalization and template stability.

# %%
import re, math
from typing import Optional

# Regex patterns for token normalization (correct word boundaries)
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
RE_URL   = re.compile(r"https?://\S+")
RE_OPENSTACK_REQ = re.compile(r"\breq-[0-9a-fA-F\-]{8,}\b", re.IGNORECASE)

RE_IPv4  = re.compile(r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)(?:\.(?!$)|$)){4}\b")
RE_IPv6  = re.compile(r"\b(?:[A-Fa-f0-9]{0,4}:){2,7}[A-Fa-fA-F0-9]{0,4}\b")
RE_UUID  = re.compile(r"\b[0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}\b")
RE_HEX   = re.compile(r"\b0x[0-9a-fA-F]+\b|\b[0-9a-fA-F]{8,}\b")  # long hex-ish tokens
RE_NUM   = re.compile(r"(?<![A-Za-z])[-+]?\d+(?:\.\d+)?(?![A-Za-z])")
RE_PATH  = re.compile(r"(?:/[^/\s]+)+")   # basic POSIX path

# Domain-specific
RE_HDFS_BLOCK = re.compile(r"blk_-?\d+")

# Context compaction & noisy HTML
RE_OS_CTX = re.compile(r"\[(\s*<OS_REQ>)[^\]]*\]")           # keep only <OS_REQ>
RE_HTML_DOCTYPE = re.compile(r"<!DOCTYPE[^>]*>", re.IGNORECASE)

# Helpers for numeric bucketing
def bucket_number(m: re.Match) -> str:
    s = m.group(0)
    try:
        if "." in s:
            val = float(s)
        else:
            val = int(s)
        if val == 0:
            return "<NUM_E0>"
        mag = int(math.floor(math.log10(abs(val))))
        return f"<NUM_E{mag}>"
    except Exception:
        return "<NUM>"

def _normalize_path(path: str) -> str:
    # Replace numeric/hex path segments with {id} to reduce churn
    parts = path.split("/")
    norm = []
    for p in parts:
        if not p:
            continue
        if RE_NUM.fullmatch(p) or RE_UUID.fullmatch(p) or RE_HEX.fullmatch(p):
            norm.append("{id}")
        else:
            norm.append(p)
    return "/" + "/".join(norm) if norm else path

def normalize_message(msg: str) -> str:
    if not msg:
        return msg
    out = msg

    # 1) General early
    out = RE_EMAIL.sub("<EMAIL>", out)
    out = RE_URL.sub("<URL>", out)

    # 2) IMPORTANT: req-* before UUID so we don't get "req-<UUID>"
    out = RE_OPENSTACK_REQ.sub("<OS_REQ>", out)

    # 3) The rest
    out = RE_IPv4.sub("<IP>", out)
    out = RE_IPv6.sub("<IP6>", out)
    out = RE_UUID.sub("<UUID>", out)
    out = RE_HDFS_BLOCK.sub("<HDFS_BLOCK>", out)
    out = RE_HEX.sub("<HEX>", out)
    out = RE_PATH.sub(lambda m: _normalize_path(m.group(0)), out)
    out = RE_NUM.sub(bucket_number, out)

    # 4) Compact noisy OpenStack bracket contexts after <OS_REQ>
    out = RE_OS_CTX.sub(r"[\1]", out)

    # 5) Collapse HTML doctypes
    out = RE_HTML_DOCTYPE.sub("<HTML_DOCTYPE>", out)

    # 6) Whitespace canon
    return re.sub(r"\s+", " ", out).strip()

## IO helpers (compressed readers)

In [8]:
import gzip, bz2

def open_maybe_compressed(path: Path, mode="rt", **kwargs):
    """Open .gz / .bz2 / plain files transparently."""
    suf = path.suffix.lower()
    if suf == ".gz":
        return gzip.open(path, mode, **kwargs)
    elif suf == ".bz2":
        return bz2.open(path, mode, **kwargs)
    else:
        return open(path, mode, **kwargs)

def yield_lines(path: Path, encoding="utf-8", errors="ignore"):
    """Stream lines from a (possibly compressed) log file."""
    with open_maybe_compressed(path, "rt", encoding=encoding, errors=errors) as fh:
        for line in fh:
            line = line.rstrip("\r\n")
            if line:
                yield line

In [9]:
# === Time utilities & session helpers ===
TS_FORMATS = [
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%y%m%d %H%M%S',
]


def parse_ts(text: str) -> Optional[pd.Timestamp]:
    if not text:
        return None
    for fmt in TS_FORMATS:
        try:
            return pd.to_datetime(text, format=fmt, errors='coerce')
        except Exception:
            continue
    return pd.to_datetime(text, errors='coerce')


def ensure_session_ids(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    if 'session_id' not in df.columns:
        df['session_id'] = None
    missing = df['session_id'].isna() | (df['session_id'].astype(str).str.len() == 0)
    if missing.any():
        df.loc[missing, 'session_id'] = [f"{prefix}-{i}" for i in range(missing.sum())]
    return df


In [10]:
# %% [markdown]
# ## HDFS parser (final & robust)

# %%
import pandas as pd
from typing import Any, Dict, Optional

# Strong patterns (comma/dot millis, ISO, thread, file:line, compact)
RE_HDFS_RICH_A = re.compile(
    r"^(?P<ts>\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:[.,]\d{3,6})?Z?)\s+"
    r"(?:(?:\[(?P<thr>[^\]]+)\])\s+)?"
    r"(?P<level>[A-Z]+)\s+"
    r"(?P<component>[A-Za-z0-9\.\$\-]+)"
    r"(?:\s*\([^)]*\))?\s*[:\-]\s*(?P<msg>.*)$"
)
RE_HDFS_RICH_B = re.compile(
    r"^(?P<ts>\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:[.,]\d{3,6})?Z?)\s+"
    r"(?P<level>[A-Z]+)\s+"
    r"(?:(?:\[(?P<thr>[^\]]+)\])\s+)?"
    r"(?P<component>[A-Za-z0-9\.\$\-]+)"
    r"(?:\s*\([^)]*\))?\s*[:\-]\s*(?P<msg>.*)$"
)
RE_HDFS_LEGACY  = re.compile(
    r"^(?P<ts>\d{6}\s+\d{6})\s+(?P<pid>\d+)\s+(?P<level>[A-Z]+)\s+(?P<component>[A-Za-z0-9\.\$\-]+)\s*[:\-]\s*(?P<msg>.*)$"
)
RE_HDFS_FALLBACK = re.compile(
    r"^(?P<ts>\S+\s+\S+)\s+(?P<level>[A-Z]+)\s+(?P<component>[A-Za-z0-9\.\$\-]+)\s*[:\-]\s*(?P<msg>.*)$"
)
RE_JAVA_CLASS_IN_MSG = re.compile(r"\b(?:[a-z_][a-z0-9_]*\.)+[A-Za-z0-9_\$]+\b")

def parse_ts(text: str) -> Optional[pd.Timestamp]:
    try:
        return pd.to_datetime(text)
    except Exception:
        return pd.NaT

def _parse_hdfs_compact_ts(ts_str: str) -> pd.Timestamp:
    # YYMMDD HHMMSS -> 20YY-MM-DD HH:MM:SS
    yy, mm, dd = ts_str[:2], ts_str[2:4], ts_str[4:6]
    hh, mi, ss = ts_str[7:9], ts_str[9:11], ts_str[11:13]
    year = f"20{yy}"
    return pd.to_datetime(f"{year}-{mm}-{dd} {hh}:{mi}:{ss}", format="%Y-%m-%d %H:%M:%S", errors="coerce")

def _parse_hdfs_fields(text: str) -> Optional[Dict[str, Any]]:
    m = RE_HDFS_LEGACY.match(text)
    if m:
        return {
            "timestamp": _parse_hdfs_compact_ts(m.group("ts")),
            "level": m.group("level"),
            "component": m.group("component"),
            "thread": m.group("pid"),
            "raw_message": m.group("msg"),
        }
    for rx in (RE_HDFS_RICH_A, RE_HDFS_RICH_B, RE_HDFS_FALLBACK):
        m = rx.match(text)
        if m:
            return {
                "timestamp": parse_ts(m.group("ts")) or pd.NaT,
                "level": m.group("level"),
                "component": m.group("component"),
                "thread": m.groupdict().get("thr"),
                "raw_message": m.group("msg"),
            }
    return None

def _backfill_component_if_unknown(fields: Dict[str, Any]) -> Dict[str, Any]:
    comp = fields.get("component") or "unknown"
    if comp == "unknown":
        mm = RE_JAVA_CLASS_IN_MSG.search(fields.get("raw_message") or "")
        if mm:
            fields["component"] = mm.group(0)
    return fields

def parse_hdfs_line(line: str) -> Optional[Dict[str, Any]]:
    f = _parse_hdfs_fields(line)
    if not f:
        return None
    blk = RE_HDFS_BLOCK.search(line)
    f["session_id"] = blk.group(0) if blk else None
    f = _backfill_component_if_unknown(f)
    f["norm_message"] = normalize_message(f["raw_message"])
    return f

def parse_hdfs_csv_row(row: pd.Series) -> Optional[Dict[str, Any]]:
    content = row.get("Content") or row.get("content") or row.get("Message") or row.get("message")
    if not isinstance(content, str):
        return None
    f = _parse_hdfs_fields(content)
    if not f:
        f = {
            "timestamp": parse_ts(str(row.get("Time") or row.get("time") or row.get("Timestamp") or row.get("timestamp"))) or pd.NaT,
            "level": str(row.get("Level") or row.get("level") or "INFO").upper(),
            "component": row.get("Component") or row.get("component") or "unknown",
            "thread": None,
            "raw_message": content,
        }
    f = _backfill_component_if_unknown(f)
    blk = RE_HDFS_BLOCK.search(content)
    f["session_id"] = blk.group(0) if blk else None
    f["norm_message"] = normalize_message(f["raw_message"])
    return f

In [11]:
# %% [markdown]
# ## OpenStack parser (filename prefix tolerant, robust fields)

# %%
RE_OPENSTACK_RICH = re.compile(
    r"^(?P<ts>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}(?:\.\d{3,6})?)\s+"
    r"(?:(?P<pid>\d+)\s+)?(?:(?P<tid>[^\s]+)\s+)?"
    r"(?P<level>[A-Z]+)\s+(?P<component>[^\s:]+)\s*[:\-]\s*(?P<msg>.*)$"
)
RE_OPENSTACK_GENERIC = re.compile(
    r"^(?P<ts>\S+\s+\S+)\s+(?P<level>[A-Z]+)\s+(?P<component>[\w\.\-\[\]/]+)\s*[:\-]\s*(?P<msg>.*)$"
)

def _strip_filename_prefix(line: str) -> str:
    # OpenStack files sometimes prefix a filename before the timestamp
    # e.g., "nova-api.log.1.2017-05-16_13:53:08 2017-05-16 00:00:00.008 ..."
    p0 = line.split(" ", 1)[0]
    if p0.endswith(".log") or ".log." in p0:
        parts = line.split(" ", 1)
        if len(parts) > 1:
            return parts[1]
    return line

def parse_openstack_line(line: str) -> Optional[Dict[str, Any]]:
    line = _strip_filename_prefix(line.strip())
    m = RE_OPENSTACK_RICH.match(line) or RE_OPENSTACK_GENERIC.match(line)
    if not m:
        parts = line.split(" ", 5)
        if len(parts) >= 6:
            ts = parse_ts(f"{parts[0]} {parts[1]}") or pd.NaT
            pid = parts[2] if parts[2].isdigit() else None
            level = parts[3] if parts[3].isupper() else "INFO"
            component = parts[4]; msg = parts[5]; tid = None
        else:
            return None
    else:
        ts = parse_ts(m.group("ts")) or pd.NaT
        level = m.group("level"); component = m.group("component"); msg = m.group("msg")
        pid = m.groupdict().get("pid"); tid = m.groupdict().get("tid")

    mr = RE_OPENSTACK_REQ.search(line)
    req = mr.group(0) if mr else None

    return {
        "timestamp": ts, "level": level, "component": component,
        "pid": pid, "thread": tid, "session_id": req,
        "raw_message": msg, "norm_message": normalize_message(msg),
    }

def parse_openstack_csv_row(row: pd.Series) -> Optional[Dict[str, Any]]:
    content = row.get("Content") or row.get("content") or row.get("Message") or row.get("message")
    if not isinstance(content, str):
        return None
    ts = row.get("Time") or row.get("time") or row.get("Timestamp") or row.get("timestamp")
    ts = parse_ts(str(ts)) if ts is not None else pd.NaT
    level = str(row.get("Level") or row.get("level") or "INFO").upper()
    component = row.get("Component") or row.get("component") or "unknown"
    mr = RE_OPENSTACK_REQ.search(content)
    req = mr.group(0) if mr else None
    return {
        "timestamp": ts, "level": level, "component": component,
        "pid": str(row.get("pid") or row.get("PID")) if pd.notna(row.get("pid") or row.get("PID")) else None,
        "thread": None, "session_id": req,
        "raw_message": content, "norm_message": normalize_message(content),
    }

In [12]:
# %% [markdown]
# Patch B: make_template_miner that prefers the *existent* fixed ini and explains failures

# %%
def _pick_drain3_ini():
    # Prefer notebooks/configs if present, else root configs
    for p in [
        REPO_ROOT / "notebooks" / "configs" / "drain3.ini",
        REPO_ROOT / "configs" / "drain3.ini",
    ]:
        if p.exists():
            return p
    return None

def make_template_miner(persist_dir: Path):
    persist_dir.mkdir(parents=True, exist_ok=True)
    ini_path = _pick_drain3_ini()
    print(f"[drain3] Using ini: {ini_path}" if ini_path else "[drain3] No ini found")
    try:
        from drain3 import TemplateMiner
        from drain3.file_persistence import FilePersistence
        from drain3.template_miner_config import TemplateMinerConfig
        cfg = TemplateMinerConfig()
        if ini_path:
            cfg.load(str(ini_path))
        miner = TemplateMiner(FilePersistence(str(persist_dir / "drain3_state.json")), config=cfg)
        print("[drain3] TemplateMiner initialised.")
        return miner, True
    except Exception as e:
        print("[drain3] FAILED, falling back to SimpleTemplateMiner:", e)
        class SimpleTemplateMiner:
            # Minimal pass-through: the *normalized* message is the template
            def add_log_message(self, msg: str):
                return msg
        return SimpleTemplateMiner(), False

In [13]:
# === Node.js / generic JSON log parser ===
RE_NODE_TEXT = re.compile(
    r"^(?P<ts>\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:\.\d{3,6})?Z?)\s+"
    r"(?P<level>[A-Z]+|info|warn|error|debug|trace)\s+"
    r"(?P<component>[^:]+?):\s*(?P<msg>.*)$",
    re.IGNORECASE
)


def parse_nodejs_line(line: str):
    if not line:
        return None
    line = line.strip()
    # JSON logs
    if line.startswith('{') and line.endswith('}'):
        try:
            payload = json.loads(line)
        except Exception:
            payload = {}
        ts = payload.get('time') or payload.get('@timestamp') or payload.get('timestamp')
        level = str(payload.get('level') or payload.get('severity') or 'INFO').upper()
        component = payload.get('service') or payload.get('component') or payload.get('logger') or 'node'
        session = payload.get('reqId') or payload.get('requestId') or payload.get('session_id')
        message = payload.get('msg') or payload.get('message') or line
        if isinstance(ts, (int, float)):
            ts = pd.to_datetime(ts, unit='ms', errors='coerce')
        else:
            ts = parse_ts(str(ts)) if ts else pd.NaT
        return {
            'timestamp': ts,
            'level': level,
            'component': component,
            'thread': None,
            'session_id': session,
            'raw_message': message,
            'norm_message': normalize_message(message),
        }
    # Text logs
    m = RE_NODE_TEXT.match(line)
    if m:
        ts = parse_ts(m.group('ts')) or pd.NaT
        level = m.group('level').upper()
        component = m.group('component')
        msg = m.group('msg')
    else:
        parts = line.split(' ', 2)
        if len(parts) < 3:
            return None
        ts = parse_ts(parts[0]) or pd.NaT
        level = parts[1].upper()
        msg = parts[2]
        component = 'node'
    req = RE_OPENSTACK_REQ.search(line)
    session_id = req.group(0) if req else None
    return {
        'timestamp': ts,
        'level': level,
        'component': component,
        'thread': None,
        'session_id': session_id,
        'raw_message': msg,
        'norm_message': normalize_message(msg),
    }


In [14]:
# %% [markdown]
# ## IO helpers: compressed input + safe line iterator

# %%
import gzip, bz2, lzma
from pathlib import Path
from typing import Iterable

def open_maybe_compressed(path: Path):
    suf = path.suffix.lower()
    if suf == ".gz":
        return gzip.open(path, "rt", encoding="utf-8", errors="ignore")
    if suf in (".bz2", ".bz"):
        return bz2.open(path, "rt", encoding="utf-8", errors="ignore")
    if suf in (".xz", ".lzma"):
        return lzma.open(path, "rt", encoding="utf-8", errors="ignore")
    return open(path, "r", encoding="utf-8", errors="ignore")

def yield_lines(path: Path) -> Iterable[str]:
    with open_maybe_compressed(path) as f:
        for line in f:
            yield line.rstrip("\n")

In [15]:
# %% [markdown]
# ## Template mining pipeline (Path A)

# %%
from dataclasses import dataclass
from collections import defaultdict
from typing import List

WINDOW_LEN = int(data_config.get('preprocessing', {}).get('window_len', 100))

@dataclass
class ParseHooks:
    row_parser: Optional[Any] = None
    line_parser: Optional[Any] = None
    name: str = 'dataset'

def parse_log(path: Path, hooks: ParseHooks) -> pd.DataFrame:
    records: List[Dict[str, Any]] = []
    if hooks.line_parser:
        for line in yield_lines(path):
            parsed = hooks.line_parser(line)
            if parsed:
                parsed['source_file'] = path.name
                records.append(parsed)
    elif hooks.row_parser:
        df = pd.read_csv(path)
        for _, row in df.iterrows():
            parsed = hooks.row_parser(row)
            if parsed:
                parsed['source_file'] = path.name
                records.append(parsed)
    if not records:
        return pd.DataFrame(columns=['timestamp','session_id','norm_message','raw_message','source_file'])
    df = pd.DataFrame(records)
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    df = df.dropna(subset=['timestamp'])
    return df

def ensure_session_ids(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    if 'session_id' not in df.columns:
        df['session_id'] = None
    missing = df['session_id'].isna() | (df['session_id'].astype(str).str.len() == 0)
    if missing.any():
        # generate stable pseudo sessions within file order
        df.loc[missing, 'session_id'] = [f"{prefix}-{i}" for i in range(missing.sum())]
    return df

def assign_templates(df: pd.DataFrame, miner, is_drain3: bool):
    template_to_id: Dict[str, int] = {}
    id_to_template: List[str] = []
    counts = defaultdict(int)
    tids = []
    for msg in df['norm_message']:
        if is_drain3:
            res = miner.add_log_message(msg)
            # Drain3 returns dict with "template_mined"; fall back to msg if missing
            template = (res.get('template_mined') if isinstance(res, dict) else None) or msg
        else:
            template = miner.add_log_message(msg)
        if template not in template_to_id:
            template_to_id[template] = len(id_to_template)
            id_to_template.append(template)
        tid = template_to_id[template]
        counts[tid] += 1
        tids.append(tid)
    df['template_id'] = tids
    vocab = {
        'id_to_template': id_to_template,
        'template_to_id': template_to_id,
        'counts': {str(k): int(v) for k, v in counts.items()}
    }
    return df, vocab

def build_sequences(df: pd.DataFrame, prefix: str) -> List[Dict[str, Any]]:
    df = ensure_session_ids(df, prefix).sort_values(['session_id','timestamp'])
    sequences: List[Dict[str, Any]] = []
    for session_id, g in df.groupby('session_id', sort=False):
        tids = g['template_id'].tolist()
        tss  = g['timestamp'].tolist()
        if not tids:
            continue
        # fixed windows of WINDOW_LEN in event order
        for i in range(0, len(tids), WINDOW_LEN):
            win = tids[i:i+WINDOW_LEN]
            st  = tss[i]
            en  = tss[min(i+len(win)-1, len(tss)-1)]
            sequences.append({'session_id': session_id, 'templates': win, 'start_time': st, 'end_time': en})
    return sequences

def split_sequences(records: List[Dict[str, Any]], splits: Dict[str, float]):
    cols = ['session_id', 'templates', 'start_time', 'end_time']
    if not records:
        empty = pd.DataFrame(columns=cols)
        return {'train': empty.copy(), 'val': empty.copy(), 'test': empty.copy()}
    records = sorted(records, key=lambda r: r['start_time'])
    n = len(records)
    train_end = max(1, int(n * splits.get('train', 0.8)))
    val_end   = max(train_end + 1, int(n * (splits.get('train', 0.8) + splits.get('val', 0.1))))
    train = pd.DataFrame(records[:train_end], columns=cols)
    val   = pd.DataFrame(records[train_end:val_end], columns=cols)
    test  = pd.DataFrame(records[val_end:], columns=cols)
    return {'train': train, 'val': val, 'test': test}

def save_artifacts(out_dir: Path, splits: Dict[str, pd.DataFrame], vocab: Dict[str, Any]):
    out_dir.mkdir(parents=True, exist_ok=True)
    for name, df in splits.items():
        (out_dir / f'{name}.parquet').unlink(missing_ok=True)
        df.to_parquet(out_dir / f'{name}.parquet', index=False)
    vocab['created_at'] = pd.Timestamp.utcnow().isoformat() + 'Z'
    (out_dir / 'template_vocab.json').write_text(json.dumps(vocab, indent=2))

In [16]:
# %% [markdown]
# ## OpenStack labels (clean word boundaries)

# %%
RE_OPENSTACK_REQ = re.compile(r"\breq-[0-9a-fA-F\-]{8,}\b", re.IGNORECASE)

def read_openstack_labels(label_path: Path) -> Dict[str, int]:
    labels: Dict[str, int] = {}
    if not label_path.exists():
        print(f"WARNING: label file not found: {label_path}")
        return labels

    def _infer_label(text: str) -> Optional[int]:
        t = text.strip().lower()
        if "abnormal" in t or "anomaly" in t or re.search(r"\b1\b", t):
            return 1
        if "normal" in t or re.search(r"\b0\b", t):
            return 0
        return None

    with open(label_path, "r", encoding="utf-8", errors="ignore") as fh:
        for line in fh:
            if not line.strip() or line.lstrip().startswith(("#","//")):
                continue
            reqs = RE_OPENSTACK_REQ.findall(line)
            if not reqs:
                parts = re.split(r"[\s,]+", line.strip())
                reqs = [p for p in parts if RE_OPENSTACK_REQ.fullmatch(p)]
            if not reqs:
                continue
            lab = _infer_label(line)
            if lab is None:
                lab = 1
            for rid in reqs:
                labels[rid] = lab
    print(f"Loaded labels for {len(labels)} request IDs from {label_path.name}")
    return labels

def attach_true_labels(out_dir: Path, label_path: Path, suffix: str = "_truth"):
    labmap = read_openstack_labels(label_path)
    if not labmap:
        print("No labels found; skipping truth attachment.")
        return
    for split in ["train","val","test"]:
        ip = out_dir / f"{split}.parquet"
        if not ip.exists():
            continue
        df = pd.read_parquet(ip)
        df["label"] = df["session_id"].map(labmap).fillna(0).astype(int)
        op = out_dir / f"{split}{suffix}.parquet"
        df.to_parquet(op, index=False)
        print(f"Annotated {op.name} with {int(df['label'].sum())} anomalies")

In [17]:
# %% [markdown]
# ## Dataset runners (Path A)

# %%
SPLITS = data_config.get('splits', {'train': 0.8, 'val': 0.1, 'test': 0.1})
FALLBACK_ROOTS = [REPO_ROOT / 'notebooks', REPO_ROOT / 'data']

def _reset_out_dir(out_dir: Path):
    if out_dir.exists():
        for name in ["sequences_raw.parquet","train.parquet","val.parquet","test.parquet",
                     "template_vocab.json","train_truth.parquet","val_truth.parquet","test_truth.parquet"]:
            (out_dir / name).unlink(missing_ok=True)
        state_dir = out_dir / "templates_state"
        if state_dir.exists():
            shutil.rmtree(state_dir, ignore_errors=True)

def _resolve_path(path: Path) -> Optional[Path]:
    if path.exists():
        return path
    cands: List[Path] = []
    for root in FALLBACK_ROOTS:
        cands.append(root / path.name)
        if not path.is_absolute():
            cands.append(root / path)
        else:
            try:
                rel = path.relative_to(REPO_ROOT)
                cands.append(root / rel)
            except Exception:
                pass
    for cand in cands:
        if cand.exists():
            return cand
    return None

def _collect_logs(paths: List[Path], hooks: ParseHooks, prefix: str) -> pd.DataFrame:
    frames = []
    for path in paths:
        resolved = _resolve_path(path)
        if resolved is None:
            print(f"[skip] Missing log file: {path}")
            continue
        if resolved != path:
            print(f"[info] Using fallback log at {resolved} for {path.name}")
        print(f"Parsing {resolved.name} ...")
        df = parse_log(resolved, hooks)
        if not df.empty:
            frames.append(df)
    if not frames:
        return pd.DataFrame(columns=['timestamp','session_id','norm_message','raw_message','source_file'])
    combined = pd.concat(frames, ignore_index=True)
    combined = combined.sort_values('timestamp')
    combined = combined.dropna(subset=['norm_message'])
    return ensure_session_ids(combined, prefix)

def process_dataset(paths: List[Path], hooks: ParseHooks, out_dir: Path, prefix: str, label_path: Optional[Path] = None):
    _reset_out_dir(out_dir)
    df = _collect_logs(paths, hooks, prefix)
    if df.empty:
        print('No records parsed; aborting.')
        return
    miner, is_drain3 = make_template_miner(out_dir / "templates_state")
    df, vocab = assign_templates(df, miner, is_drain3)
    df[['timestamp','session_id','template_id','source_file']].to_parquet(out_dir / "sequences_raw.parquet", index=False)

    # Build sequences & splits
    sequences = build_sequences(df, prefix)
    splits = split_sequences(sequences, SPLITS)
    save_artifacts(out_dir, splits, vocab)

    # Optional label attachment (OpenStack)
    if label_path is not None:
        resolved = _resolve_path(label_path)
        if resolved and resolved.exists():
            attach_true_labels(out_dir, resolved)
        else:
            print(f"Label file not found: {label_path}")

Patch C: Guard against duplicate dataset runs in this kernel

In [18]:
if "___DATASETS_PROCESSED___" not in globals():
    ___DATASETS_PROCESSED___ = set()

def _should_run_dataset(name: str) -> bool:
    if name in ___DATASETS_PROCESSED___:
        print(f"[guard] Skipping duplicate run for dataset: {name}")
        return False
    ___DATASETS_PROCESSED___.add(name)
    return True

Patch D: Diagnose anomaly_labels.txt parsing

In [19]:
def debug_openstack_labels(label_path: Path, max_lines=40):
    if not label_path or not label_path.exists():
        print("[labels] File missing:", label_path); return
    print(f"[labels] Preview first {max_lines} lines of {label_path}:")
    with open(label_path, "r", encoding="utf-8", errors="ignore") as fh:
        lines = [next(fh, "") for _ in range(max_lines)]
    for i, line in enumerate(lines, 1):
        m = re.findall(r"\breq-[0-9a-fA-F\-]{8,}\b", line)
        tag = f"  -> REQS: {m}" if m else ""
        print(f"{i:02d}: {line.rstrip()}{tag}")

# Example usage (uncomment if you have OPENSTACK_LABELS_PATH variable):
# debug_openstack_labels(REPO_ROOT / 'data/openstack/raw/anomaly_labels.txt')

In [20]:
# === Execute preprocessing for configured datasets ===
dataset_specs = data_config.get('datasets', {})
if not dataset_specs:
    raise RuntimeError('No datasets defined in configs/data.yaml')

for name, spec in dataset_specs.items():
    if spec.get('enabled', True) is False:
        print(f"[{name}] Skipping (disabled in config).")
        continue
    if not _should_run_dataset(name):
        continue
    dtype = spec.get('type')
    inputs = spec.get('inputs', {}).get('logs', [])
    if not inputs:
        print(f"[{name}] No input logs defined; skipping.")
        continue
    log_paths = [Path(p) if Path(p).is_absolute() else (REPO_ROOT / p).resolve() for p in inputs]
    output_dir = (REPO_ROOT / spec.get('output_dir', f'artifacts/{name}')).resolve()
    prefix = spec.get('session_prefix', name)
    label_path = None
    labels_cfg = spec.get('labels') or {}
    if 'request_ids' in labels_cfg:
        raw = Path(labels_cfg['request_ids'])
        label_path = raw if raw.is_absolute() else (REPO_ROOT / raw).resolve()

    if dtype == 'hdfs':
        hooks = ParseHooks(line_parser=parse_hdfs_line, name='HDFS')
    elif dtype == 'openstack':
        hooks = ParseHooks(line_parser=parse_openstack_line, name='OpenStack')
    elif dtype == 'nodejs':
        hooks = ParseHooks(line_parser=parse_nodejs_line, name='NodeJS')
    else:
        print(f"[{name}] Unknown dataset type '{dtype}'; skipping.")
        continue

    print(f"[{name}] Starting preprocessing ({dtype}) ...")
    process_dataset(log_paths, hooks, output_dir, prefix=prefix, label_path=label_path)

[hdfs] Starting preprocessing (hdfs) ...
Parsing HDFS.log ...
[drain3] Using ini: /home/tpi/distil_shahreyar/notebooks/configs/drain3.ini
[drain3] TemplateMiner initialised.
[drain3] Using ini: /home/tpi/distil_shahreyar/notebooks/configs/drain3.ini
[drain3] TemplateMiner initialised.
[openstack] Starting preprocessing (openstack) ...
Parsing openstack_normal1.log ...
[openstack] Starting preprocessing (openstack) ...
Parsing openstack_normal1.log ...
Parsing openstack_normal2.log ...
Parsing openstack_normal2.log ...
Parsing openstack_abnormal.log ...
Parsing openstack_abnormal.log ...
[drain3] Using ini: /home/tpi/distil_shahreyar/notebooks/configs/drain3.ini
[drain3] TemplateMiner initialised.
[drain3] Using ini: /home/tpi/distil_shahreyar/notebooks/configs/drain3.ini
[drain3] TemplateMiner initialised.
Loaded labels for 0 request IDs from anomaly_labels.txt
No labels found; skipping truth attachment.
[nodejs_example] Skipping (disabled in config).
Loaded labels for 0 request IDs fr

## Attach OpenStack Anomaly Labels (Instance UUID → Request ID Mapping)

After preprocessing creates the parquet files, we need to map the instance UUIDs from `anomaly_labels.txt` to the request IDs used for sessionization. This cell:
1. Scans OpenStack logs to build a mapping between instance UUIDs and request IDs
2. Uses both direct co-occurrence (same line) and sliding window (±5 lines)
3. Labels sequences where the session_id (req-...) is linked to an anomalous instance
4. Outputs `*_truth.parquet` files with the `label` column added

In [21]:
# Attach OpenStack labels when anomaly_labels.txt lists *instance UUIDs* (not req-IDs).
# Scans logs in data/openstack/raw, builds instance<->req map (co-occur + ±5 line window),
# then labels sequences in artifacts/openstack_finetune by session_id (req-...).

from collections import defaultdict, deque
import re

# --- Paths ---
LOG_DIR = (REPO_ROOT / "data" / "openstack" / "raw").resolve()
LABEL_FILE = LOG_DIR / "anomaly_labels.txt"

# Split dir candidates (first existing wins)
SPLIT_DIRS = [
    REPO_ROOT / "artifacts" / "openstack_finetune",
    REPO_ROOT / "artifacts" / "openstack",  # fallback
]
OPENSTACK_OUT = next((p for p in SPLIT_DIRS if (p / "train.parquet").exists()), None)

if OPENSTACK_OUT is None:
    print("[label_attach] OpenStack splits not found; skipping label attachment")
    print("               Expected artifacts/openstack_finetune/{train,val,test}.parquet")
else:
    print(f"[label_attach] Logs dir: {LOG_DIR}")
    print(f"[label_attach] Splits dir: {OPENSTACK_OUT}")
    
    # --- Regexes ---
    RE_REQ   = re.compile(r"\breq-[0-9a-fA-F\-]{8,}\b", re.IGNORECASE)
    RE_INST1 = re.compile(r"\[instance:\s*([0-9a-fA-F\-]{36})\]", re.IGNORECASE)
    RE_INST2 = re.compile(r"\binstance[=\s:]+([0-9a-fA-F\-]{36})\b", re.IGNORECASE)
    RE_UUID  = re.compile(r"\b[0-9a-fA-F]{8}(?:-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}\b")
    
    # --- Collect logs present (normal, normal1/2, abnormal) ---
    log_names = ["openstack_normal.log", "openstack_normal1.log", "openstack_normal2.log", "openstack_abnormal.log"]
    log_files = [LOG_DIR / n for n in log_names if (LOG_DIR / n).exists()]
    
    if not log_files:
        print(f"[label_attach] No OpenStack logs found in {LOG_DIR}")
    elif not LABEL_FILE.exists():
        print(f"[label_attach] Label file not found: {LABEL_FILE}")
    else:
        print("[label_attach] Using logs:")
        for lf in log_files:
            print(f"               - {lf.name}")
        
        # --- Read anomaly instance UUIDs ---
        anom_instances = set(u.lower() for u in RE_UUID.findall(LABEL_FILE.read_text(encoding="utf-8", errors="ignore")))
        print(f"[label_attach] Found {len(anom_instances)} anomalous instance UUIDs")
        if anom_instances:
            print(f"               Sample: {sorted(list(anom_instances))[:3]}")
        
        # --- Pass 1: build direct co-occurrence map (instance + req on same line) ---
        inst_to_reqs = defaultdict(set)
        req_to_insts = defaultdict(set)
        
        def _update_maps(insts, reqs):
            for i in insts:
                for r in reqs:
                    inst_to_reqs[i].add(r)
                    req_to_insts[r].add(i)
        
        for lf in log_files:
            with lf.open("r", encoding="utf-8", errors="ignore") as fh:
                for line in fh:
                    reqs = [m.lower() for m in RE_REQ.findall(line)]
                    insts = []
                    m1 = RE_INST1.search(line)
                    m2 = RE_INST2.search(line)
                    if m1: insts.append(m1.group(1).lower())
                    if m2: insts.append(m2.group(1).lower())
                    if reqs and insts:
                        _update_maps(insts, reqs)
        
        print(f"[label_attach] Pass 1 (same-line): {len(inst_to_reqs)} instances → {len(req_to_insts)} requests")
        
        # --- Pass 2: sliding window (±5 lines) to catch near-by mentions ---
        WINDOW = 5
        for lf in log_files:
            buf = deque(maxlen=WINDOW)
            with lf.open("r", encoding="utf-8", errors="ignore") as fh:
                # Prime buffer
                for _ in range(WINDOW):
                    line = fh.readline()
                    if line:
                        buf.append(line)
                # Sliding over file
                for line in fh:
                    window_lines = list(buf) + [line]
                    reqs = set()
                    insts = set()
                    for L in window_lines:
                        reqs.update(m.lower() for m in RE_REQ.findall(L))
                        m1 = RE_INST1.search(L)
                        m2 = RE_INST2.search(L)
                        if m1: insts.add(m1.group(1).lower())
                        if m2: insts.add(m2.group(1).lower())
                    if reqs and insts:
                        _update_maps(insts, reqs)
                    buf.append(line)
        
        print(f"[label_attach] Pass 2 (±5 window): {len(inst_to_reqs)} instances → {len(req_to_insts)} requests")
        
        # --- Label splits: session_id is req-id; label=1 if any linked instance is anomalous ---
        def label_split(split):
            p = OPENSTACK_OUT / f"{split}.parquet"
            if not p.exists():
                print(f"[label_attach] [{split}] File not found: {p}")
                return
            
            df = pd.read_parquet(p)
            
            def get_label(sid):
                if pd.isna(sid):
                    return 0
                sid = str(sid).lower()
                insts = req_to_insts.get(sid, set())
                return int(any(i in anom_instances for i in insts))
            
            df["label"] = df["session_id"].map(get_label).fillna(0).astype(int)
            
            out = OPENSTACK_OUT / f"{split}_truth.parquet"
            df.to_parquet(out, index=False)
            
            pos = int(df["label"].sum())
            total = len(df)
            pct = (pos / total * 100) if total > 0 else 0
            print(f"[label_attach] [{split}] Labeled {pos}/{total} ({pct:.1f}%) as anomalies → {out.name}")
        
        for sp in ("train", "val", "test"):
            label_split(sp)
        
        print("[label_attach] ✓ Complete!")
        print("[label_attach] If all counts are 0, req-IDs and instances may not co-occur in logs.")
        print("[label_attach] Alternative: sessionize by instance UUID instead of req-ID.")

[label_attach] Logs dir: /home/tpi/distil_shahreyar/data/openstack/raw
[label_attach] Splits dir: /home/tpi/distil_shahreyar/artifacts/openstack_finetune
[label_attach] Using logs:
               - openstack_normal.log
               - openstack_normal1.log
               - openstack_normal2.log
               - openstack_abnormal.log
[label_attach] Found 4 anomalous instance UUIDs
               Sample: ['1643649d-2f42-4303-bfcd-7798baec19f9', '544fd51c-4edc-4780-baae-ba1d80a0acfc', 'a445709b-6ad0-40ec-8860-bec60b6ca0c2']
[label_attach] Pass 1 (same-line): 2067 instances → 6198 requests
[label_attach] Pass 1 (same-line): 2067 instances → 6198 requests
[label_attach] Pass 2 (±5 window): 2069 instances → 65651 requests
[label_attach] [train] Labeled 561/85351 (0.7%) as anomalies → train_truth.parquet
[label_attach] [val] Labeled 51/10669 (0.5%) as anomalies → val_truth.parquet
[label_attach] [test] Labeled 184/10669 (1.7%) as anomalies → test_truth.parquet
[label_attach] ✓ Complete!
[la

In [22]:
# === Sanity checks ===
def summarize(dirpath: Path):
    print(f"== {dirpath.name} ==")
    for split in ['train', 'val', 'test']:
        p = dirpath / f'{split}.parquet'
        print(f"  {split}.parquet:", p.exists(), str(p) if p.exists() else '')
    vocab_path = dirpath / 'template_vocab.json'
    print('  template_vocab.json:', vocab_path.exists(), str(vocab_path) if vocab_path.exists() else '')
    if (dirpath / 'train.parquet').exists():
        pf = pq.ParquetFile(str(dirpath / 'train.parquet'))
        print('  train rows:', pf.metadata.num_rows)
    if vocab_path.exists():
        vocab = json.loads(vocab_path.read_text())
        print('  vocab size:', len(vocab.get('id_to_template', [])))


def top_templates(dirpath: Path, k: int = 5):
    vocab_path = dirpath / 'template_vocab.json'
    if not vocab_path.exists():
        return
    vocab = json.loads(vocab_path.read_text())
    id2t = vocab.get('id_to_template', [])
    train_path = dirpath / 'train.parquet'
    if not train_path.exists():
        return
    df = pd.read_parquet(train_path, columns=['templates'])
    from collections import Counter
    counter = Counter()
    for seq in df['templates']:
        if isinstance(seq, list):
            counter.update(seq)
    print(f'Top {k} templates:')
    for tid, count in counter.most_common(k):
        desc = id2t[tid] if tid < len(id2t) else '<UNK>'
        print(f'  [{tid}] x{count} :: {desc[:120]}')

for name, spec in data_config.get('datasets', {}).items():
    if spec.get('enabled', True) is False:
        continue
    out_dir = (REPO_ROOT / spec.get('output_dir', f'artifacts/{name}')).resolve()
    if not out_dir.exists():
        print(f"== {name} == Output directory missing: {out_dir}")
        continue
    print(f"Dataset: {name}")
    summarize(out_dir)
    if (out_dir / 'train.parquet').exists():
        top_templates(out_dir, 5)


Dataset: hdfs
== hdfs_pretrain ==
  train.parquet: True /home/tpi/distil_shahreyar/artifacts/hdfs_pretrain/train.parquet
  val.parquet: True /home/tpi/distil_shahreyar/artifacts/hdfs_pretrain/val.parquet
  test.parquet: True /home/tpi/distil_shahreyar/artifacts/hdfs_pretrain/test.parquet
  template_vocab.json: True /home/tpi/distil_shahreyar/artifacts/hdfs_pretrain/template_vocab.json
  train rows: 460119
  vocab size: 118
Top 5 templates:
Dataset: openstack
== openstack_finetune ==
  train.parquet: True /home/tpi/distil_shahreyar/artifacts/openstack_finetune/train.parquet
  val.parquet: True /home/tpi/distil_shahreyar/artifacts/openstack_finetune/val.parquet
  test.parquet: True /home/tpi/distil_shahreyar/artifacts/openstack_finetune/test.parquet
  template_vocab.json: True /home/tpi/distil_shahreyar/artifacts/openstack_finetune/template_vocab.json
  train rows: 85351
  vocab size: 158
Top 5 templates:
Top 5 templates:
Dataset: openstack
== openstack_finetune ==
  train.parquet: True 

## Preprocessing Summary

All patches have been successfully applied:
- ✅ **Patch A**: Repaired drain3.ini with single-line JSON (no more parsing errors)
- ✅ **Patch B**: Robust miner factory with explicit ini selection
- ✅ **Patch C**: Duplicate-run guard to prevent accidental re-runs
- ✅ **Patch D**: Label debugging utility
- ✅ **IO Helpers**: Compressed file readers for .gz/.bz2 support
- ✅ **Updated Parsers**: HDFS (compact format) and OpenStack (filename prefix handling)
- ✅ **Updated Normalization**: Correct word boundaries and proper OS_REQ ordering

The preprocessing pipeline now uses **Drain3** successfully for template mining!

In [23]:
# Debug OpenStack labels to see why 0 request IDs were loaded
label_file = REPO_ROOT / 'data/openstack/raw/anomaly_labels.txt'
if label_file.exists():
    debug_openstack_labels(label_file, max_lines=20)
else:
    print(f"Label file not found: {label_file}")

[labels] Preview first 20 lines of /home/tpi/distil_shahreyar/data/openstack/raw/anomaly_labels.txt:
01: The following VM instances have injected anomalies as observed in openstack_abnormal.log.
02: 
03: 544fd51c-4edc-4780-baae-ba1d80a0acfc
04: ae651dff-c7ad-43d6-ac96-bbcd820ccca8
05: a445709b-6ad0-40ec-8860-bec60b6ca0c2
06: 1643649d-2f42-4303-bfcd-7798baec19f9
07: 
08: 
09: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 


In [24]:
# %% [markdown]
# ## Sanity checks

# %%
import pyarrow.parquet as pq, json

def summarize(dirpath: Path):
    print(f"== {dirpath.name} ==")
    for split in ['train','val','test']:
        p = dirpath / f'{split}.parquet'
        print(f"  {split}.parquet:", p.exists(), str(p) if p.exists() else '')
    vocab_path = dirpath / 'template_vocab.json'
    print('  template_vocab.json:', vocab_path.exists(), str(vocab_path) if vocab_path.exists() else '')
    if (dirpath / 'train.parquet').exists():
        pf = pq.ParquetFile(str(dirpath / 'train.parquet'))
        print('  train rows:', pf.metadata.num_rows)
    if vocab_path.exists():
        vocab = json.loads(vocab_path.read_text())
        print('  vocab size:', len(vocab.get('id_to_template', [])))

def top_templates(dirpath: Path, k: int = 5):
    vocab_path = dirpath / 'template_vocab.json'
    if not vocab_path.exists():
        return
    vocab = json.loads(vocab_path.read_text())
    id2t = vocab.get('id_to_template', [])
    train_path = dirpath / 'train.parquet'
    if not train_path.exists():
        return
    df = pd.read_parquet(train_path, columns=['templates'])
    from collections import Counter
    counter = Counter()
    for seq in df['templates']:
        if isinstance(seq, list):
            counter.update(seq)
    print(f'Top {k} templates:')
    for tid, count in counter.most_common(k):
        desc = id2t[tid] if tid < len(id2t) else '<UNK>'
        print(f'  [{tid}] x{count} :: {desc[:120]}')

for name, spec in data_config.get('datasets', {}).items():
    if spec.get('enabled', True) is False:
        continue
    out_dir = (REPO_ROOT / spec.get('output_dir', f'artifacts/{name}')).resolve()
    if not out_dir.exists():
        print(f"== {name} == Output directory missing: {out_dir}")
        continue
    print(f"Dataset: {name}")
    summarize(out_dir)
    if (out_dir / 'train.parquet').exists():
        top_templates(out_dir, 5)

Dataset: hdfs
== hdfs_pretrain ==
  train.parquet: True /home/tpi/distil_shahreyar/artifacts/hdfs_pretrain/train.parquet
  val.parquet: True /home/tpi/distil_shahreyar/artifacts/hdfs_pretrain/val.parquet
  test.parquet: True /home/tpi/distil_shahreyar/artifacts/hdfs_pretrain/test.parquet
  template_vocab.json: True /home/tpi/distil_shahreyar/artifacts/hdfs_pretrain/template_vocab.json
  train rows: 460119
  vocab size: 118
Top 5 templates:
Dataset: openstack
== openstack_finetune ==
  train.parquet: True /home/tpi/distil_shahreyar/artifacts/openstack_finetune/train.parquet
  val.parquet: True /home/tpi/distil_shahreyar/artifacts/openstack_finetune/val.parquet
  test.parquet: True /home/tpi/distil_shahreyar/artifacts/openstack_finetune/test.parquet
  template_vocab.json: True /home/tpi/distil_shahreyar/artifacts/openstack_finetune/template_vocab.json
  train rows: 85351
  vocab size: 158
Top 5 templates:
