# BERTopic for log clustering (human-label friendly)

This notebook shows a **BERTopic-style** workflow for logs:

1. Load (or generate) logs  
2. Normalize noisy tokens (timestamps, ids, IPs, numbers)  
3. Run **BERTopic** (sentence embeddings → UMAP → HDBSCAN → c-TF-IDF)  
4. Inspect **topic keywords + representative log lines** so clusters are easy to label

> Note: `bertopic` may not be installed in your environment. In that case the notebook automatically falls back to "MiniTopic" (KMeans on TF-IDF vectors), which still provides **clusters + top keywords + examples** using only `scikit-learn`.


In [None]:
# If you're running this locally or on Colab, uncomment the next line and run this cell once:
# %pip install -U bertopic sentence-transformers hdbscan umap-learn


In [None]:
from __future__ import annotations

import re
import random
from datetime import datetime, timedelta
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans


In [None]:
# Stopwords for normalized logs. The placeholders inserted by normalization
# (<NUM>, <TS>, <HEX>) tend to surface as the tokens 'num', 'ts', 'hex';
# the remaining entries are field names that appear on every log line.
LOG_STOPWORDS = {
    "host",
    "latency_ms",
    "trace",
    "hex",
    "ts",
    "num",
}

## 0) Load or generate log file

In [None]:
def gen_synthetic_logs(path: Path, n: int = 6000, seed: int = 42) -> None:
    """Write ``n`` synthetic ``key=value`` service log lines to ``path``.

    Each line carries an ISO timestamp suffixed with ``Z``, a level derived
    from the status code, and ``service``/``host``/``region``/``route``/
    ``latency_ms``/``code``/``trace``/``proto`` fields, followed by optional
    detail text for 502/503/504, 429 and 401/403 responses. Lines are joined
    with ``\\n`` (no trailing newline) and written as UTF-8.

    Args:
        path: Destination file (``Path`` or ``str``); overwritten if present.
        n: Number of log lines to generate.
        seed: Seed for the generator; equal seeds produce identical files.
    """
    # A private generator yields the same sequence that random.seed(seed)
    # would, without resetting the module-level RNG other cells may rely on.
    rng = random.Random(seed)
    start = datetime(2026, 2, 25, 10, 0, 0)

    services = ["gateway", "payments", "auth", "search", "profile"]
    regions = ["eu-west-1", "eu-central-1", "us-east-1"]
    routes = ["/v1/pay", "/v1/login", "/v1/search", "/v1/profile", "/v1/chargeback"]
    protos = ["http/1.1", "http/2"]
    gateway_errors = [" upstream timeout", " bad gateway", " service unavailable"]

    # Weighted status-code mix (codes, weights) for the services that have one.
    weighted_codes = {
        "payments": ([200, 201, 429, 502, 504], [55, 15, 10, 10, 10]),
        "auth": ([200, 401, 403, 429], [65, 20, 10, 5]),
        "gateway": ([200, 502, 503, 504], [70, 10, 10, 10]),
    }
    # Every other service draws uniformly from this broader set.
    other_codes = [200, 201, 204, 400, 401, 403, 404, 429, 500, 502, 503, 504]

    def pick_level(code: int) -> str:
        """Map an HTTP status code to a log level."""
        if code >= 500:
            return "ERROR"
        if code >= 400:
            return "WARN"
        return "INFO"

    lines = []
    t = start
    for _ in range(n):
        # The order of the draws below fixes the output for a given seed.
        t = t + timedelta(seconds=rng.randint(1, 7))
        service = rng.choice(services)
        region = rng.choice(regions)
        route = rng.choice(routes)

        if service in weighted_codes:
            codes, weights = weighted_codes[service]
            code = rng.choices(codes, weights=weights)[0]
        else:
            code = rng.choice(other_codes)

        level = pick_level(code)
        latency = max(5, int(rng.gauss(120, 60)))
        host = f"srv-{rng.randint(1, 8):02d}"
        trace = f"{rng.getrandbits(64):016x}"
        proto = rng.choice(protos)

        extra = ""
        if code in (502, 503, 504):
            extra = rng.choice(gateway_errors)
            # Upstream failures are slow: force latency into the 800-2400 ms band at least.
            latency = max(latency, rng.randint(800, 2400))
        if code == 429:
            extra += " rate_limited=true burst=ip"
        if code in (401, 403):
            extra += " user=anonymous token=missing"

        lines.append(
            f"{t.isoformat()}Z {level} service={service} host={host} region={region} "
            f"route={route} latency_ms={latency} code={code} trace={trace} proto={proto}{extra}"
        )

    Path(path).write_text("\n".join(lines), encoding="utf-8")


log_path = Path("synthetic_system_logs.log")

# If you already have your own file, just put it next to this notebook and skip generation.
if not log_path.exists():
    gen_synthetic_logs(log_path, n=6000, seed=42)
    print("Generated:", log_path)

# Blank lines carry no content and would otherwise become empty documents downstream.
lines = [ln for ln in log_path.read_text(encoding="utf-8").splitlines() if ln.strip()]
if not lines:
    # Fail with a clear message instead of an IndexError on lines[0] below.
    raise ValueError(f"No log lines found in {log_path}")
print("Loaded lines:", len(lines))
print("Example:", lines[0])


## 1) Normalize logs for better topics

In [None]:
# Patterns for the volatile parts of a log line that should not influence clustering.
TS_RE = re.compile(r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z\b")
HEX_RE = re.compile(r"\b[0-9a-f]{12,}\b", re.IGNORECASE)
IP_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
NUM_RE = re.compile(r"\b\d+\b")


def normalize_log(s: str) -> str:
    """Return ``s`` with timestamps, IPs, long hex ids and numbers replaced by placeholders.

    The substitutions produce ``<TS>``, ``<IP>``, ``<HEX>`` and ``<NUM>``.
    """
    # Order matters: the bare-number rule runs last so it cannot split
    # timestamps or IP addresses before their own rules have seen them.
    for pattern, placeholder in (
        (TS_RE, "<TS>"),
        (IP_RE, "<IP>"),
        (HEX_RE, "<HEX>"),
        (NUM_RE, "<NUM>"),
    ):
        s = pattern.sub(placeholder, s)
    return s


# Normalize every raw line; positions stay aligned with `lines`.
norm_lines = list(map(normalize_log, lines))
print("Before:", lines[0])
print("After :", norm_lines[0])


## 2) BERTopic (preferred) + fallback

In [None]:
def show_cluster_report(topic_ids: np.ndarray, docs: list[str], top_words: dict[int, list[str]], n_examples: int = 5) -> None:
    """Print a per-topic summary: size, keywords and a few example documents.

    Args:
        topic_ids: Topic/cluster id for each document. Any iterable of
            integer-like values works (NumPy array or plain list).
        docs: Documents aligned position-by-position with ``topic_ids``.
        top_words: Keywords per topic id; topics without an entry are
            printed without a keywords line.
        n_examples: Maximum number of example documents shown per topic,
            taken in input order.
    """
    # Count in plain Python so a list of ids works as well as an array;
    # an elementwise `==` comparison is only available on NumPy arrays.
    counts = defaultdict(int)
    for tid in topic_ids:
        counts[int(tid)] += 1

    examples = defaultdict(list)
    for doc, tid in zip(docs, topic_ids):
        bucket = examples[int(tid)]
        if len(bucket) < n_examples:
            bucket.append(doc)

    for tid in sorted(counts):
        print("\n" + "=" * 80)
        print(f"TOPIC/CLUSTER {tid} | count={counts[tid]}")
        if tid in top_words:
            print("keywords:", ", ".join(top_words[tid]))
        for ex in examples[tid]:
            # Truncate long lines so the report stays scannable.
            print(" -", ex[:220])


In [None]:
# Probe for the optional BERTopic stack; any import failure selects the
# scikit-learn-only fallback below.
use_bertopic = False
try:
    from bertopic import BERTopic
    from sentence_transformers import SentenceTransformer
    use_bertopic = True
except Exception as e:
    print("BERTopic not available here, using fallback. Reason:", e)

# scikit-learn validates `stop_words` as 'english', a list, or None, so the
# set is converted; sorting keeps the list deterministic.
log_stop_words = sorted(LOG_STOPWORDS)

if use_bertopic:
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    topic_model = BERTopic(
        embedding_model=embedding_model,
        # Keep placeholder tokens (num/ts/hex/...) out of the topic keywords,
        # consistent with the fallback path.
        vectorizer_model=CountVectorizer(stop_words=log_stop_words),
        language="english",
        verbose=True,
        nr_topics="auto",
        min_topic_size=25,
    )

    topics, _ = topic_model.fit_transform(norm_lines)

    # top keywords per topic (-1 is BERTopic's outlier bucket and has no keywords here)
    top_words = {}
    for tid in sorted(set(topics)):
        if tid == -1:
            continue
        words_scores = topic_model.get_topic(tid) or []
        top_words[int(tid)] = [w for w, _ in words_scores[:10]]

    # Report on the raw lines so the examples are readable; clustering used the normalized ones.
    show_cluster_report(np.array(topics), lines, top_words, n_examples=5)

else:
    # MiniTopic fallback: KMeans + class-based TF-IDF keywords
    n_clusters = 6
    # Keeps key=value pairs, routes and dotted/hyphenated names together as single tokens.
    token_pattern = r"(?u)\b[\w=/\.-]+\b"

    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        stop_words=log_stop_words,
        token_pattern=token_pattern,
    )
    X = tfidf.fit_transform(norm_lines)
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_ids = km.fit_predict(X)

    # Concatenate each cluster's lines into one "class document" in a single pass.
    grouped = [[] for _ in range(n_clusters)]
    for doc, c in zip(norm_lines, cluster_ids):
        grouped[c].append(doc)
    clusters_as_docs = [" ".join(docs) for docs in grouped]

    cv = CountVectorizer(
        ngram_range=(1, 2),
        # Document frequency is counted over the n_clusters class documents here,
        # so min_df must stay at 1: a higher value would discard terms unique to
        # a single cluster, which are the most distinctive keywords.
        min_df=1,
        stop_words=log_stop_words,
        token_pattern=token_pattern,
    )
    counts = cv.fit_transform(clusters_as_docs).tocsr()
    vocab = np.array(cv.get_feature_names_out())

    # Class-based TF-IDF: term frequency within each cluster document, weighted by
    # a smoothed inverse of the number of clusters containing the term.
    tf = counts.multiply(1.0 / (counts.sum(axis=1) + 1e-9))
    df = np.asarray((counts > 0).sum(axis=0)).ravel()
    idf = np.log((1 + n_clusters) / (1 + df)) + 1
    ctfidf = tf.multiply(idf)

    top_words = {}
    for c in range(n_clusters):
        row = ctfidf.getrow(c).toarray().ravel()
        top_idx = row.argsort()[::-1][:12]
        # Drop zero-weight terms, then keep at most ten keywords.
        top_words[c] = [vocab[i] for i in top_idx if row[i] > 0][:10]

    show_cluster_report(np.array(cluster_ids), lines, top_words, n_examples=5)


## Notes

For the most labelable topics:
- keep normalization ON
- increase `min_topic_size` (BERTopic)
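- consider adding domain stopwords to `LOG_STOPWORDS` (e.g., `proto`) if they dominate the keywords; spell each entry exactly as it appears in the printed keywords, because raw values such as `http/1.1` are rewritten by normalization.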
