Demo: log lines -> (1) unsupervised clustering -> (2) supervised classification

Scenario:
- You have system logs and you *suspect* there are ~5 recurring "types of problems"
  (traits), but you don't know how to hand-engineer features.
- Step 1: build representations automatically from raw text
- Step 2: cluster (unsupervised) to discover groups
- Step 3: inspect a few examples from each cluster to name them
- Step 4: train a classifier (small neural net) to assign new lines to one of the discovered clusters

Files generated by the notebook version:
- synthetic_system_logs.log
- cluster_preview.json
- synthetic_system_logs_truth.csv (teacher-only; not needed in production)

Requirements:
pip install scikit-learn numpy

In [12]:
from __future__ import annotations

from collections import defaultdict
from pathlib import Path
import json
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import silhouette_score, classification_report

In [13]:
def load_lines(path: Path) -> list[str]:
    """Return the lines of the UTF-8 text file at *path*, without line terminators."""
    with path.open(encoding="utf-8") as handle:
        content = handle.read()
    return content.splitlines()

In [17]:
def main(log_path: Path = Path("synthetic_system_logs.log"), k: int = 5) -> None:
    """Cluster raw log lines, then train a classifier that predicts the cluster id.

    Pipeline:
      1. Vectorize each log line with character n-gram TF-IDF.
      2. Cluster the vectors with MiniBatchKMeans and print a sampled silhouette score.
      3. Print up to five example lines per cluster and write them to
         ``cluster_preview.json`` so a human can name the clusters.
      4. Train an MLP on an 80/20 stratified split to predict the cluster id,
         and print a classification report for the held-out part.
      5. Classify one hard-coded example line.
      6. Print the 15 highest-weighted n-grams of each cluster centroid.

    Args:
        log_path: Text file with one log line per row.
        k: Number of clusters to discover.

    Raises:
        ValueError: If ``k`` is smaller than 2 or the file has no non-blank lines.
    """
    if k < 2:
        raise ValueError(f"k must be at least 2 to form clusters, got {k}")

    # Blank / whitespace-only lines contain no character n-grams, so they would
    # only add uninformative rows to the matrix; drop them up front.
    lines = [line for line in load_lines(log_path) if line.strip()]
    if not lines:
        raise ValueError(f"No non-empty log lines found in {log_path}")

    # 1) "Unknown feature extraction" from raw text
    # Character n-grams work well for logs: they catch tokens like "code=504", "db timeout",
    # numeric patterns, and key=value structures without hand-coding parsers.
    vectorizer = TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3, 5),
        min_df=2,
        max_df=0.95,
    )
    X = vectorizer.fit_transform(lines)

    # 2) Unsupervised clustering
    km = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=1024, n_init="auto")
    clusters = km.fit_predict(X)

    # The silhouette is computed on a random sample to keep the demo fast.
    sil = silhouette_score(X, clusters, sample_size=600, random_state=42)
    print(f"Silhouette (sampled): {sil:.4f}  (higher is better; logs are noisy so don't expect miracles)")

    # 3) Inspect samples per cluster (this is where a human gives meaning)
    examples = defaultdict(list)
    for line, cid in zip(lines, clusters):
        key = str(cid)  # string keys so the mapping can be dumped to JSON below
        if len(examples[key]) < 5:
            examples[key].append(line)

    print("\n--- Cluster samples (for naming) ---")
    for cid in range(k):
        print(f"\n[Cluster {cid}]")
        for ex in examples[str(cid)]:
            print("  ", ex)

    Path("cluster_preview.json").write_text(
        json.dumps(examples, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    # 4) Supervised model: learn to predict cluster id for NEW incoming lines
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, clusters, test_size=0.2, random_state=42, stratify=clusters
    )

    clf = MLPClassifier(
        hidden_layer_sizes=(128, 32),
        activation="relu",
        solver="adam",
        batch_size=128,
        max_iter=12,
        random_state=42,
        early_stopping=True,
        n_iter_no_change=2,
        validation_fraction=0.15,
    )
    clf.fit(X_tr, y_tr)
    pred = clf.predict(X_te)

    print("\n--- Classifier report (predicting cluster id) ---")
    print(classification_report(y_te, pred, digits=3))

    # 5) Classify a new line (example)
    new_line = "2026-02-25T12:13:01Z ERROR service=gateway host=srv-02 region=eu-west-1 upstream timeout service=payments route=/v1/pay latency_ms=1800 code=504 trace=deadbeef proto=http/2"
    # Reuse the fitted vectorizer so the new line lands in the same feature space.
    new_vec = vectorizer.transform([new_line])
    new_cluster = int(clf.predict(new_vec)[0])
    print("\nNew line predicted cluster:", new_cluster)
    print("Line:", new_line)

    # 6) Highest-weighted n-grams of each k-means centroid (a hint for naming clusters)
    terms = np.array(vectorizer.get_feature_names_out())
    for cid in range(k):
        top = np.argsort(km.cluster_centers_[cid])[::-1][:15]
        print(cid, terms[top])




In [18]:
# Run the demo when executed as a script or notebook cell, but not on import.
if __name__ == "__main__":
    main()

Silhouette (sampled): 0.0744  (higher is better; logs are noisy so don't expect miracles)

--- Cluster samples (for naming) ---

[Cluster 0]
   2026-02-25T08:21:39Z ERROR service=profile host=srv-10 region=us-east-1 p99 latency breached route=/v1/cart p99_ms=822 threshold_ms=1200 region=us-east-1 proto=http/1.1
   2026-02-25T08:56:44Z ERROR service=profile host=srv-05 region=eu-west-1 p99 latency breached route=/v1/login p99_ms=948 threshold_ms=800 region=eu-west-1 proto=http/2
   2026-02-25T08:17:23Z WARN service=search host=srv-11 region=eu-west-1 cache latency high backend=redis latency_ms=168 cmd=MGET trace=52eb9d8e96cf37cb990c801f shard=redis-2
   2026-02-25T08:59:41Z ERROR service=auth host=srv-07 region=us-east-1 upstream timeout service=orders route=/v1/order latency_ms=901 code=504 trace=0a3a4419f4fe020864d39793 proto=http/1.1
   2026-02-25T08:20:20Z ERROR service=payments host=srv-11 region=eu-west-1 p99 latency breached route=/v1/login p99_ms=643 threshold_ms=1200 region=eu-