# Logs ➜ (1) Unsupervised clustering ➜ (2) Supervised classification (TensorFlow)

This notebook is a cell-based version of the provided demo script. It:
1. Loads log lines.
2. Vectorizes them with `TextVectorization` (TF-IDF).
3. Clusters with a minimal KMeans implemented in pure TensorFlow.
4. Trains an MLP classifier to predict the discovered cluster id for new lines.

> Tip: Keep epochs low for a live demo, then crank them up for better accuracy.


In [None]:
# If needed (e.g., in a fresh environment), uncomment and run:
# !pip install -U tensorflow numpy


In [None]:
from __future__ import annotations

from collections import defaultdict
from pathlib import Path

import numpy as np
import tensorflow as tf


## Helper functions

In [None]:
def load_lines(path: Path) -> list[str]:
    """Return the lines of the UTF-8 text file at ``path``, without line endings."""
    with path.open(encoding="utf-8") as handle:
        contents = handle.read()
    return contents.splitlines()


def _nearest_centroid(x: tf.Tensor, x_sq: tf.Tensor, centroids: tf.Tensor) -> tf.Tensor:
    """Return, for every row of ``x``, the int32 index of its closest centroid."""
    # ||x - c||^2 = ||x||^2 + ||c||^2 - 2 * x.c, evaluated for all pairs at once.
    c_sq = tf.reduce_sum(tf.square(centroids), axis=1, keepdims=True)    # [K, 1]
    cross = tf.matmul(x, centroids, transpose_b=True)                    # [N, K]
    dists = x_sq + tf.transpose(c_sq) - 2.0 * cross                      # [N, K]
    return tf.cast(tf.argmin(dists, axis=1), tf.int32)                   # [N]


def tf_kmeans(x: tf.Tensor, k: int, iters: int = 20, seed: int = 42) -> tuple[tf.Tensor, tf.Tensor]:
    """Minimal KMeans in TensorFlow.

    Centroids are initialised by sampling ``k`` distinct rows of ``x``; each
    iteration then assigns every row to its nearest centroid (squared
    Euclidean distance) and moves each centroid to the mean of its rows.
    A cluster that receives no rows keeps its previous centroid.

    Parameters
    ----------
    x:
        Float tensor [N, D]
    k:
        Number of clusters (``1 <= k <= N``)
    iters:
        Number of assign/update iterations. ``0`` returns the initial
        centroids together with the labels they induce.
    seed:
        Random seed. NOTE: this is applied with ``tf.random.set_seed`` and
        therefore also resets TensorFlow's *global* seed for the caller.

    Returns
    -------
    labels:
        int32 tensor [N], the nearest-centroid assignment with respect to
        the returned ``centroids``
    centroids:
        float tensor [K, D] (same dtype as ``x``)

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or larger than the number of rows when
        that number is statically known.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    x = tf.convert_to_tensor(x)
    n_static = x.shape[0]
    if n_static is not None and k > n_static:
        raise ValueError(f"k={k} exceeds the number of points ({n_static})")

    tf.random.set_seed(seed)
    n = tf.shape(x)[0]

    # Init centroids by sampling points
    idx = tf.random.shuffle(tf.range(n))[:k]
    centroids = tf.gather(x, idx)

    # Row norms do not change between iterations, so compute them once.
    x_sq = tf.reduce_sum(tf.square(x), axis=1, keepdims=True)            # [N, 1]

    for _ in range(iters):
        labels = _nearest_centroid(x, x_sq, centroids)

        # Per-cluster sums and member counts in one pass each.
        sums = tf.math.unsorted_segment_sum(x, labels, num_segments=k)   # [K, D]
        counts = tf.math.unsorted_segment_sum(
            tf.ones_like(labels, dtype=x.dtype), labels, num_segments=k
        )[:, tf.newaxis]                                                 # [K, 1]

        # Guard the division for empty clusters, then keep their old centroid.
        means = sums / tf.maximum(counts, 1.0)
        centroids = tf.where(counts > 0, means, centroids)

    # Final assignment so the labels match the returned centroids; this also
    # defines `labels` when iters == 0.
    labels = _nearest_centroid(x, x_sq, centroids)
    return labels, centroids


## 0) Data: load your log file

The original script expects a file named:

- `synthetic_system_logs.log`

Place it next to this notebook (same directory), or change the path below.


In [None]:
# Location of the input log file, relative to the current working directory.
log_path = Path("synthetic_system_logs.log")

# Load the file, or fail early with an actionable message when it is absent.
if log_path.exists():
    lines = load_lines(log_path)
else:
    raise FileNotFoundError(
        f"Missing {log_path}. Put your log file next to this notebook "
        "or update `log_path`."
    )

print("Loaded lines:", len(lines))
# Show the first line as a quick sanity check; "<empty>" if the file has none.
print("First line:", next(iter(lines), "<empty>"))


### Optional: quick synthetic log generator (only if you need a demo file)

If you don't have logs handy, you can generate a small synthetic file.
Uncomment the call at the bottom of the cell below, run it once, then re-run the data-loading cell above.


In [None]:
import random
from datetime import datetime, timedelta

def gen_synthetic_logs(path: Path, n: int = 5000, seed: int = 42) -> None:
    """Write ``n`` synthetic, clusterable log lines to ``path`` (UTF-8).

    Each line has the form::

        <timestamp>Z <LEVEL> service=.. host=.. region=.. route=.. latency_ms=.. code=.. trace=.. proto=..[ extra]

    Status codes are biased per service and some codes append fixed textual
    motifs, so that the lines form recognisable groups.

    Parameters
    ----------
    path:
        Destination file; it is overwritten. Lines are joined with ``"\\n"``
        and no trailing newline is written.
    n:
        Number of lines to generate. ``0`` produces an empty file.
    seed:
        Seed for the generator. The same seed always yields the same file.
        A private ``random.Random`` instance is used, so the process-wide
        ``random`` state is not modified.
    """
    rng = random.Random(seed)
    start = datetime(2026, 2, 25, 10, 0, 0)

    services = ["gateway", "payments", "auth", "search", "profile"]
    regions = ["eu-west-1", "eu-central-1", "us-east-1"]
    routes = ["/v1/pay", "/v1/login", "/v1/search", "/v1/profile", "/v1/chargeback"]
    codes = [200, 201, 204, 400, 401, 403, 404, 429, 500, 502, 503, 504]
    protos = ["http/1.1", "http/2"]
    upstream_errors = [" upstream timeout", " bad gateway", " service unavailable"]

    # Bias codes by service to create clusterable patterns:
    # service -> (candidate codes, weights). Other services draw uniformly.
    biased_codes = {
        "payments": ([200, 201, 429, 502, 504], [55, 15, 10, 10, 10]),
        "auth": ([200, 401, 403, 429], [65, 20, 10, 5]),
        "gateway": ([200, 502, 503, 504], [70, 10, 10, 10]),
    }

    def pick_level(code: int) -> str:
        """Map an HTTP status code to a log level."""
        if code >= 500:
            return "ERROR"
        if code >= 400:
            return "WARN"
        return "INFO"

    lines = []
    t = start
    # NOTE: the order of the rng calls below determines the generated content.
    for _ in range(n):
        t = t + timedelta(seconds=rng.randint(1, 7))
        service = rng.choice(services)
        region = rng.choice(regions)
        route = rng.choice(routes)

        if service in biased_codes:
            population, weights = biased_codes[service]
            code = rng.choices(population, weights=weights)[0]
        else:
            code = rng.choice(codes)

        level = pick_level(code)
        latency = max(5, int(rng.gauss(120, 60)))
        host = f"srv-{rng.randint(1, 8):02d}"
        trace = f"{rng.getrandbits(64):016x}"
        proto = rng.choice(protos)

        # Add a few recurring textual motifs to strengthen clustering
        extra = ""
        if code in (502, 503, 504):
            extra = rng.choice(upstream_errors)
            # Upstream failures are also made visibly slow.
            latency = max(latency, rng.randint(800, 2400))
        if code == 429:
            extra += " rate_limited=true burst=ip"
        if code in (401, 403):
            extra += " user=anonymous token=missing"

        lines.append(
            f"{t.isoformat()}Z {level} service={service} host={host} region={region} "
            f"route={route} latency_ms={latency} code={code} trace={trace} proto={proto}{extra}"
        )

    path.write_text("\n".join(lines), encoding="utf-8")

# Uncomment to generate:
# gen_synthetic_logs(Path("synthetic_system_logs.log"), n=6000)
# print("Generated synthetic_system_logs.log")


## 1) Vectorize logs ➜ TF-IDF

We use `TextVectorization(output_mode="tf-idf")` with character-level n-grams (`split="character"`, `ngrams=5`, i.e. all n-grams of up to 5 characters) to avoid hand-written parsers.


In [None]:
# Upper bound on the vocabulary size, i.e. on the feature dimension V.
max_tokens = 8000
vec = tf.keras.layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="tf-idf",
    split="character",
    ngrams=5,
    standardize=None,   # keep the raw text: no lower-casing or punctuation stripping
)

# Fit the layer on the log lines, fed in batches of 256.
text_ds = tf.data.Dataset.from_tensor_slices(lines).batch(256)
vec.adapt(text_ds)

# Vectorize every line -> [N, V] dense, then L2-normalize each row
# (optional; often helps KMeans).
X = tf.linalg.l2_normalize(
    tf.cast(vec(tf.constant(lines)), tf.float32),
    axis=1,
)

print("X shape:", X.shape)


## 2) Unsupervised clustering (KMeans in TensorFlow)

In [None]:
# Number of clusters to discover.
k = 5
labels, centroids = tf_kmeans(X, k=k, iters=25, seed=42)
# NumPy copy of the assignments for the plain-Python cells below.
labels_np = labels.numpy()

# Report how many lines landed in each cluster.
print("Cluster counts:")
for cid in range(k):
    print(cid, int(np.count_nonzero(labels_np == cid)))


## 3) Print a few examples per cluster (for naming)

In [None]:
# Collect at most five sample lines per cluster id, in file order.
examples = defaultdict(list)
for line, cid in zip(lines, labels_np):
    cid = int(cid)
    if len(examples[cid]) >= 5:
        continue
    examples[cid].append(line)

print("\n--- Cluster samples (for naming) ---")
for cid in range(k):
    print(f"\n[Cluster {cid}]")
    for ex in examples[cid]:
        print(f"   {ex}")


## 4) Supervised model: MLP predicts cluster id

We turn the cluster assignments into labels, then train a small MLP classifier.


In [None]:
# One-hot encode the cluster ids so they can be used with categorical cross-entropy.
y = tf.keras.utils.to_categorical(labels_np, num_classes=k)

# Shuffled 80/20 train/test split over row indices, with a fixed seed (42).
n = len(lines)
idx = np.arange(n)
rng = np.random.default_rng(42)
rng.shuffle(idx)
cut = int(n * 0.8)
tr, te = idx[:cut], idx[cut:]

# X is a tensor (row selection via tf.gather); y is a NumPy array (fancy indexing).
X_tr, X_te = tf.gather(X, tr), tf.gather(X, te)
y_tr, y_te = y[tr], y[te]

# Small MLP: two hidden ReLU layers with dropout in between, and a softmax
# output with one unit per cluster.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X.shape[1],)),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(k, activation="softmax"),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Train on the training split, holding out 15% of it for validation.
history = model.fit(
    X_tr, y_tr,
    validation_split=0.15,
    epochs=6,          # demo-speed; increase to 15-30 if you want
    batch_size=128,
    verbose=2
)


## 5) Evaluate and predict a new line

In [None]:
# Score the classifier on the held-out test split.
loss, acc = model.evaluate(X_te, y_te, verbose=0)
print(f"Test accuracy (predicting cluster id): {acc:.4f}")

# Classify one unseen line, applying the same preprocessing as for X:
# vectorize -> cast to float32 -> L2-normalize.
new_line = "2026-02-25T12:13:01Z ERROR service=gateway host=srv-02 region=eu-west-1 upstream timeout service=payments route=/v1/pay latency_ms=1800 code=504 trace=deadbeef proto=http/2"
new_vec = vec(tf.constant([new_line]))
new_vec = tf.cast(new_vec, tf.float32)
new_vec = tf.linalg.l2_normalize(new_vec, axis=1)
# The predicted cluster is the class with the highest probability.
pred_cluster = int(tf.argmax(model.predict(new_vec, verbose=0), axis=1)[0])

print("\nNew line predicted cluster:", pred_cluster)
print("Line:", new_line)


---  
### Notes for live training

- **Cluster meaning**: clusters are discovered from text patterns, so you can "name" them by inspecting sample lines.
- **Accuracy** here is "how well the MLP reproduces KMeans labels" (not ground truth). It's a proxy for whether clusters are learnable and stable.
- For speed: reduce `max_tokens`, `k`, `iters`, or `epochs`.
