In [1]:
!pip -q install scikit-learn

In [2]:
import re, hashlib, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import Pipeline

rng = np.random.default_rng(7)

root_causes = ["NullPointer", "OutOfMemory", "RaceDeadlock", "BadInput", "Network", "IO", "GPUDriver"]
exception_map = {
    "NullPointer": ["NullPointerException", "SIGSEGV", "EXC_BAD_ACCESS"],
    "OutOfMemory": ["OutOfMemoryError", "std::bad_alloc", "OOMKilled"],
    "RaceDeadlock": ["DeadlockDetected", "ANR", "ThreadStateException"],
    "BadInput": ["ValueError", "IllegalArgumentException", "ParseError"],
    "Network": ["TimeoutError", "ConnectionReset", "SSLHandshakeError"],
    "IO": ["IOException", "DiskFull", "PermissionDenied"],
    "GPUDriver": ["DXGI_ERROR_DEVICE_REMOVED", "VK_ERROR_DEVICE_LOST", "GPU Hang"]
}

modules = ["core", "ui", "net", "storage", "ml", "render", "audio"]

In [3]:
def synth_stack(rc):
    exc = rng.choice(exception_map[rc])
    mod = rng.choice(modules)
    line = int(rng.integers(10, 500))
    addr = hex(int(rng.integers(0, 2**32)))
    frames = []
    if rc == "NullPointer":
        frames = [
            f"at {mod}.UserSession.getToken(UserSession.java:{line})",
            f"at {mod}.AuthInterceptor.intercept(AuthInterceptor.java:{line+5})",
            f"at okhttp3.RealCall.execute(RealCall.kt:{line+20})",
        ]
    elif rc == "OutOfMemory":
        frames = [
            f"at ml.Tensor.allocate(Tensor.cpp:{line})",
            f"at ml.Model.forward(Model.cpp:{line+12})",
            f"at render.Pipeline.run(Pipeline.cpp:{line+30})",
        ]
    elif rc == "RaceDeadlock":
        frames = [
            f"at core.LockManager.acquire(LockManager.cpp:{line})",
            f"at storage.DbTxn.commit(DbTxn.cpp:{line+8})",
            f"at core.ThreadPool.worker(ThreadPool.cpp:{line+25})",
        ]
    elif rc == "BadInput":
        frames = [
            f"at ui.JsonParser.parse(JsonParser.kt:{line})",
            f"at ui.Config.load(Config.kt:{line+10})",
            f"at core.App.start(App.kt:{line+30})",
        ]
    elif rc == "Network":
        frames = [
            f"at net.HttpClient.request(HttpClient.py:{line})",
            f"at net.RetryPolicy.call(RetryPolicy.py:{line+12})",
            f"at core.SyncService.run(SyncService.py:{line+30})",
        ]
    elif rc == "IO":
        frames = [
            f"at storage.FileStore.write(FileStore.go:{line})",
            f"at storage.Cache.flush(Cache.go:{line+8})",
            f"at core.ShutdownHook.run(ShutdownHook.go:{line+25})",
        ]
    else:  # GPUDriver
        frames = [
            f"at render.GpuDevice.submit(GpuDevice.cpp:{line})",
            f"at render.Renderer.draw(Renderer.cpp:{line+9})",
            f"at ui.FrameLoop.tick(FrameLoop.cpp:{line+40})",
        ]

    msg = f"{exc}: crash at addr {addr} (build={int(rng.integers(1000,2000))})"
    stack = "\n".join([msg] + frames)
    return exc, msg, stack

rows = []
for i in range(2500):
    rc = rng.choice(root_causes, p=[0.18,0.15,0.12,0.18,0.14,0.13,0.10])
    exc, msg, st = synth_stack(rc)
    rows.append({
        "id": i,
        "app_version": f"1.{int(rng.integers(0,10))}.{int(rng.integers(0,30))}",
        "platform": rng.choice(["android", "ios", "windows"]),
        "exception_type": exc,
        "message": msg,
        "stacktrace": st,
        "root_cause": rc
    })

df = pd.DataFrame(rows)
print(df.head(2))

   id app_version platform    exception_type  \
0   0       1.2.1  android        ParseError   
1   1       1.7.3      ios  PermissionDenied   

                                             message  \
0  ParseError: crash at addr 0xc693565f (build=1833)   
1  PermissionDenied: crash at addr 0xd23c068f (bu...   

                                          stacktrace root_cause  
0  ParseError: crash at addr 0xc693565f (build=18...   BadInput  
1  PermissionDenied: crash at addr 0xd23c068f (bu...         IO  


In [4]:
# Normalize stack traces

re_line = re.compile(r":\d+\)")
re_addr = re.compile(r"0x[0-9a-fA-F]+")
re_build = re.compile(r"build=\d+")

def normalize_text(s: str) -> str:
    s = s.lower()
    s = re_addr.sub("0xADDR", s)
    s = re_build.sub("build=NUM", s)
    s = re_line.sub(":LINE)", s)
    return s

df["text"] = (df["exception_type"].fillna("") + " " +
              df["message"].fillna("") + " " +
              df["stacktrace"].fillna("")).apply(normalize_text)

# Crash signature: exception + top 3 frames (normalized)
def signature_from_stack(stack: str, exc: str, topk=3) -> str:
    lines = [l.strip() for l in (stack or "").splitlines() if l.strip()]
    frames = [l for l in lines if l.startswith("at ")]
    key = exc.lower() + " | " + " | ".join(frames[:topk]).lower()
    key = normalize_text(key)
    return hashlib.md5(key.encode("utf-8")).hexdigest()

df["signature"] = df.apply(lambda r: signature_from_stack(r["stacktrace"], r["exception_type"]), axis=1)

# Show top signatures (dedup)
sig_counts = df["signature"].value_counts().head(10)
print("\nTop crash signatures:\n", sig_counts)


Top crash signatures:
 signature
e0c7788007543a6a50b2bee882385c32    143
a01ad561a31d1a56f75eea817962e880    140
6990b11c82d38b5bb8c957398a3935c9    132
de30dcc1785d8aa2e5d03b69f8dd457f    131
741009c2745abc6c8327ea4080256d7f    129
d60f89ab566fc2222425fb1e48781f39    129
1826ee1dad5437fa6f59686f9b97633f    119
29db2a64f28082e309e2bda783d4e621    118
290336ac4899a3193d838cad9a02aa1a    117
c9d7c4b57a4e202e6f8d5940461bf5d8    109
Name: count, dtype: int64


In [5]:
# Root cause classifier (TF-IDF + Logistic Regression)

X = df["text"].values
y = df["root_cause"].values

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=200_000)),
    ("clf", LogisticRegression(max_iter=300, n_jobs=-1))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.empty_like(y, dtype=object)

for fold, (tr, va) in enumerate(skf.split(X, y), 1):
    pipe.fit(X[tr], y[tr])
    oof[va] = pipe.predict(X[va])
    macro = f1_score(y[va], oof[va], average="macro")
    print(f"Fold {fold} macro-F1: {macro:.4f}")

print("\nOverall macro-F1:", f1_score(y, oof, average="macro"))
print("\nClassification report:\n", classification_report(y, oof))

Fold 1 macro-F1: 1.0000
Fold 2 macro-F1: 1.0000
Fold 3 macro-F1: 1.0000
Fold 4 macro-F1: 1.0000
Fold 5 macro-F1: 1.0000

Overall macro-F1: 1.0

Classification report:
               precision    recall  f1-score   support

    BadInput       1.00      1.00      1.00       415
   GPUDriver       1.00      1.00      1.00       254
          IO       1.00      1.00      1.00       311
     Network       1.00      1.00      1.00       346
 NullPointer       1.00      1.00      1.00       464
 OutOfMemory       1.00      1.00      1.00       389
RaceDeadlock       1.00      1.00      1.00       321

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500



In [6]:
# RCA-style output: explain a crash

def explain_one(idx: int, top_sim=5):
    # Find similar crashes by shared signature first, otherwise by TF-IDF similarity
    row = df.iloc[idx]
    sig = row["signature"]
    same = df[df["signature"] == sig].head(top_sim)

    print("\n--- Crash ---")
    print("ID:", row["id"])
    print("True root cause:", row["root_cause"])
    print("Exception:", row["exception_type"])
    print("Signature:", sig)
    print("\nStack (first lines):")
    print("\n".join(row["stacktrace"].splitlines()[:5]))

    print("\n--- Similar crashes (same signature) ---")
    print(same[["id","root_cause","exception_type","app_version","platform"]].to_string(index=False))

explain_one(0)


--- Crash ---
ID: 0
True root cause: BadInput
Exception: ParseError
Signature: e0c7788007543a6a50b2bee882385c32

Stack (first lines):
ParseError: crash at addr 0xc693565f (build=1833)
at ui.JsonParser.parse(JsonParser.kt:293)
at ui.Config.load(Config.kt:303)
at core.App.start(App.kt:323)

--- Similar crashes (same signature) ---
 id root_cause exception_type app_version platform
  0   BadInput     ParseError       1.2.1  android
  5   BadInput     ParseError      1.6.13      ios
 54   BadInput     ParseError      1.8.26      ios
 66   BadInput     ParseError      1.1.13  windows
 73   BadInput     ParseError      1.8.11  android
