In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import hdbscan

from IPython.display import display

# Directory layout: raw CSV inputs are read from data/raw; derived artifacts
# are written to data/processed, which is created here if it does not exist.
DATA = Path("./data")
RAW = DATA.joinpath("raw")
PROC = DATA.joinpath("processed")
PROC.mkdir(exist_ok=True, parents=True)

# Single seed reused by the train/test splits and the random forests below.
SEED = 1337
rng = np.random.default_rng(seed=SEED)


In [None]:
# Load the three raw tables: work orders, per-message logs and stop-work records.
wo = pd.read_csv(RAW / "work_orders.csv")
logs = pd.read_csv(RAW / "logs.csv")
stop = pd.read_csv(RAW / "stopworks.csv")

# aggregate logs per WO into a single document
# Rows with a missing message are dropped first: astype(str) would otherwise
# turn NaN into the literal token "nan", which would then be fed to TF-IDF.
doc = (logs
       .dropna(subset=["message"])
       .sort_values(["work_order_id","ts"])
       .groupby("work_order_id")["message"]
       .apply(lambda s: " \n".join(s.astype(str).tolist()))
       .reset_index()
       .rename(columns={"message":"log_doc"}))

# Left join keeps every work order; those without any usable log line get an
# empty document instead of NaN so the vectoriser can consume the column.
df = wo.merge(doc, on="work_order_id", how="left")
df["log_doc"] = df["log_doc"].fillna("")
df.head(3)


In [None]:
# Vectorise each work order's log document as TF-IDF over word 1- to 3-grams.
tfidf = TfidfVectorizer(
    ngram_range=(1,3),
    min_df=5,           # adjust based on corpus size
    max_df=0.6,
    strip_accents="unicode",
    lowercase=True
)
X_text = tfidf.fit_transform(df["log_doc"])

# Persist the term -> column-index mapping next to the matrix. Indices are cast
# to built-in int because some scikit-learn releases store them as numpy
# integers, which the json module refuses to serialise.
with open(PROC / "log_vocab.json", "w", encoding="utf-8") as f:
    json.dump({term: int(col) for term, col in tfidf.vocabulary_.items()}, f)
sparse.save_npz(PROC / "log_tfidf_sparse.npz", X_text)
X_text.shape


In [None]:
# HDBSCAN accepts either a precomputed distance matrix or a metric name; for the
# sparse TF-IDF rows the cosine metric is used.
# min_cluster_size somewhere in the 25-60 range is a reasonable start; tune as needed.
clusterer = hdbscan.HDBSCAN(
    metric="cosine",
    min_cluster_size=40,
    min_samples=None,
    cluster_selection_method="eom",
    cluster_selection_epsilon=0.0,
    prediction_data=True,  # enables membership/probabilities
)

labels = clusterer.fit_predict(X_text)

# Use the fitted membership strengths when the model exposes them; otherwise
# fall back to a confidence of 1.0 for every row.
if hasattr(clusterer, "probabilities_"):
    prob = clusterer.probabilities_
else:
    prob = np.ones_like(labels, dtype=float)

# Attach the clustering outputs to the work-order frame (label -1 marks noise).
df["cluster_id"] = labels
df["cluster_conf"] = prob
df["is_noise"] = np.equal(labels, -1).astype(int)

# optional: soft clustering features via approximate prediction
# from hdbscan.prediction import membership_vector
# soft_membership = membership_vector(clusterer, X_text)  # dict-like per row
df["cluster_id"].value_counts().head(10)


In [None]:
# What words define top clusters?
# Reverse lookup: TF-IDF column index -> term.
inv_vocab = {col: term for term, col in tfidf.vocabulary_.items()}

def top_terms_for_cluster(cid, topk=15):
    """Return the `topk` terms with the highest mean TF-IDF weight in cluster `cid`.

    Member rows are found by comparing the `cluster_id` column of `df` with
    `cid`; their positions are used to slice `X_text`. An empty list is
    returned when no row carries that cluster id.
    """
    member_rows = np.flatnonzero(df["cluster_id"].values == cid)
    if member_rows.size == 0:
        return []
    # Column-wise mean of the member rows, flattened to a 1-D array.
    centroid = np.asarray(X_text[member_rows].mean(axis=0)).ravel()
    ranked = np.argsort(centroid)[::-1][:topk]
    return [inv_vocab[col] for col in ranked]

# Print the defining terms of the five largest real clusters. The noise label
# (-1) is removed before slicing so it cannot use up one of the five slots.
for cid in df["cluster_id"].value_counts().drop(-1, errors="ignore").index[:5]:
    print(f"Cluster {cid}: ", top_terms_for_cluster(cid))


In [None]:
# Optional: bring in normalized Stopworks labels as features
stop_feats = stop.copy()
stop_feats["has_stopworks"] = 1
# Treat empty strings as missing so they end up in the "UNK" bucket below.
stop_feats["norm_subsystem"] = stop_feats["norm_subsystem"].replace("", np.nan)
stop_feats["norm_root_cause"] = stop_feats["norm_root_cause"].replace("", np.nan)
# Keep a single stop-work row per work order. Without this, a work order with
# several stop-work records would be duplicated by the left merge, breaking the
# one-row-per-work-order alignment with X_text and letting copies of the same
# work order land on both sides of the train/test split.
stop_feats = stop_feats.drop_duplicates(subset="work_order_id", keep="first")

df2 = df.merge(stop_feats[["work_order_id","has_stopworks","norm_subsystem","norm_root_cause"]],
               on="work_order_id", how="left")
assert len(df2) == len(df), "stop-work merge must not change the number of work orders"

# Work orders with no stop-work record: flag 0 and an explicit "UNK" category.
df2["has_stopworks"] = df2["has_stopworks"].fillna(0).astype(int)
for c in ["norm_subsystem","norm_root_cause"]:
    df2[c] = df2[c].fillna("UNK")

y = df2["failure_label"].astype(int)

# NOTE(review): norm_subsystem / norm_root_cause come from stop-work records; if
# those are only written after a failure they leak the label - confirm timing.
# has_stopworks is kept on df2 but is not part of the feature lists below.
cat_cols = ["catalog_id","supplier","device_type","technician","shift","norm_subsystem","norm_root_cause"]
num_cols = ["cluster_conf","is_noise"]
ord_cols = []  # add dates if encoding e.g., day-of-week/seasonality

X_tab = df2[cat_cols + num_cols + ord_cols + ["cluster_id"]].copy()
# Turn cluster_id into a categorical string so OneHot can handle -1 nicely
X_tab["cluster_id"] = "cid_" + X_tab["cluster_id"].astype(str)

# One-hot encode the categoricals (plus the stringified cluster id) and pass the
# numeric columns through unchanged. OneHotEncoder is left on its default sparse
# output: the explicit `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4, so omitting it works on every version.
pre = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols + ["cluster_id"]),
        ("passthru", "passthrough", num_cols + ord_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0
)

# Random forest with class weights recomputed for each bootstrap sample,
# chained after the preprocessing step.
clf = RandomForestClassifier(
    random_state=SEED,
    n_estimators=400,
    max_depth=None,
    class_weight="balanced_subsample",
    oob_score=False,
    n_jobs=-1,
)
pipe = Pipeline(steps=[("prep", pre), ("rf", clf)])


In [None]:
# Stratified 75/25 hold-out split, then fit and score the cluster-aware model.
X_train, X_test, y_train, y_test = train_test_split(
    X_tab,
    y,
    stratify=y,
    test_size=0.25,
    random_state=SEED,
)

pipe.fit(X_train, y_train)
# Column 1 of predict_proba is the score for the second entry of classes_
# (presumably the failure class, assuming 0/1 labels).
proba = pipe.predict_proba(X_test)[:, 1]
pred = pipe.predict(X_test)

print(classification_report(y_test, pred, digits=3))
print("ROC AUC:", roc_auc_score(y_test, proba))


In [None]:
# Ablation baseline: same model and split, with every cluster-derived feature
# removed. cluster_conf and is_noise are produced by HDBSCAN just like
# cluster_id, so they must be excluded too for the comparison to be meaningful.
cluster_derived = {"cluster_id", "cluster_conf", "is_noise"}
num_cols_nc = [c for c in num_cols if c not in cluster_derived]
no_cluster_cols = cat_cols + num_cols_nc
X_nc = df2[no_cluster_cols].copy()

# OneHotEncoder keeps its default sparse output (the `sparse=` keyword was
# removed in scikit-learn 1.4). The passthrough step is only added when at
# least one non-cluster numeric column remains.
transformers_nc = [("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)]
if num_cols_nc:
    transformers_nc.append(("passthru", "passthrough", num_cols_nc))

pre_nc = ColumnTransformer(
    transformers=transformers_nc,
    remainder="drop",
    sparse_threshold=1.0
)
pipe_nc = Pipeline([("prep", pre_nc), ("rf", RandomForestClassifier(
    n_estimators=400, n_jobs=-1, random_state=SEED, class_weight="balanced_subsample"
))])

# Same test_size, seed and stratification as the cluster-aware split.
X_train_nc, X_test_nc, y_train_nc, y_test_nc = train_test_split(
    X_nc, y, test_size=0.25, random_state=SEED, stratify=y
)

pipe_nc.fit(X_train_nc, y_train_nc)
pred_nc = pipe_nc.predict(X_test_nc)
proba_nc = pipe_nc.predict_proba(X_test_nc)[:,1]

print("== Without clusters ==")
print(classification_report(y_test_nc, pred_nc, digits=3))
print("ROC AUC:", roc_auc_score(y_test_nc, proba_nc))


In [None]:
# Row count per cluster label (noise label -1 included), largest first.
size_counts = df["cluster_id"].value_counts()
cluster_sizes = size_counts.rename_axis("cluster_id").reset_index(name="size")
display(cluster_sizes.head(10))

# Failure rate per cluster
# Mean of failure_label within each cluster, joined with the sizes above and
# ordered from the highest failure rate down.
rate_by_cluster = df2.groupby("cluster_id")["failure_label"].mean().rename("failure_rate")
fr_per_cluster = (
    rate_by_cluster.reset_index()
    .merge(cluster_sizes, how="left", on="cluster_id")
    .sort_values("failure_rate", ascending=False)
)
display(fr_per_cluster.head(15))
