# Production Fingerprinting — Diagnostics & Parameter Sweep (v3)
Purpose-built for diagnostics and iteration on clustering and model performance.

Includes clustering diagnostics, parameter sweep, ablation (baseline vs +clusters), threshold selection, and macro cell (`quick_diag`).

In [21]:
# --- Setup & Config ---
from pathlib import Path
import numpy as np
import pandas as pd

DATA_DIR = Path("data")
RAW_DIR  = DATA_DIR / "raw"
PROC_DIR = DATA_DIR / "processed"
PROC_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
TEST_SIZE   = 0.25

# Vectorization
NGRAM_RANGE   = (1, 2)
MIN_DF        = 1
MAX_FEATURES  = 100_000
svd_components = 100  # keep; later try 150


# Clustering
USE_HDBSCAN               = True
HDBSCAN_MIN_CLUSTER_SIZE  = 10
HDBSCAN_MIN_SAMPLES       = None  # or an int < min_cluster_size

TOP_K_CLUSTERS            = 100

print("Config loaded.")

Config loaded.


In [22]:
# --- Imports ---
import numpy as np
import pandas as pd

from typing import List

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD

try:
    import hdbscan
    HDBSCAN_AVAILABLE = True
except Exception as e:
    HDBSCAN_AVAILABLE = False
    from sklearn.cluster import DBSCAN
    import warnings
    warnings.warn(f"hdbscan not available ({e}); falling back to DBSCAN.")

import matplotlib.pyplot as plt

np.random.seed(RANDOM_SEED)
print("Imports OK. HDBSCAN_AVAILABLE =", HDBSCAN_AVAILABLE)

Imports OK. HDBSCAN_AVAILABLE = True


In [23]:
# --- Data helpers ---
def load_raw_frames(raw_dir: Path):
    wo = pd.read_csv(raw_dir / "work_orders.csv")
    logs = pd.read_csv(raw_dir / "logs.csv")
    sw = pd.read_csv(raw_dir / "stopworks.csv")
    return wo, logs, sw

def combine_text_per_wo(logs: pd.DataFrame, sw: pd.DataFrame) -> pd.DataFrame:
    logs = logs.copy()
    logs["text_piece"] = logs["level"].astype(str) + " " + logs["message"].astype(str)
    agg_logs = (
        logs.groupby("work_order_id")["text_piece"]
            .apply(lambda s: "\n".join(s.tolist()))
            .rename("logs_text").reset_index()
    )
    if not sw.empty:
        sw_agg = (
            sw.groupby("work_order_id")["note_text"]
              .apply(lambda s: "\n".join(s.astype(str).tolist()))
              .rename("stopworks_text").reset_index()
        )
    else:
        sw_agg = pd.DataFrame(columns=["work_order_id", "stopworks_text"])

    text_df = agg_logs.merge(sw_agg, on="work_order_id", how="left")
    text_df["stopworks_text"] = text_df["stopworks_text"].fillna("")
    text_df["all_text"] = (text_df["logs_text"].astype(str) + "\n" + text_df["stopworks_text"].astype(str)).str.strip()
    return text_df

def basic_log_stats(logs: pd.DataFrame) -> pd.DataFrame:
    cnt = logs.pivot_table(index="work_order_id", columns="level", values="message", aggfunc="count", fill_value=0)
    cnt = cnt.add_prefix("logcnt_").reset_index()
    total = logs.groupby("work_order_id")["message"].size().rename("logcnt_total").reset_index()
    logs["msg_len"] = logs["message"].astype(str).str.len()
    avg_len = logs.groupby("work_order_id")["msg_len"].mean().rename("loglen_avg").reset_index()
    return cnt.merge(total, on="work_order_id", how="outer").merge(avg_len, on="work_order_id", how="outer").fillna(0)

In [24]:
# --- Vectorization & clustering ---
def make_tfidf_matrix(texts: List[str], ngram_range=(1,2), min_df=2, max_features=100_000):
    vect = TfidfVectorizer(
        ngram_range=ngram_range,
        min_df=min_df,
        max_features=max_features,
    )
    X = vect.fit_transform(texts)
    return X, vect

def reduce_svd(X, n_components=100, random_state=42):
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    Xr = svd.fit_transform(X)
    return Xr, svd

def cluster_texts_matrix(X, use_hdbscan=True, min_cluster_size=20, min_samples=None,
                         selection='leaf', metric='euclidean'):
    if use_hdbscan and HDBSCAN_AVAILABLE:
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,                 # try None first
            metric=metric,                           # 'euclidean' on SVD; 'cosine' if using raw TF-IDF
            cluster_selection_method=selection,      # 'leaf' gives more clusters
            cluster_selection_epsilon=0.0,           # can tune >0 later
            prediction_data=False,
        )
        labels = clusterer.fit_predict(X)
        return labels, "hdbscan", clusterer
    else:
        from sklearn.cluster import DBSCAN
        clusterer = DBSCAN(eps=0.8, min_samples=10, metric="euclidean")
        labels = clusterer.fit_predict(X if isinstance(X, np.ndarray) else X.toarray())
        model_type = "dbscan"
        model = clusterer
    return labels, model_type, model

def build_cluster_feature_frame(work_order_ids, labels, top_k=100, include_noise_flag=True):
    df = pd.DataFrame({"work_order_id": work_order_ids, "cluster": labels})
    freq = df["cluster"].value_counts(dropna=False)
    kept = [c for c in freq.index.tolist() if c != -1][:top_k]
    pivot = (
        df.assign(val=1)
          .pivot_table(index="work_order_id", columns="cluster", values="val", aggfunc="sum", fill_value=0)
          .reset_index()
    )
    if include_noise_flag:
        saw_noise = (pivot.get(-1, 0) > 0).astype(int)
        pivot["saw_noise"] = saw_noise
    cols = ["work_order_id"] + kept + ([-1] if -1 in pivot.columns else []) + (["saw_noise"] if include_noise_flag else [])
    pivot = pivot[[c for c in cols if c in pivot.columns]]
    new_cols = {c: (f"cl_{int(c)}" if isinstance(c, (int,)) else c) for c in pivot.columns if c not in ["work_order_id","saw_noise"]}
    pivot = pivot.rename(columns=new_cols)
    return pivot

In [30]:
from sklearn.preprocessing import normalize

# --- Feature assembly & preprocessors ---
def build_tabular_features(wo: pd.DataFrame, logs: pd.DataFrame, include_log_stats=True):
    base = wo.copy()
    y = base["failure_label"].astype(int).values
    base = base.drop(columns=["failure_label"])
    if include_log_stats:
        stats = basic_log_stats(logs)
        base = base.merge(stats, on="work_order_id", how="left")
        num_cols = base.select_dtypes(include=[np.number]).columns
        base[num_cols] = base[num_cols].fillna(0)
    return base, y

def build_preprocessor(frame: pd.DataFrame):
    base = frame.drop(columns=["work_order_id", "failure_label"], errors="ignore").copy()
    cat_cols = base.select_dtypes(include=["object"]).columns.tolist()
    num_cols = [c for c in base.columns if c not in cat_cols]
    return ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("ohe", OneHotEncoder(handle_unknown="ignore")),
            ]), cat_cols),
        ],
        remainder="drop",
        verbose_feature_names_out=False,
    )

def assemble_features(wo, logs, sw,
                      ngram_range=(1,2), min_df=2, max_features=100_000,
                      use_hdb=True, min_cluster_size=20, min_samples=None,
                      svd_components=100, top_k_clusters=100, seed=42):
    text_df = combine_text_per_wo(logs, sw)
    X_text, tfidf = make_tfidf_matrix(
        text_df["all_text"].tolist(),
        ngram_range=ngram_range, min_df=min_df, max_features=max_features
    )
    # after SVD
    if svd_components:
        X_input, svd = reduce_svd(X_text, n_components=svd_components, random_state=seed)
        X_input = normalize(X_input)             # <-- IMPORTANT: row-normalize
        labels, model_type, cluster_model = cluster_texts_matrix(
            X_input, use_hdbscan=use_hdb, min_cluster_size=min_cluster_size,
            min_samples=min_samples, selection='leaf', metric='euclidean'
        )
    else:
        # If you ever cluster raw TF-IDF, use cosine metric
        X_input = normalize(X_text)
        labels, model_type, cluster_model = cluster_texts_matrix(
            X_input, use_hdbscan=use_hdb, min_cluster_size=min_cluster_size,
            min_samples=min_samples, selection='leaf', metric='cosine'
        )

    labels, model_type, cluster_model = cluster_texts_matrix(
        X_input, use_hdbscan=use_hdb, min_cluster_size=min_cluster_size, min_samples=min_samples
    )
    cl_feats = build_cluster_feature_frame(text_df["work_order_id"].tolist(), labels, top_k=top_k_clusters, include_noise_flag=True)
    base_tab, y = build_tabular_features(wo, logs, include_log_stats=True)
    merged = base_tab.merge(cl_feats, on="work_order_id", how="left").fillna(0)

    # Save cluster assignments for diagnostics
    assign_path = PROC_DIR / "cluster_assignments.csv"
    pd.DataFrame({"work_order_id": text_df["work_order_id"], "cluster": labels}).to_csv(assign_path, index=False)

    meta = {
        "tfidf": tfidf,
        "svd": svd,
        "cluster_model": cluster_model,
        "cluster_model_type": model_type,
        "labels": labels,
        "noise_rate": float((pd.Series(labels) == -1).mean())
    }
    return merged, y, meta

In [11]:
# --- Modeling & evaluation + Macro cell ---
def split_xy(frame: pd.DataFrame, y: np.ndarray, test_size=0.25, seed=42):
    keys = frame["work_order_id"].values
    X = frame.drop(columns=["work_order_id"])
    X_train, X_test, y_train, y_test, k_train, k_test = train_test_split(
        X, y, keys, test_size=test_size, random_state=seed, stratify=y
    )
    return (X_train, y_train, k_train), (X_test, y_test, k_test)

def make_rf(random_state=42):
    return RandomForestClassifier(
        n_estimators=800,
        max_depth=20,
        min_samples_leaf=5,
        n_jobs=-1,
        random_state=random_state,
        class_weight="balanced_subsample",
    )

def evaluate_binary(y_true, y_prob, threshold=None):
    if threshold is None:
        p, r, t = precision_recall_curve(y_true, y_prob)
        f1 = 2*p*r/(p+r+1e-9)
        i = np.nanargmax(f1)
        threshold = t[i-1] if i>0 else 0.5
    y_pred = (y_prob >= threshold).astype(int)
    return {
        "threshold": float(threshold),
        "roc_auc": float(roc_auc_score(y_true, y_prob)),
        "pr_auc": float(average_precision_score(y_true, y_prob)),
        "report": classification_report(y_true, y_pred, digits=3)
    }

def quick_diag(name, pipe, X_test, y_test, feat_merged, labels_path="data/processed/cluster_assignments.csv"):
    prob = pipe.predict_proba(X_test)[:,1]
    eval_best = evaluate_binary(y_test, prob, threshold=None)

    # Cluster outcomes
    assign = pd.read_csv(labels_path)
    df = pd.DataFrame({"work_order_id": feat_merged["work_order_id"], "y": feat_merged["failure_label"].astype(int)})
    tab = assign.merge(df, on="work_order_id").groupby("cluster").y.agg(['mean','count']).sort_values('mean', ascending=False)
    noise_rate = (assign.cluster==-1).mean()

    return {
        "Name": name,
        "Noise_rate": float(noise_rate),
        "ROC_AUC": eval_best["roc_auc"],
        "PR_AUC": eval_best["pr_auc"],
        "Best_threshold": eval_best["threshold"],
        "Report@BestF1": eval_best["report"],
        "Cluster_outcomes_head": tab.head(10)
    }

In [27]:
# --- Load data & assemble features ---
wo, logs, sw = load_raw_frames(RAW_DIR)
feat_merged, y, meta = assemble_features(
    wo, logs, sw,
    ngram_range=NGRAM_RANGE, min_df=MIN_DF, max_features=MAX_FEATURES,
    use_hdb=USE_HDBSCAN, min_cluster_size=HDBSCAN_MIN_CLUSTER_SIZE, min_samples=HDBSCAN_MIN_SAMPLES,
    svd_components=None, top_k_clusters=TOP_K_CLUSTERS, seed=RANDOM_SEED
)
pd.DataFrame(feat_merged.assign(failure_label=y)).to_csv(PROC_DIR / "features_merged_diag.csv", index=False)
print("Merged features:", feat_merged.shape, "| Noise rate:", meta["noise_rate"])



Merged features: (4000, 16) | Noise rate: 0.8765


In [28]:
# Identify cluster feature columns
cluster_cols = [c for c in feat_merged.columns if c.startswith("cl_")]
if not cluster_cols:
    print("Warning: no cluster columns present. Consider lowering HDBSCAN_MIN_CLUSTER_SIZE or MIN_DF.")

# Baseline frame excludes cluster columns
baseline_frame = feat_merged[["work_order_id"] + [c for c in feat_merged.columns if c not in cluster_cols + ["work_order_id"]]]

# Build preprocessors on feature-only frames (no labels)
pre_base = build_preprocessor(baseline_frame)
pre_full = build_preprocessor(feat_merged)

def debug_pre_cols(frame, name):
    base = frame.drop(columns=["work_order_id", "failure_label"], errors="ignore")
    cat_cols = base.select_dtypes(include=["object"]).columns.tolist()
    num_cols = [c for c in base.columns if c not in cat_cols]
    print(f"[{name}] num_cols({len(num_cols)}):", num_cols[:8], "...")
    print(f"[{name}] cat_cols({len(cat_cols)}):", cat_cols[:8], "...")

debug_pre_cols(baseline_frame, "baseline")
debug_pre_cols(feat_merged, "full")

# Train/test splits
(train_b, y_tr_b, _), (test_b, y_te_b, _) = split_xy(baseline_frame, y, test_size=TEST_SIZE, seed=RANDOM_SEED)
(train_f, y_tr_f, _), (test_f, y_te_f, _) = split_xy(feat_merged, y, test_size=TEST_SIZE, seed=RANDOM_SEED)

# Pipelines
pipe_base = Pipeline([("prep", pre_base), ("rf", make_rf(RANDOM_SEED))]).fit(train_b, y_tr_b)
pipe_full = Pipeline([("prep", pre_full), ("rf", make_rf(RANDOM_SEED))]).fit(train_f, y_tr_f)

# Predict & evaluate
prob_b = pipe_base.predict_proba(test_b)[:, 1]
prob_f = pipe_full.predict_proba(test_f)[:, 1]

diag_base = evaluate_binary(y_te_b, prob_b)
diag_full = evaluate_binary(y_te_f, prob_f)

print("=== Baseline ===")
print(f"ROC-AUC: {diag_base['roc_auc']:.4f} | PR-AUC: {diag_base['pr_auc']:.4f}")
print(diag_base["report"])

print("=== +Clusters ===")
print(f"ROC-AUC: {diag_full['roc_auc']:.4f} | PR-AUC: {diag_full['pr_auc']:.4f}")
print(diag_full["report"])

[baseline] num_cols(6): ['logcnt_ERROR', 'logcnt_INFO', 'logcnt_WARN', 'logcnt_total', 'loglen_avg', 'saw_noise'] ...
[baseline] cat_cols(6): ['catalog_id', 'supplier', 'device_type', 'technician', 'shift', 'build_date'] ...
[full] num_cols(9): ['logcnt_ERROR', 'logcnt_INFO', 'logcnt_WARN', 'logcnt_total', 'loglen_avg', 'cl_0', 'cl_1', 'cl_-1'] ...
[full] cat_cols(6): ['catalog_id', 'supplier', 'device_type', 'technician', 'shift', 'build_date'] ...
=== Baseline ===
ROC-AUC: 0.5656 | PR-AUC: 0.2107
              precision    recall  f1-score   support

           0      0.858     0.618     0.719       823
           1      0.229     0.525     0.318       177

    accuracy                          0.602      1000
   macro avg      0.543     0.572     0.519      1000
weighted avg      0.747     0.602     0.648      1000

=== +Clusters ===
ROC-AUC: 0.5643 | PR-AUC: 0.2107
              precision    recall  f1-score   support

           0      0.859     0.557     0.676       823
         

In [29]:
# --- Macro usage example ---
diag = quick_diag("current-config", pipe_full, test_f, y_te_f, pd.DataFrame(feat_merged.assign(failure_label=y)))
diag

{'Name': 'current-config',
 'Noise_rate': 0.8765,
 'ROC_AUC': 0.5642509490564355,
 'PR_AUC': 0.2107188339666781,
 'Best_threshold': 0.43828886700591946,
 'Report@BestF1': '              precision    recall  f1-score   support\n\n           0      0.859     0.557     0.676       823\n           1      0.218     0.576     0.317       177\n\n    accuracy                          0.560      1000\n   macro avg      0.539     0.566     0.496      1000\nweighted avg      0.746     0.560     0.612      1000\n',
 'Cluster_outcomes_head':              mean  count
 cluster                 
  0       0.198758    483
 -1       0.174843   3506
  1       0.000000     11}

In [26]:
from sklearn.metrics import precision_recall_curve, classification_report
import numpy as np

def eval_with_precision_floor(y_true, y_prob, floor=0.35):
    p, r, th = precision_recall_curve(y_true, y_prob)
    ok = np.where(p[:-1] >= floor)[0]
    if len(ok)==0:
        print("No threshold meets the precision floor.")
        return None
    i = ok[np.argmax(r[ok])]
    t = th[i]
    y_pred = (y_prob >= t).astype(int)
    print(f"Chosen threshold: {t:.3f} (precision≥{floor}) | P:{p[i]:.3f} R:{r[i]:.3f}")
    print(classification_report(y_true, y_pred, digits=3))
    return t