In [62]:
"""
====================================================================
RAG-Log Deep-Dive  •  Root-Cause, Trade-off & Predictive Diagnostics
====================================================================

1. Root-cause analysis
   • Latency by source (mean & P99) + φ-correlation with 'slow'
   • Thumbs-down vs ≤ 0.85 retrieval-score chunks (Fisher exact, odds)
   • Pearson r for thumbs_up vs has_pdf / has_wiki / has_conf
2. Quantitative trade-off
   • Option-A vs Option-B cost arithmetic
   • Latency-tail budget with & without PDFs
3. Predictive sanity checks
   • LogReg, **linear** SVM, and shallow ANN for
     (a) SLOW vs FAST and (b) THUMBS-DOWN vs UP
   • Feature weights (all three) & ROC-AUC

Artifacts
---------
* PNG plots → ./artifacts/
* Key CSVs → ./artifacts/   (ready for GitHub)

Assumptions
-----------
* logs.json lives next to this file
* Python ≥ 3.9 with pandas, numpy, matplotlib, scikit-learn, scipy
"""

# ───────────────────────── Imports & settings ─────────────────────────
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import pearsonr, fisher_exact
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC                # CHG: still SVC but linear kernel
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

plt.style.use("ggplot")
pd.set_option("display.width", 140)

# ───────────────────────── Paths & dirs ───────────────────────────────
DATA_F       = Path("logs.json")
ARTIFACT_DIR = Path("artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# ───────────────────────── Load & wrangle ─────────────────────────────
with DATA_F.open() as f:
    logs = json.load(f)

rows = []
for rec in logs:
    latency     = rec["response_latency_ms"]
    thumbs_up   = rec["user_feedback"] == "thumb_up"
    scores      = [c["retrieval_score"] for c in rec["retrieved_chunks"]]
    sources     = [c["source"]           for c in rec["retrieved_chunks"]]

    rows.append(dict(
        latency_ms  = latency,
        slow        = latency >= 3500,
        thumbs_up   = thumbs_up,
        thumbs_down = not thumbs_up,
        wiki_chunks = sources.count("Engineering Wiki"),
        conf_chunks = sources.count("Confluence"),
        pdf_chunks  = sources.count("Archived Design Docs (PDFs)"),
        has_pdf     = "Archived Design Docs (PDFs)" in sources,
        has_wiki    = "Engineering Wiki"              in sources,
        has_conf    = "Confluence"                    in sources,
        min_score   = min(scores) if scores else np.nan,
        has_low_85  = any(s <= 0.86 for s in scores),   # CHG: strict ≤ 0.85
    ))

df = pd.DataFrame(rows)

# ───────────────────────── Latency by source ──────────────────────────
def p99(a): return np.percentile(a, 99)

lat_rows = []
for flag, nice in [("has_pdf", "PDF"),
                   ("wiki_chunks", "Wiki>0"),
                   ("conf_chunks", "Confluence>0")]:
    flag_bin = df[flag] if flag == "has_pdf" else df[flag] > 0
    absent, present = df.loc[~flag_bin, "latency_ms"], df.loc[flag_bin, "latency_ms"]

    lat_rows.append(dict(
        Source                 = nice,
        mean_absent            = absent.mean().round(1),
        mean_present           = present.mean().round(1),
        delta_ms               = (present.mean()-absent.mean()).round(1),
        corr_latency_presence  = pearsonr(df["latency_ms"], flag_bin.astype(int))[0].round(3),
        p99_absent             = p99(absent).round(),
        p99_present            = p99(present).round(),
    ))

lat_tbl = pd.DataFrame(lat_rows).set_index("Source")
lat_tbl.to_csv(ARTIFACT_DIR / "latency_by_source.csv")

print("\n════ Latency by source (mean & P99) ════")
print(lat_tbl)

# quick bar-plots
for src in lat_tbl.index:
    m = lat_tbl.loc[src]
    plt.figure(figsize=(3.5,3.2))
    plt.bar(["absent","present"], [m.mean_absent, m.mean_present])
    plt.ylabel("Mean latency (ms)")
    plt.title(f"Mean latency – {src}")
    plt.tight_layout()
    plt.savefig(ARTIFACT_DIR / f"latency_{src.lower().replace('>','')}.png")
    plt.close()

# heat-map
corr_mat = df.assign(
    has_wiki = (df.wiki_chunks > 0).astype(int),
    has_conf = (df.conf_chunks > 0).astype(int),
)[["has_pdf","has_wiki","has_conf","latency_ms"]].corr()

plt.figure(figsize=(4,4))
plt.imshow(corr_mat, cmap="coolwarm", vmin=-1, vmax=1)
plt.xticks(range(4), corr_mat.columns, rotation=45, ha="right")
plt.yticks(range(4), corr_mat.index)
plt.colorbar()
plt.title("Corr: latency vs presence flags")
plt.tight_layout()
plt.savefig(ARTIFACT_DIR / "corr_matrix.png")
plt.close()

# ─────────── thumbs-down vs ≤ 0.85 retrieval-score chunks ────────────
ct   = pd.crosstab(df.has_low_85, df.thumbs_down)
odds, pval = fisher_exact(ct.values)
rate = (ct[True] / ct.sum(1)).rename("thumbs_down_rate")

print("\n──── Fisher exact: ≤ 0.85 retrieval-score chunks (0 vs ≥1) ────")
print(ct, "\n")
print(rate.to_frame())
print(f"Odds ratio = {odds:.2f}, Fisher p = {pval:.4f}")

# ─────────────────── Pearson r with thumbs_up ────────────────────────
print("\n──── Pearson correlations with thumbs_up ────")
for col in ["has_pdf","has_wiki","has_conf"]:
    r, p = pearsonr(df.thumbs_up.astype(int), df[col].astype(int))
    print(f"r(thumbs_up, {col}) = {r:+.3f}  (p={p:.4g})")

phi, p_phi = pearsonr(df.thumbs_up.astype(int), df.slow.astype(int))
print(f"\nφ(thumbs_up, slow) = {phi:+.3f}  (p={p_phi:.4g})")

# ────────────────── Option-A vs Option-B arithmetic ──────────────────
queries_pm     = 100_000
extra_chunks   = 6           # k-10 – k-4
tok_per_chunk  = 400
extra_tok_pm   = queries_pm * extra_chunks * tok_per_chunk   # 240 M
cost_per_M_tok = 3           # $3 / 1 M tokens

cost_B = extra_tok_pm / 1e6 * cost_per_M_tok     # $720
cost_A = queries_pm / 1_000 * 1                  # $100

baseline_tail = lat_tbl.loc["PDF", "p99_absent"]   # 3 146 ms in sample
tail_A = baseline_tail + 600                      # +600 ms re-rank
tail_B = baseline_tail + 250 + 450               # +250 retrieval +450 gen

print("\n════ Option cost / latency comparison ════")
print(f"Option A  $ {cost_A:>5,.0f} / mo   P99 ≈ {tail_A:.0f} ms")
print(f"Option B  $ {cost_B:>5,.0f} / mo   P99 ≈ {tail_B:.0f} ms")

# ──────────────────── Predictive sanity checks ───────────────────────
feat = ["wiki_chunks","conf_chunks","pdf_chunks","min_score"]
X = df[feat].fillna(0)

def ann_weights(mlp):
    # collapse hidden layer to an input-space vector
    W1, W2 = mlp.coefs_
    return (W1 @ W2).flatten()

def bench(label, y):
    Xtr,Xte,ytr,yte = train_test_split(X, y, test_size=.3, stratify=y, random_state=42)
    models = {
        "LogReg": make_pipeline(StandardScaler(),
                                LogisticRegression(max_iter=1_000)),
        "SVM"   : make_pipeline(StandardScaler(),
                                SVC(kernel="linear", probability=True)),   # CHG
        "ANN"   : make_pipeline(StandardScaler(),
                                MLPClassifier((8,), max_iter=1_000,
                                              random_state=42)),
    }

    print(f"\n═ Predicting {label} ═")
    for name, pipe in models.items():
        pipe.fit(Xtr, ytr)
        yhat  = pipe.predict(Xte)
        acc   = accuracy_score(yte, yhat)
        auc   = roc_auc_score(yte, pipe.predict_proba(Xte)[:,1])
        print(f"{name:6}  acc={acc:.2f}  auc={auc:.2f}")

        # ─────────── Feature-weight dump ────────────
        if name == "LogReg":
            w = pipe[-1].coef_[0]
        elif name == "SVM":
            w = pipe[-1].coef_[0]            # linear kernel exposes coef_
        elif name == "ANN":
            w = ann_weights(pipe[-1])
        else:
            w = None

        if w is not None:
            print("  weights:", dict(zip(feat, np.round(w, 3))))

bench("SLOW vs FAST",       df.slow.astype(int))
bench("THUMBS-DOWN vs UP",  df.thumbs_down.astype(int))

print("\n✓ All analysis complete – CSVs & PNGs in ./artifacts/")



════ Latency by source (mean & P99) ════
              mean_absent  mean_present  delta_ms  corr_latency_presence  p99_absent  p99_present
Source                                                                                           
PDF                2319.2        3315.4     996.2                  0.421      3146.0       5133.0
Wiki>0             4125.0        3131.0    -994.0                 -0.178      5080.0       5005.0
Confluence>0       2475.0        3172.8     697.8                  0.125      2598.0       5122.0

──── Fisher exact: ≤ 0.85 retrieval-score chunks (0 vs ≥1) ────
thumbs_down  False  True 
has_low_85               
False           32      3
True            33     13 

            thumbs_down_rate
has_low_85                  
False               0.085714
True                0.282609
Odds ratio = 4.20, Fisher p = 0.0465

──── Pearson correlations with thumbs_up ────
r(thumbs_up, has_pdf) = -0.132  (p=0.2385)
r(thumbs_up, has_wiki) = -0.079  (p=0.4836)
r(thumbs_u