In [6]:
# Cell 1 — Imports & paths
import numpy as np, pandas as pd, joblib, json, os
from pathlib import Path
from sklearn.metrics import (precision_recall_curve, roc_auc_score, average_precision_score,
                             precision_score, recall_score, f1_score, accuracy_score,
                             confusion_matrix, brier_score_loss)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
ROOT = Path("dreaddit_cv_raw_splits")
SEL_DIR = ROOT / "selected_features"
MODEL_ROOT = Path("Machine learning") / "models"
TRAIN_CSV = ROOT / "train_raw_with_clean_text.csv"
TEST_CSV  = ROOT / "test_frozen_raw_with_clean_text.csv"
X_train_sel_path = SEL_DIR / "X_train_fused_selected.npy"
X_test_sel_path  = SEL_DIR / "X_test_fused_selected.npy"
# models to load — change if different
models_to_evaluate = {
    "logreg": MODEL_ROOT / "logreg" / "oof_predictions.csv",
    "svm": MODEL_ROOT / "svm_fast" / "oof_predictions.csv",
    "rf": MODEL_ROOT / "rf_baseline" / "oof_predictions.csv",
    "rf_tuned": MODEL_ROOT / "rf_tuned_corrected" / "rf_tuned_model_corrected.joblib",
    "lgbm": MODEL_ROOT / "lgbm" / "oof_predictions.csv",
    "lgbm_tuned": MODEL_ROOT / "lgbm_tuned_quick" / "lgbm_tuned_test_metrics.json"
}
# outputs
OUT = Path("dreaddit_analysis_outputs"); OUT.mkdir(exist_ok=True)


In [8]:
import pandas as pd
df_test = pd.read_csv(TEST_CSV)
print(df_test.columns.tolist())
df_test.head()


['orig_index', 'social_timestamp', 'social_karma', 'syntax_ari', 'lex_liwc_WC', 'lex_liwc_Analytic', 'lex_liwc_Clout', 'lex_liwc_Authentic', 'lex_liwc_Tone', 'lex_liwc_WPS', 'lex_liwc_Sixltr', 'lex_liwc_Dic', 'lex_liwc_function', 'lex_liwc_pronoun', 'lex_liwc_ppron', 'lex_liwc_i', 'lex_liwc_we', 'lex_liwc_you', 'lex_liwc_shehe', 'lex_liwc_they', 'lex_liwc_ipron', 'lex_liwc_article', 'lex_liwc_prep', 'lex_liwc_auxverb', 'lex_liwc_adverb', 'lex_liwc_conj', 'lex_liwc_negate', 'lex_liwc_verb', 'lex_liwc_adj', 'lex_liwc_compare', 'lex_liwc_interrog', 'lex_liwc_number', 'lex_liwc_quant', 'lex_liwc_affect', 'lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_anx', 'lex_liwc_anger', 'lex_liwc_sad', 'lex_liwc_social', 'lex_liwc_family', 'lex_liwc_friend', 'lex_liwc_female', 'lex_liwc_male', 'lex_liwc_cogproc', 'lex_liwc_insight', 'lex_liwc_cause', 'lex_liwc_discrep', 'lex_liwc_tentat', 'lex_liwc_certain', 'lex_liwc_differ', 'lex_liwc_percept', 'lex_liwc_see', 'lex_liwc_hear', 'lex_liwc_feel', 'lex_

Unnamed: 0,orig_index,social_timestamp,social_karma,syntax_ari,lex_liwc_WC,lex_liwc_Analytic,lex_liwc_Clout,lex_liwc_Authentic,lex_liwc_Tone,lex_liwc_WPS,...,lex_dal_avg_imagery,lex_dal_avg_pleasantness,social_upvote_ratio,social_num_comments,syntax_fk_grade,sentiment,char_len,token_len,label,clean_text
0,685,1493430482,1,0.898088,65,34.3,17.79,99.0,95.12,16.25,...,1.50588,1.89577,0.67,0,1.433824,0.063228,340,82,0,what are you gonna do with that strange lookin...
1,532,1516372493,0,4.484242,91,11.43,18.97,81.48,11.53,15.17,...,1.43846,1.86702,0.47,17,4.655523,0.068519,494,107,1,but what should i say? part of me wants to tel...
2,268,1524175905,2,7.011905,89,41.27,28.71,66.74,1.0,17.8,...,1.41053,1.83995,0.75,9,7.441484,-0.089286,500,98,1,hey guys i have ptsd from years of emotional a...
3,507,1515552336,1,3.404286,40,11.24,77.33,86.07,25.77,8.0,...,1.48108,1.87271,1.0,1,4.26219,0.164286,230,49,1,we had 2 classes together so we spent a few ho...
4,465,1541796675,10,7.428889,75,40.12,21.18,91.75,25.77,15.0,...,1.58095,1.84232,0.99,1,7.626765,0.05,437,87,1,it s really just standard issue big corporatio...


In [9]:
# Cell 2 — load true test labels and predictions from chosen models (example: rf_tuned and logreg)
df_test = pd.read_csv(TEST_CSV)
y_test = df_test['label'].values
texts_test = df_test['clean_text'].astype(str).values

# helper: load OOF or test preds for a model
def load_model_oof(path):
    p = Path(path)
    if not p.exists():
        return None
    df = pd.read_csv(p)
    return df

# Example: load RF baseline OOF (if you want to evaluate OOFs) and tuned RF test predictions (if available)
rf_oof = load_model_oof(MODEL_ROOT / "rf_baseline" / "oof_predictions.csv")
logreg_oof = load_model_oof(MODEL_ROOT / "logreg" / "oof_predictions.csv")

# Load final tuned RF and produce predictions on frozen test
rf_tuned = None
rt_path = MODEL_ROOT / "rf_tuned_corrected" / "rf_tuned_model_corrected.joblib"
if rt_path.exists():
    rf_tuned = joblib.load(rt_path)
    if Path(X_test_sel_path).exists():
        X_test_sel = np.load(X_test_sel_path)
        probs = rf_tuned.predict_proba(X_test_sel)[:,1]
        preds = (probs >= 0.5).astype(int)
        pd.DataFrame({"clean_text": texts_test, "true": y_test, "prob_pos": probs, "pred": preds}).to_csv(OUT/"rf_tuned_test_preds.csv", index=False)
        print("Saved rf_tuned_test_preds.csv")
else:
    print("No tuned RF model found at", rt_path)


Saved rf_tuned_test_preds.csv


In [10]:
# Cell 3 — Core metrics function + compute for rf_tuned (change model variable to evaluate others)
from sklearn.metrics import classification_report, precision_recall_curve, roc_curve
import numpy as np

def compute_core_metrics(y_true, y_pred, y_prob):
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1m = f1_score(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    pr_auc = average_precision_score(y_true, y_prob)  # PR-AUC (AP)
    brier = brier_score_loss(y_true, y_prob)
    return {"accuracy":acc, "precision":prec, "recall":rec, "f1":f1m, "roc_auc":roc_auc, "pr_auc":pr_auc, "brier":brier}

# load rf tuned test preds
preds_df = pd.read_csv(OUT/"rf_tuned_test_preds.csv")
metrics = compute_core_metrics(preds_df["true"].values, preds_df["pred"].values, preds_df["prob_pos"].values)
print("RF tuned test metrics:", metrics)

# classification report + confusion matrix
print("\nClassification report:\n", classification_report(preds_df["true"], preds_df["pred"]))
cm = confusion_matrix(preds_df["true"], preds_df["pred"])
cm_norm = cm.astype("float")/cm.sum(axis=1)[:,None]
print("Confusion matrix (raw):\n", cm)
print("Confusion matrix (normalized):\n", cm_norm)

# save metrics.json
import json
with open(OUT/"rf_tuned_test_metrics.json","w") as f:
    json.dump(metrics, f, indent=2)


RF tuned test metrics: {'accuracy': 0.6923076923076923, 'precision': 0.7083333333333334, 'recall': 0.6891891891891891, 'f1': 0.6986301369863014, 'roc_auc': np.float64(0.7751664708186448), 'pr_auc': np.float64(0.7764052923162905), 'brier': np.float64(0.1952257815851556)}

Classification report:
               precision    recall  f1-score   support

           0       0.68      0.70      0.69        69
           1       0.71      0.69      0.70        74

    accuracy                           0.69       143
   macro avg       0.69      0.69      0.69       143
weighted avg       0.69      0.69      0.69       143

Confusion matrix (raw):
 [[48 21]
 [23 51]]
Confusion matrix (normalized):
 [[0.69565217 0.30434783]
 [0.31081081 0.68918919]]


In [11]:
# Cell 4 — ROC, Precision-Recall, Calibration curve, Brier
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

y_true = preds_df["true"].values
y_prob = preds_df["prob_pos"].values
y_pred = preds_df["pred"].values

# ROC
fpr, tpr, _ = roc_curve(y_true, y_prob)
plt.figure(figsize=(6,4)); plt.plot(fpr, tpr); plt.plot([0,1],[0,1],'--',alpha=0.5)
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC Curve"); plt.grid(True)
plt.savefig(OUT/"rf_tuned_roc.png"); plt.close()

# PR
precision, recall, _ = precision_recall_curve(y_true, y_prob)
plt.figure(figsize=(6,4)); plt.plot(recall, precision)
plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title("Precision-Recall Curve")
plt.savefig(OUT/"rf_tuned_pr.png"); plt.close()

# Calibration curve
prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=10)
plt.figure(figsize=(6,4)); plt.plot(prob_pred, prob_true, marker='o'); plt.plot([0,1],[0,1],'--',alpha=0.5)
plt.xlabel("Mean predicted probability"); plt.ylabel("Fraction of positives"); plt.title("Calibration curve")
plt.savefig(OUT/"rf_tuned_calibration.png"); plt.close()

print("Saved ROC, PR, calibration plots to", OUT)


Saved ROC, PR, calibration plots to dreaddit_analysis_outputs


In [12]:
# Cell 5 — Bootstrap CIs for F1, ROC-AUC, PR-AUC (1000 resamples)
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
rng = np.random.default_rng(42)

def bootstrap_ci(y_true, y_prob, n_boot=1000, alpha=0.05):
    n = len(y_true)
    f1_list, roc_list, pr_list = [], [], []
    for i in range(n_boot):
        idx = rng.integers(0, n, n)
        y_t = y_true[idx]; y_p = y_prob[idx]
        # choose threshold 0.5 for f1 (could optimize threshold per sample)
        y_hat = (y_p >= 0.5).astype(int)
        f1_list.append(f1_score(y_t, y_hat))
        try:
            roc_list.append(roc_auc_score(y_t, y_p))
        except:
            roc_list.append(np.nan)
        try:
            pr_list.append(average_precision_score(y_t, y_p))
        except:
            pr_list.append(np.nan)
    def ci(arr):
        arr = np.array(arr)
        lo = np.nanpercentile(arr, 100*alpha/2)
        hi = np.nanpercentile(arr, 100*(1-alpha/2))
        return float(np.nanmedian(arr)), float(lo), float(hi)
    return {"f1":ci(f1_list), "roc_auc":ci(roc_list), "pr_auc":ci(pr_list)}

ci = bootstrap_ci(y_true, y_prob, n_boot=1000)
print("Bootstrap CIs (median, lo, hi):", ci)
with open(OUT/"rf_tuned_bootstrap_cis.json","w") as f:
    json.dump(ci, f, indent=2)


Bootstrap CIs (median, lo, hi): {'f1': (0.6973684210526315, 0.599963503649635, 0.7755102040816326), 'roc_auc': (0.7726642667215938, 0.6952148223871304, 0.843138140798488), 'pr_auc': (0.7821423428898415, 0.66716011744337, 0.8698809417683042)}


In [19]:
# Cell 7 — LR coefficient analysis (train a TF-IDF-only LR model for interpretability)
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# load train/test raw CSVs with clean_text
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

tfidf = joblib.load(ROOT/"tfidf"/"tfidf_vectorizer.joblib")
svd = joblib.load(ROOT/"svd"/"tfidf_svd_200.joblib")

# Train TF-IDF only LR (no SVD) for token-level interpretability
vec = TfidfVectorizer(ngram_range=(1,2), max_features=5000, min_df=2)
X_train_t = vec.fit_transform(train_df['clean_text'].astype(str).values)
X_test_t = vec.transform(test_df['clean_text'].astype(str).values)
lr_tok = LogisticRegression(class_weight='balanced', max_iter=2000, solver='liblinear')
lr_tok.fit(X_train_t, train_df['label'].values)

# save
joblib.dump(lr_tok, OUT/"tfidf_lr.joblib")
joblib.dump(vec, OUT/"tfidf_tok_vectorizer.joblib")

# coefficients: top positive and negative tokens
coef = lr_tok.coef_[0]
feat_names = np.array(vec.get_feature_names_out())
top_pos = feat_names[np.argsort(coef)[-30:]][::-1]
top_neg = feat_names[np.argsort(coef)[:30]]
print("Top positive tokens (stress):", top_pos[:20])
print("Top negative tokens (non-stress):", top_neg[:20])


Top positive tokens (stress): ['my' 'me' 'anxiety' 'feel like' 'don know' 'and' 'do' 'anxious' 'like'
 'feel' 'every' 'hard' 'now' 'know' 'am' 'fucking' 'panic' 'my anxiety'
 'myself' 'im']
Top negative tokens (non-stress): ['you' 'for' 'we' 'your' 'she' 'url' 'their' 'first' 'would' 'more'
 'pretty' 'post' 'share' 'if you' 'them' 'homeless' 'who' 'met' 'thanks'
 'together']


In [20]:
# Cell 8 — SHAP (global + local) for RF tuned (tree explainer)
# Requires shap package; install if missing: pip install shap
import shap
rf = joblib.load(rt_path)  # rf_tuned
X_train = np.load(SEL_DIR/"X_train_fused_selected.npy")
explainer = shap.TreeExplainer(rf)
shap_vals = explainer.shap_values(X_train)  # returns list, 2 classes
# global summary
shap.summary_plot(shap_vals[1], X_train, show=False)
plt.savefig(OUT/"shap_summary_rf.png"); plt.close()
# local explanation for first test instance
X_test = np.load(SEL_DIR/"X_test_fused_selected.npy")
shap.force_plot(explainer.expected_value[1], shap_vals[1][0,:], X_train[0,:], matplotlib=True, show=False)
plt.savefig(OUT/"shap_force_sample0.png"); plt.close()

print("Saved SHAP plots to", OUT)


AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.

In [21]:
# Cell 9 — LIME token-level interpretation for TF-IDF LR model
# pip install lime
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=["non-stress","stress"])
predict_fn = lambda texts: lr_tok.predict_proba(vec.transform(texts))
i = 0
exp = explainer.explain_instance(test_df['clean_text'].iloc[i], predict_fn, num_features=10)
print("LIME explanation for sample 0:\n", exp.as_list())
# save for multiple samples
lime_out = []
for i in range(min(100, len(test_df))):
    txt = test_df['clean_text'].iloc[i]
    exp = explainer.explain_instance(txt, predict_fn, num_features=10)
    lime_out.append({"idx": i, "text": txt, "explanation": exp.as_list()})
pd.DataFrame(lime_out).to_csv(OUT/"lime_test_explanations.csv", index=False)
print("Saved LIME explanations to", OUT/"lime_test_explanations.csv")


ModuleNotFoundError: No module named 'lime'

In [22]:
# Cell 10 — extract FP & FN from tuned RF test preds; save explanations (SHAP + LIME snippets)
preds = pd.read_csv(OUT/"rf_tuned_test_preds.csv")
# find FP, FN
fp = preds[(preds['true']==0) & (preds['pred']==1)].copy()
fn = preds[(preds['true']==1) & (preds['pred']==0)].copy()
print("FP count:", len(fp), "FN count:", len(fn))

# sample 50 each (or fewer if not available)
fp_sample = fp.sample(n=min(50,len(fp)), random_state=42)
fn_sample = fn.sample(n=min(50,len(fn)), random_state=42)

# For each, compute shap values (local) and LIME explanations using lr_tok/vec
# SHAP local: use explainer from earlier and X_test array
X_test = np.load(SEL_DIR/"X_test_fused_selected.npy")
shap_vals_test = explainer.shap_values(X_test)[1]  # class 1 contributions
def get_shap_for_index(orig_idx):
    # find row index in test array corresponding to orig_index (orig_index stored in test_csv)
    tdf = pd.read_csv(TEST_CSV)
    pos = tdf.reset_index().set_index('orig_index').loc[orig_idx]['index']
    return shap_vals_test[pos]

# collect
rows = []
for df_sample, label in [(fp_sample, "FP"), (fn_sample, "FN")]:
    for _, r in df_sample.iterrows():
        orig_idx = int(r['orig_index'])
        prob = float(r['prob_pos']); pred = int(r['pred']); true = int(r['true'])
        txt = r['text']
        # LIME tokens
        lime_exp = explainer.explain_instance(txt, predict_fn, num_features=6)
        lime_list = lime_exp.as_list()
        # SHAP local vector
        try:
            svals = get_shap_for_index(orig_idx).tolist()
        except Exception:
            svals = []
        rows.append({
            "orig_index": orig_idx, "type": label, "text": txt, "true": true, "pred": pred, "prob": prob,
            "lime": str(lime_list), "shap_top": str(sorted(enumerate(svals), key=lambda x: -abs(x[1]))[:10])
        })
out_df = pd.DataFrame(rows)
out_df.to_csv(OUT/"error_analysis_fp_fn_50_50.csv", index=False)
print("Saved error analysis CSV to", OUT/"error_analysis_fp_fn_50_50.csv")


FP count: 21 FN count: 23


KeyError: 'orig_index'

In [None]:
# Cell 11 — Ablation orchestration
# We will run per-configuration training on unique train/test selected arrays and record test metrics.
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# helper to evaluate a pipeline given X_train,y_train,X_test,y_test
def eval_model(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    probs = clf.predict_proba(X_test)[:,1]
    preds = (probs >= 0.5).astype(int)
    return compute_core_metrics(y_test, preds, probs)

# Configs to try:
configs = [
    {"name":"tfidf_only","use_tfidf":True,"use_lex":False,"svd_dim":200},
    {"name":"lex_only","use_tfidf":False,"use_lex":True,"svd_dim":0},
    {"name":"fused_selected","use_tfidf":True,"use_lex":True,"svd_dim":200},
    {"name":"fused_no_selection","use_tfidf":True,"use_lex":True,"svd_dim":200,"selection":False}
]

results = []
# load artifacts/vec/svd/lex scaler etc
tfidf = joblib.load(ROOT/"tfidf"/"tfidf_vectorizer.joblib")
svd = joblib.load(ROOT/"svd"/"tfidf_svd_200.joblib")
imputer = joblib.load(ROOT/"lexical"/"lex_imputer.joblib")
scaler = joblib.load(ROOT/"lexical"/"lex_scaler.joblib")
selector = joblib.load(ROOT/"selected_features"/"selector_L1.joblib")
lex_cols = pd.read_csv(ROOT/"lexical"/"lexical_columns_list.csv", header=None)[0].tolist()

train_master = pd.read_csv(TRAIN_CSV); test_master = pd.read_csv(TEST_CSV)
clean_train = train_master['clean_text'].astype(str).values
clean_test  = test_master['clean_text'].astype(str).values

# TF-IDF -> SVD transforms
X_tfidf_train = tfidf.transform(clean_train); X_tfidf_test = tfidf.transform(clean_test)
X_svd_train = svd.transform(X_tfidf_train); X_svd_test = svd.transform(X_tfidf_test)

# lexical matrices
lex_train = train_master[lex_cols].fillna(0).values
lex_test = test_master[lex_cols].fillna(0).values
lex_train = scaler.transform(imputer.transform(lex_train))
lex_test = scaler.transform(imputer.transform(lex_test))

for cfg in configs:
    if cfg.get("use_tfidf") and cfg.get("use_lex"):
        Xtr = np.hstack([X_svd_train, lex_train])
        Xte = np.hstack([X_svd_test, lex_test])
    elif cfg.get("use_tfidf"):
        Xtr, Xte = X_svd_train, X_svd_test
    else:
        Xtr, Xte = lex_train, lex_test

    # selection ON/OFF
    if cfg.get("selection", True):
        Xtr_sel = selector.transform(Xtr)
        Xte_sel = selector.transform(Xte)
    else:
        Xtr_sel, Xte_sel = Xtr, Xte

    # baseline classifier for ablation: Logistic Regression (simple) and RF for non-linear
    lr = LogisticRegression(class_weight='balanced', max_iter=2000, solver='liblinear')
    rf = RandomForestClassifier(n_estimators=200, max_depth=12, class_weight='balanced', n_jobs=-1, random_state=42)

    lr_metrics = eval_model(lr, Xtr_sel, train_master['label'].values, Xte_sel, test_master['label'].values)
    rf_metrics = eval_model(rf, Xtr_sel, train_master['label'].values, Xte_sel, test_master['label'].values)

    results.append({"config":cfg["name"], "lr":lr_metrics, "rf":rf_metrics})
    print("Completed config:", cfg["name"])

pd.DataFrame(results).to_json(OUT/"ablation_results.json", orient='records', indent=2)
print("Saved ablation results to", OUT/"ablation_results.json")
