In [2]:
# Train GaussianNB on each fold (one-by-one baseline). Saves per-fold pipelines & OOF preds.
from pathlib import Path
import numpy as np
import pandas as pd
import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Input folds are read from FOLDS_DIR; everything this cell produces for the
# GaussianNB baseline (models, OOF predictions, metrics) is written to SAVE_ROOT.
ROOT = Path("dreaddit_cv_raw_splits")
FOLDS_DIR = ROOT.joinpath("folds_selected")
SAVE_ROOT = Path("Machine learning", "models", "gaussiannb")
SAVE_ROOT.mkdir(parents=True, exist_ok=True)

# Every sub-directory matching fold_* is treated as one CV fold; sorting the
# paths gives a stable processing order.
fold_dirs = sorted(entry for entry in FOLDS_DIR.glob("fold_*") if entry.is_dir())
if len(fold_dirs) == 0:
    raise FileNotFoundError(f"No per-fold directories found in {FOLDS_DIR}. Run folds preparation first.")

# Accumulators filled by the fold loop: one dict per validation row, and one
# summary dict per fold.
oof_rows, fold_summaries = [], []

print("Found fold dirs:", [entry.name for entry in fold_dirs])

# Fit one GaussianNB per fold, score it on that fold's validation split, and
# persist the fitted model together with its out-of-fold (OOF) predictions.
for fd in fold_dirs:
    fold_name = fd.name  # fold_01 etc.
    fold_no = fold_name.split("_")[1]

    # Each split has a feature matrix (.npy) and a companion CSV that supplies
    # the 'label' column (and, for validation, 'orig_index').
    train_X_path = fd / f"fold_{fold_no}_train_selected.npy"
    val_X_path   = fd / f"fold_{fold_no}_val_selected.npy"
    train_csv    = fd / f"fold_{fold_no}_train_selected.csv"
    val_csv      = fd / f"fold_{fold_no}_val_selected.csv"

    if not all(p.exists() for p in (train_X_path, val_X_path, train_csv, val_csv)):
        raise FileNotFoundError(f"Missing files for {fold_name} in {fd}")

    X_tr = np.load(train_X_path)
    X_val = np.load(val_X_path)
    df_tr = pd.read_csv(train_csv)
    df_val = pd.read_csv(val_csv)

    # Features and labels are paired purely by row position, so fail fast with
    # a readable message if the two files of a split disagree on length.
    if X_tr.shape[0] != len(df_tr) or X_val.shape[0] != len(df_val):
        raise ValueError(
            f"Row count mismatch in {fold_name}: "
            f"train X {X_tr.shape[0]} vs csv {len(df_tr)}, "
            f"val X {X_val.shape[0]} vs csv {len(df_val)}"
        )

    y_tr = df_tr['label'].values
    y_val = df_val['label'].values
    idx_val = df_val['orig_index'].values

    print(f"\nFold {fold_no}: X_tr {X_tr.shape}, X_val {X_val.shape}, y_tr counts {np.bincount(y_tr)}, y_val counts {np.bincount(y_val)}")

    # train GaussianNB (no class_weight param for NB)
    clf = GaussianNB()
    clf.fit(X_tr, y_tr)

    # Column 1 of predict_proba is used as the positive-class probability.
    # NOTE(review): this assumes clf.classes_ is [0, 1] -- confirm label encoding.
    probs_val = clf.predict_proba(X_val)[:, 1]
    preds_val = (probs_val >= 0.5).astype(int)

    # per-fold metrics
    acc = accuracy_score(y_val, preds_val)
    prec = precision_score(y_val, preds_val, zero_division=0)
    rec = recall_score(y_val, preds_val, zero_division=0)
    f1 = f1_score(y_val, preds_val, zero_division=0)
    try:
        auc = roc_auc_score(y_val, probs_val)
    except ValueError as exc:
        # AUC cannot be computed for this split (e.g. only one class present).
        # Record NaN but say so, instead of hiding every possible failure.
        print(f" Fold {fold_no}: AUC undefined ({exc}); recording NaN")
        auc = float('nan')

    print(f" Fold {fold_no} metrics -> acc: {acc:.4f}, prec: {prec:.4f}, rec: {rec:.4f}, f1: {f1:.4f}, auc: {auc:.4f}")

    # save fold pipeline (just the classifier here)
    fold_pipe_path = SAVE_ROOT / f"fold_{fold_no}_pipeline.joblib"
    joblib.dump(clf, fold_pipe_path)

    # save OOF preds for this fold
    oof_df = pd.DataFrame({
        "orig_index": idx_val,
        "true_label": y_val,
        "prob_pos": probs_val,
        "pred_label": preds_val
    })
    oof_csv_path = SAVE_ROOT / f"fold_{fold_no}_oof.csv"
    oof_df.to_csv(oof_csv_path, index=False)

    fold_summaries.append({
        "fold": fold_no,
        "train_rows": X_tr.shape[0],
        "val_rows": X_val.shape[0],
        "acc": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc,
        "pipeline_path": str(fold_pipe_path),
        "oof_csv": str(oof_csv_path)
    })

    # collect rows for global OOF (plain Python scalars, one dict per validation row)
    oof_rows.extend(
        {"orig_index": int(idx), "true_label": int(true), "prob_pos": float(prob), "pred_label": int(pred)}
        for idx, true, prob, pred in zip(idx_val, y_val, probs_val, preds_val)
    )

# aggregate OOF
import json  # used only for the metrics dump further down

oof_all = pd.DataFrame(oof_rows).sort_values("orig_index").reset_index(drop=True)

# A row id that shows up in more than one validation split would be counted
# several times in the pooled metrics; report it rather than stay silent.
dup_count = int(oof_all["orig_index"].duplicated().sum())
if dup_count:
    print(f"WARNING: {dup_count} duplicated orig_index value(s) in OOF predictions; pooled metrics count those rows more than once.")

oof_all_path = SAVE_ROOT / "oof_predictions.csv"
oof_all.to_csv(oof_all_path, index=False)

# compute overall OOF metrics on the pooled validation predictions
y_true = oof_all['true_label'].values
y_pred = oof_all['pred_label'].values
y_prob = oof_all['prob_pos'].values

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
try:
    auc = roc_auc_score(y_true, y_prob)
except ValueError as exc:
    # AUC cannot be computed (e.g. only one class present); keep NaN but
    # surface the reason instead of swallowing every exception type.
    print(f"Overall AUC undefined ({exc}); recording NaN")
    auc = float('nan')

metrics = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}
print("\n=== Overall OOF Metrics ===")
print(metrics)

# Save metrics and manifest
metrics_path = SAVE_ROOT / "oof_metrics.json"
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=2)

# One row per fold: sizes, scores and artifact paths.
manifest_df = pd.DataFrame(fold_summaries)
manifest_df.to_csv(SAVE_ROOT / "folds_summary.csv", index=False)

print("\nSaved aggregated OOF ->", oof_all_path.resolve())
print("Saved OOF metrics ->", metrics_path.resolve())
print("Saved per-fold pipelines & OOF CSVs ->", SAVE_ROOT.resolve())
print("\nDone. Paste the printed output here and I will prepare the next model (Logistic Regression) when you tell me to continue.")


Found fold dirs: ['fold_01', 'fold_02', 'fold_03', 'fold_04', 'fold_05']

Fold 01: X_tr (457, 34), X_val (115, 34), y_tr counts [221 236], y_val counts [56 59]
 Fold 01 metrics -> acc: 0.7391, prec: 0.6933, rec: 0.8814, f1: 0.7761, auc: 0.8105

Fold 02: X_tr (457, 34), X_val (115, 34), y_tr counts [221 236], y_val counts [56 59]
 Fold 02 metrics -> acc: 0.6696, prec: 0.6207, rec: 0.9153, f1: 0.7397, auc: 0.7778

Fold 03: X_tr (458, 34), X_val (114, 34), y_tr counts [222 236], y_val counts [55 59]
 Fold 03 metrics -> acc: 0.6140, prec: 0.6000, rec: 0.7627, f1: 0.6716, auc: 0.7498

Fold 04: X_tr (458, 34), X_val (114, 34), y_tr counts [222 236], y_val counts [55 59]
 Fold 04 metrics -> acc: 0.7193, prec: 0.7077, rec: 0.7797, f1: 0.7419, auc: 0.7945

Fold 05: X_tr (458, 34), X_val (114, 34), y_tr counts [222 236], y_val counts [55 59]
 Fold 05 metrics -> acc: 0.6140, prec: 0.6000, rec: 0.7627, f1: 0.6716, auc: 0.6878

=== Overall OOF Metrics ===
{'accuracy': 0.6713286713286714, 'precision

In [3]:
# Train Logistic Regression (class_weight='balanced') on each fold (one-by-one)
from pathlib import Path
import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import json

# Input folds are read from FOLDS_DIR; all Logistic Regression artifacts
# (models, OOF predictions, metrics) are written beneath SAVE_ROOT.
ROOT = Path("dreaddit_cv_raw_splits")
FOLDS_DIR = ROOT.joinpath("folds_selected")
SAVE_ROOT = Path("Machine learning", "models", "logreg")
SAVE_ROOT.mkdir(parents=True, exist_ok=True)

# Collect the fold_* sub-directories in sorted order; abort early when the
# fold preparation step has not produced any.
fold_dirs = sorted(entry for entry in FOLDS_DIR.glob("fold_*") if entry.is_dir())
if len(fold_dirs) == 0:
    raise FileNotFoundError(f"No per-fold directories found in {FOLDS_DIR}. Run folds preparation first.")

# Filled by the fold loop: per-row OOF records and per-fold summary records.
oof_rows, fold_summaries = [], []

print("Found fold dirs:", [entry.name for entry in fold_dirs])

# Seed handed to LogisticRegression so repeated runs of this cell use the same
# solver randomness (scikit-learn documents random_state as used by liblinear).
LOGREG_RANDOM_STATE = 42

# Fit one class-weight-balanced Logistic Regression per fold, score it on the
# fold's validation split, and persist the model plus its OOF predictions.
for fd in fold_dirs:
    fold_name = fd.name  # fold_01 etc.
    fold_no = fold_name.split("_")[1]

    # Each split has a feature matrix (.npy) and a companion CSV that supplies
    # the 'label' column (and, for validation, 'orig_index').
    train_X_path = fd / f"fold_{fold_no}_train_selected.npy"
    val_X_path   = fd / f"fold_{fold_no}_val_selected.npy"
    train_csv    = fd / f"fold_{fold_no}_train_selected.csv"
    val_csv      = fd / f"fold_{fold_no}_val_selected.csv"

    if not all(p.exists() for p in (train_X_path, val_X_path, train_csv, val_csv)):
        raise FileNotFoundError(f"Missing files for {fold_name} in {fd}")

    X_tr = np.load(train_X_path)
    X_val = np.load(val_X_path)
    df_tr = pd.read_csv(train_csv)
    df_val = pd.read_csv(val_csv)

    # Features and labels are paired purely by row position, so fail fast with
    # a readable message if the two files of a split disagree on length.
    if X_tr.shape[0] != len(df_tr) or X_val.shape[0] != len(df_val):
        raise ValueError(
            f"Row count mismatch in {fold_name}: "
            f"train X {X_tr.shape[0]} vs csv {len(df_tr)}, "
            f"val X {X_val.shape[0]} vs csv {len(df_val)}"
        )

    y_tr = df_tr['label'].values
    y_val = df_val['label'].values
    idx_val = df_val['orig_index'].values

    print(f"\nFold {fold_no}: X_tr {X_tr.shape}, X_val {X_val.shape}, y_tr counts {np.bincount(y_tr)}, y_val counts {np.bincount(y_val)}")

    # Logistic Regression with balanced class weights
    clf = LogisticRegression(
        penalty='l2',
        solver='liblinear',
        C=1.0,
        max_iter=2000,
        class_weight='balanced',
        random_state=LOGREG_RANDOM_STATE,
    )
    clf.fit(X_tr, y_tr)

    # Column 1 of predict_proba is used as the positive-class probability.
    # NOTE(review): this assumes clf.classes_ is [0, 1] -- confirm label encoding.
    probs_val = clf.predict_proba(X_val)[:, 1]
    preds_val = (probs_val >= 0.5).astype(int)

    # per-fold metrics
    acc = accuracy_score(y_val, preds_val)
    prec = precision_score(y_val, preds_val, zero_division=0)
    rec = recall_score(y_val, preds_val, zero_division=0)
    f1 = f1_score(y_val, preds_val, zero_division=0)
    try:
        auc = roc_auc_score(y_val, probs_val)
    except ValueError as exc:
        # AUC cannot be computed for this split (e.g. only one class present).
        # Record NaN but say so, instead of hiding every possible failure.
        print(f" Fold {fold_no}: AUC undefined ({exc}); recording NaN")
        auc = float('nan')

    print(f" Fold {fold_no} metrics -> acc: {acc:.4f}, prec: {prec:.4f}, rec: {rec:.4f}, f1: {f1:.4f}, auc: {auc:.4f}")

    # save fold pipeline
    fold_pipe_path = SAVE_ROOT / f"fold_{fold_no}_pipeline.joblib"
    joblib.dump(clf, fold_pipe_path)

    # save per-fold OOF
    oof_df = pd.DataFrame({
        "orig_index": idx_val,
        "true_label": y_val,
        "prob_pos": probs_val,
        "pred_label": preds_val
    })
    oof_csv_path = SAVE_ROOT / f"fold_{fold_no}_oof.csv"
    oof_df.to_csv(oof_csv_path, index=False)

    fold_summaries.append({
        "fold": fold_no,
        "train_rows": X_tr.shape[0],
        "val_rows": X_val.shape[0],
        "acc": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc,
        "pipeline_path": str(fold_pipe_path),
        "oof_csv": str(oof_csv_path)
    })

    # collect OOF rows (plain Python scalars, one dict per validation row)
    oof_rows.extend(
        {"orig_index": int(idx), "true_label": int(true), "prob_pos": float(prob), "pred_label": int(pred)}
        for idx, true, prob, pred in zip(idx_val, y_val, probs_val, preds_val)
    )

# aggregate OOF across folds
oof_all = pd.DataFrame(oof_rows).sort_values("orig_index").reset_index(drop=True)

# A row id that shows up in more than one validation split would be counted
# several times in the pooled metrics; report it rather than stay silent.
dup_count = int(oof_all["orig_index"].duplicated().sum())
if dup_count:
    print(f"WARNING: {dup_count} duplicated orig_index value(s) in OOF predictions; pooled metrics count those rows more than once.")

oof_all_path = SAVE_ROOT / "oof_predictions.csv"
oof_all.to_csv(oof_all_path, index=False)

# compute overall metrics on the pooled validation predictions
y_true = oof_all['true_label'].values
y_pred = oof_all['pred_label'].values
y_prob = oof_all['prob_pos'].values

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
try:
    auc = roc_auc_score(y_true, y_prob)
except ValueError as exc:
    # AUC cannot be computed (e.g. only one class present); keep NaN but
    # surface the reason instead of swallowing every exception type.
    print(f"Overall AUC undefined ({exc}); recording NaN")
    auc = float('nan')

metrics = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}
print("\n=== Overall OOF Metrics ===")
print(metrics)

# Save metrics and manifest
metrics_path = SAVE_ROOT / "oof_metrics.json"
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=2)

# One row per fold: sizes, scores and artifact paths.
manifest_df = pd.DataFrame(fold_summaries)
manifest_df.to_csv(SAVE_ROOT / "folds_summary.csv", index=False)

print("\nSaved aggregated OOF ->", oof_all_path.resolve())
print("Saved OOF metrics ->", metrics_path.resolve())
print("Saved per-fold pipelines & OOF CSVs ->", SAVE_ROOT.resolve())
print("\nDone. Paste the printed output here when complete and I will prepare the next model.")


Found fold dirs: ['fold_01', 'fold_02', 'fold_03', 'fold_04', 'fold_05']

Fold 01: X_tr (457, 34), X_val (115, 34), y_tr counts [221 236], y_val counts [56 59]
 Fold 01 metrics -> acc: 0.6783, prec: 0.7037, rec: 0.6441, f1: 0.6726, auc: 0.8057

Fold 02: X_tr (457, 34), X_val (115, 34), y_tr counts [221 236], y_val counts [56 59]
 Fold 02 metrics -> acc: 0.7217, prec: 0.6957, rec: 0.8136, f1: 0.7500, auc: 0.8208

Fold 03: X_tr (458, 34), X_val (114, 34), y_tr counts [222 236], y_val counts [55 59]
 Fold 03 metrics -> acc: 0.7018, prec: 0.7119, rec: 0.7119, f1: 0.7119, auc: 0.7815

Fold 04: X_tr (458, 34), X_val (114, 34), y_tr counts [222 236], y_val counts [55 59]
 Fold 04 metrics -> acc: 0.7982, prec: 0.7903, rec: 0.8305, f1: 0.8099, auc: 0.8666

Fold 05: X_tr (458, 34), X_val (114, 34), y_tr counts [222 236], y_val counts [55 59]
 Fold 05 metrics -> acc: 0.6930, prec: 0.7069, rec: 0.6949, f1: 0.7009, auc: 0.7676

=== Overall OOF Metrics ===
{'accuracy': 0.7185314685314685, 'precision

In [None]:
# Train Linear SVM (linear kernel) on each fold (one-by-one)
from pathlib import Path
import numpy as np
import pandas as pd
import joblib
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import json

# Input folds are read from FOLDS_DIR; all SVM artifacts (models, OOF
# predictions, metrics) are written beneath SAVE_ROOT.
ROOT = Path("dreaddit_cv_raw_splits")
FOLDS_DIR = ROOT.joinpath("folds_selected")
SAVE_ROOT = Path("Machine learning", "models", "svm")
SAVE_ROOT.mkdir(parents=True, exist_ok=True)

# Collect the fold_* sub-directories in sorted order; abort early when the
# fold preparation step has not produced any.
fold_dirs = sorted(entry for entry in FOLDS_DIR.glob("fold_*") if entry.is_dir())
if len(fold_dirs) == 0:
    raise FileNotFoundError(f"No per-fold directories found in {FOLDS_DIR}. Run folds preparation first.")

# Filled by the fold loop: per-row OOF records and per-fold summary records.
oof_rows, fold_summaries = [], []

print("Found fold dirs:", [entry.name for entry in fold_dirs])

# SVC(probability=True) fits its probability model with an internal, shuffled
# cross-validation; without a fixed seed the probabilities (and every metric
# derived from them) can differ from run to run.
SVM_RANDOM_STATE = 42

# Fit one linear-kernel SVC per fold, score it on the fold's validation split,
# and persist the model plus its OOF predictions.
for fd in fold_dirs:
    fold_name = fd.name  # fold_01 etc.
    fold_no = fold_name.split("_")[1]

    # Each split has a feature matrix (.npy) and a companion CSV that supplies
    # the 'label' column (and, for validation, 'orig_index').
    train_X_path = fd / f"fold_{fold_no}_train_selected.npy"
    val_X_path   = fd / f"fold_{fold_no}_val_selected.npy"
    train_csv    = fd / f"fold_{fold_no}_train_selected.csv"
    val_csv      = fd / f"fold_{fold_no}_val_selected.csv"

    if not all(p.exists() for p in (train_X_path, val_X_path, train_csv, val_csv)):
        raise FileNotFoundError(f"Missing files for {fold_name} in {fd}")

    X_tr = np.load(train_X_path)
    X_val = np.load(val_X_path)
    df_tr = pd.read_csv(train_csv)
    df_val = pd.read_csv(val_csv)

    # Features and labels are paired purely by row position, so fail fast with
    # a readable message if the two files of a split disagree on length.
    if X_tr.shape[0] != len(df_tr) or X_val.shape[0] != len(df_val):
        raise ValueError(
            f"Row count mismatch in {fold_name}: "
            f"train X {X_tr.shape[0]} vs csv {len(df_tr)}, "
            f"val X {X_val.shape[0]} vs csv {len(df_val)}"
        )

    y_tr = df_tr['label'].values
    y_val = df_val['label'].values
    idx_val = df_val['orig_index'].values

    print(f"\nFold {fold_no}: X_tr {X_tr.shape}, X_val {X_val.shape}, y_tr counts {np.bincount(y_tr)}, y_val counts {np.bincount(y_val)}")

    # SVC with linear kernel and probability estimates (class_weight balanced)
    clf = SVC(
        kernel='linear',
        probability=True,
        class_weight='balanced',
        random_state=SVM_RANDOM_STATE,
    )
    clf.fit(X_tr, y_tr)

    # Column 1 of predict_proba is used as the positive-class probability.
    # NOTE(review): this assumes clf.classes_ is [0, 1] -- confirm label encoding.
    # NOTE(review): labels come from thresholding the calibrated probability,
    # which scikit-learn documents may disagree with clf.predict() for SVC.
    probs_val = clf.predict_proba(X_val)[:, 1]
    preds_val = (probs_val >= 0.5).astype(int)

    # per-fold metrics
    acc = accuracy_score(y_val, preds_val)
    prec = precision_score(y_val, preds_val, zero_division=0)
    rec = recall_score(y_val, preds_val, zero_division=0)
    f1 = f1_score(y_val, preds_val, zero_division=0)
    try:
        auc = roc_auc_score(y_val, probs_val)
    except ValueError as exc:
        # AUC cannot be computed for this split (e.g. only one class present).
        # Record NaN but say so, instead of hiding every possible failure.
        print(f" Fold {fold_no}: AUC undefined ({exc}); recording NaN")
        auc = float('nan')

    print(f" Fold {fold_no} metrics -> acc: {acc:.4f}, prec: {prec:.4f}, rec: {rec:.4f}, f1: {f1:.4f}, auc: {auc:.4f}")

    # save fold pipeline
    fold_pipe_path = SAVE_ROOT / f"fold_{fold_no}_pipeline.joblib"
    joblib.dump(clf, fold_pipe_path)

    # save per-fold OOF
    oof_df = pd.DataFrame({
        "orig_index": idx_val,
        "true_label": y_val,
        "prob_pos": probs_val,
        "pred_label": preds_val
    })
    oof_csv_path = SAVE_ROOT / f"fold_{fold_no}_oof.csv"
    oof_df.to_csv(oof_csv_path, index=False)

    fold_summaries.append({
        "fold": fold_no,
        "train_rows": X_tr.shape[0],
        "val_rows": X_val.shape[0],
        "acc": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc,
        "pipeline_path": str(fold_pipe_path),
        "oof_csv": str(oof_csv_path)
    })

    # collect OOF rows (plain Python scalars, one dict per validation row)
    oof_rows.extend(
        {"orig_index": int(idx), "true_label": int(true), "prob_pos": float(prob), "pred_label": int(pred)}
        for idx, true, prob, pred in zip(idx_val, y_val, probs_val, preds_val)
    )

# aggregate OOF across folds
oof_all = pd.DataFrame(oof_rows).sort_values("orig_index").reset_index(drop=True)

# A row id that shows up in more than one validation split would be counted
# several times in the pooled metrics; report it rather than stay silent.
dup_count = int(oof_all["orig_index"].duplicated().sum())
if dup_count:
    print(f"WARNING: {dup_count} duplicated orig_index value(s) in OOF predictions; pooled metrics count those rows more than once.")

oof_all_path = SAVE_ROOT / "oof_predictions.csv"
oof_all.to_csv(oof_all_path, index=False)

# compute overall metrics on the pooled validation predictions
y_true = oof_all['true_label'].values
y_pred = oof_all['pred_label'].values
y_prob = oof_all['prob_pos'].values

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
try:
    auc = roc_auc_score(y_true, y_prob)
except ValueError as exc:
    # AUC cannot be computed (e.g. only one class present); keep NaN but
    # surface the reason instead of swallowing every exception type.
    print(f"Overall AUC undefined ({exc}); recording NaN")
    auc = float('nan')

metrics = {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "auc": auc}
print("\n=== Overall OOF Metrics ===")
print(metrics)

# Save metrics and manifest
metrics_path = SAVE_ROOT / "oof_metrics.json"
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=2)

# One row per fold: sizes, scores and artifact paths.
manifest_df = pd.DataFrame(fold_summaries)
manifest_df.to_csv(SAVE_ROOT / "folds_summary.csv", index=False)

print("\nSaved aggregated OOF ->", oof_all_path.resolve())
print("Saved OOF metrics ->", metrics_path.resolve())
print("Saved per-fold pipelines & OOF CSVs ->", SAVE_ROOT.resolve())
print("\nDone. Paste the printed output here and I will prepare the next model when you say so.")


Found fold dirs: ['fold_01', 'fold_02', 'fold_03', 'fold_04', 'fold_05']

Fold 01: X_tr (457, 34), X_val (115, 34), y_tr counts [221 236], y_val counts [56 59]
