# Phase 3: OOF Ensemble + Threshold Optimization (Macro-F1)

This notebook blends model probabilities from:
- DistilBERT outputs (`oof_probs_distilbert.csv`, `test_probs_distilbert.csv`)
- TF-IDF + Logistic outputs (`oof_probs.csv`, `test_probs.csv`)

Then, using the out-of-fold (OOF) predictions, it optimizes:
1) per-label blend weights,
2) per-label decision thresholds,
3) an optional `non_ESG` consistency rule,

and exports the final submission.

## 1) Import Dependencies

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

## 2) Configure Parameters

In [None]:
# Target columns, in the column order used by every probability matrix.
LABELS = ['E', 'S', 'G', 'non_ESG']
ID_COL = 'id'

# Project root: the first of '.' and '..' that contains data_set/;
# falls back to '.' when neither does.
ROOT = next(
    (Path(base) for base in ('.', '..') if (Path(base) / 'data_set').exists()),
    Path('.'),
)

DATA_DIR, OUT_DIR = ROOT / 'data_set', ROOT / 'outputs'
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Competition files.
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH = DATA_DIR / 'test.csv'
SAMPLE_SUB_PATH = DATA_DIR / 'sample_submission.csv'

# Per-model probability files (OOF = out-of-fold on train, TEST = test set).
OOF_BERT_PATH = OUT_DIR / 'oof_probs_distilbert.csv'
OOF_TFIDF_PATH = OUT_DIR / 'oof_probs.csv'
TEST_BERT_PATH = OUT_DIR / 'test_probs_distilbert.csv'
TEST_TFIDF_PATH = OUT_DIR / 'test_probs.csv'

# Search grids. ALPHA_GRID holds candidate DistilBERT weights (TF-IDF gets 1 - alpha);
# RULE_GRID holds candidate max(E, S, G) probabilities above which non_ESG is forced to 0.
ALPHA_GRID = np.round(np.arange(0.0, 1.0001, 0.05), 4)
RULE_GRID = np.round(np.arange(0.30, 0.9001, 0.02), 4)
THRESH_MIN = 0.05
THRESH_MAX = 0.95
THRESH_STEP = 0.01
ENABLE_RULE_SEARCH = True

print('ROOT:', ROOT.resolve())

## 3) Load Input Data

In [None]:
# Print the presence of every input first so all missing files are visible
# before the first assertion stops the cell.
for p in [TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH, OOF_BERT_PATH, TEST_BERT_PATH, OOF_TFIDF_PATH, TEST_TFIDF_PATH]:
    print(f'{p}:', p.exists())

assert TRAIN_PATH.exists(), 'Missing train.csv'
assert TEST_PATH.exists(), 'Missing test.csv'
assert SAMPLE_SUB_PATH.exists(), 'Missing sample_submission.csv'
assert OOF_BERT_PATH.exists(), 'Missing outputs/oof_probs_distilbert.csv (run phase2 first)'
assert TEST_BERT_PATH.exists(), 'Missing outputs/test_probs_distilbert.csv (run phase2 first)'
assert OOF_TFIDF_PATH.exists(), 'Missing outputs/oof_probs.csv (run TF-IDF baseline first)'
assert TEST_TFIDF_PATH.exists(), 'Missing outputs/test_probs.csv (run TF-IDF baseline first)'

train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample_sub_df = pd.read_csv(SAMPLE_SUB_PATH)

y_true = train_df[LABELS].values.astype(int)

oof_bert = pd.read_csv(OOF_BERT_PATH)
test_bert = pd.read_csv(TEST_BERT_PATH)
oof_tfidf = pd.read_csv(OOF_TFIDF_PATH)
test_tfidf = pd.read_csv(TEST_TFIDF_PATH)


def _align_to_reference(ref_df, prob_df):
    """Return ``prob_df`` reordered to follow the id order of ``ref_df``.

    ``validate='many_to_one'`` makes pandas raise ``MergeError`` when
    ``prob_df`` repeats an id; without it a left merge would silently emit
    extra rows and shift every later row out of line with the targets.
    """
    return ref_df[[ID_COL]].merge(prob_df, on=ID_COL, how='left', validate='many_to_one')


# Align by id to avoid ordering mismatch. Alignment is skipped (row order is
# trusted as-is) when any of the frames involved lacks the id column.
if all(ID_COL in frame.columns for frame in (train_df, oof_bert, oof_tfidf)):
    oof_bert = _align_to_reference(train_df, oof_bert)
    oof_tfidf = _align_to_reference(train_df, oof_tfidf)

if all(ID_COL in frame.columns for frame in (test_df, test_bert, test_tfidf)):
    test_bert = _align_to_reference(test_df, test_bert)
    test_tfidf = _align_to_reference(test_df, test_tfidf)

prob_cols = [f'{label}_prob' for label in LABELS]

for source_name, prob_frame, ref_frame in [
    ('oof_bert', oof_bert, train_df),
    ('oof_tfidf', oof_tfidf, train_df),
    ('test_bert', test_bert, test_df),
    ('test_tfidf', test_tfidf, test_df),
]:
    for col_name in prob_cols:
        assert col_name in prob_frame.columns, f'{source_name} is missing column {col_name}'
    # Without an id column nothing above guarantees matching row counts.
    assert len(prob_frame) == len(ref_frame), (
        f'{source_name} has {len(prob_frame)} rows, expected {len(ref_frame)}'
    )
    # A left merge leaves NaN for ids absent from the probability file. NaN
    # compares False against every threshold, so such rows would silently
    # become all-zero predictions.
    n_missing = int(prob_frame[prob_cols].isna().any(axis=1).sum())
    assert n_missing == 0, (
        f'{source_name} has {n_missing} rows with missing probabilities '
        '(ids absent from the file or NaN values in it)'
    )

oof_prob_bert = oof_bert[prob_cols].values.astype(float)
oof_prob_tfidf = oof_tfidf[prob_cols].values.astype(float)
test_prob_bert = test_bert[prob_cols].values.astype(float)
test_prob_tfidf = test_tfidf[prob_cols].values.astype(float)

print('Loaded OOF/Test probability matrices.')
print('OOF shape:', oof_prob_bert.shape, 'Test shape:', test_prob_bert.shape)

## 4) Implement Core Logic

In [None]:
def macro_f1(y_true, y_pred):
    """Compute the binary F1 of every column and their unweighted mean.

    Both arguments are 2-D arrays with one column per label. ``f1_score`` is
    called once per column with ``zero_division=0``.

    Returns:
        ``(mean_f1, per_column_f1)`` where ``mean_f1`` is a Python float and
        ``per_column_f1`` is a list in column order.
    """
    per_column = []
    for col in range(y_true.shape[1]):
        per_column.append(f1_score(y_true[:, col], y_pred[:, col], zero_division=0))
    return float(np.mean(per_column)), per_column


def tune_thresholds(y_true, y_prob, tmin=0.05, tmax=0.95, step=0.01):
    """Grid-search, independently for each label, the cut-off that maximizes F1.

    Args:
        y_true: 2-D array of 0/1 targets, one column per entry of ``LABELS``.
        y_prob: 2-D array of scores with the same column layout.
        tmin: Smallest candidate threshold (inclusive).
        tmax: Largest candidate threshold (inclusive up to float tolerance).
        step: Spacing between candidates.

    Returns:
        Dict mapping each label in ``LABELS`` to its best threshold, rounded
        to 4 decimals. On ties the lowest threshold wins; if the grid is
        empty every label gets 0.5.
    """
    # np.arange with a float step produces values such as 0.060000000000000005.
    # Round the candidates *before* scoring so the threshold that is evaluated
    # is exactly the one returned (and later applied); otherwise a probability
    # sitting between the raw and the rounded value is classified differently
    # during tuning and at prediction time.
    grid = sorted({round(float(t), 4) for t in np.arange(tmin, tmax + 1e-12, step)})
    best = {}
    for i, label in enumerate(LABELS):
        best_t, best_f1 = 0.5, -1.0
        for t in grid:
            pred = (y_prob[:, i] >= t).astype(int)
            f1 = f1_score(y_true[:, i], pred, zero_division=0)
            # Strict '>' keeps the first (lowest) threshold among equal scores.
            if f1 > best_f1:
                best_f1 = f1
                best_t = t
        best[label] = round(best_t, 4)
    return best


def apply_thresholds(y_prob, thresholds):
    """Binarize a probability matrix with one cut-off per label.

    Column ``k`` of ``y_prob`` is compared (``>=``) against
    ``thresholds[LABELS[k]]``. The result is an integer array shaped like
    ``y_prob`` holding 1 where the probability reaches its cut-off, else 0.
    """
    binary = np.zeros_like(y_prob, dtype=int)
    for col, name in enumerate(LABELS):
        cutoff = thresholds[name]
        # Boolean mask is cast to 0/1 on assignment into the int array.
        binary[:, col] = y_prob[:, col] >= cutoff
    return binary


def apply_non_esg_rule_with_confidence(prob_matrix, pred_matrix, trigger=0.5):
    """Clear the prediction in column 3 wherever columns 0-2 look confident.

    For every row whose largest probability among the first three columns of
    ``prob_matrix`` is at least ``trigger``, column 3 of the returned
    prediction matrix is set to 0. ``pred_matrix`` itself is not modified;
    a copy is returned.
    """
    confident_rows = np.max(prob_matrix[:, :3], axis=1) >= trigger
    adjusted = pred_matrix.copy()
    adjusted[:, 3] = np.where(confident_rows, 0, adjusted[:, 3])
    return adjusted


# Step A: for every label pick the DistilBERT weight (alpha) whose blend gives
# the best OOF F1 at a fixed 0.5 cut-off; TF-IDF receives weight 1 - alpha.
alpha_per_label = {}
blended_oof = np.zeros_like(oof_prob_bert)
blended_test = np.zeros_like(test_prob_bert)

for col, name in enumerate(LABELS):
    bert_col = oof_prob_bert[:, col]
    tfidf_col = oof_prob_tfidf[:, col]
    target_col = y_true[:, col]

    chosen_alpha, top_f1 = 0.5, -1.0
    for weight in ALPHA_GRID:
        mixed = weight * bert_col + (1.0 - weight) * tfidf_col
        score = f1_score(target_col, (mixed >= 0.5).astype(int), zero_division=0)
        # Strict '>' keeps the smallest alpha among equally good candidates.
        if score > top_f1:
            top_f1, chosen_alpha = score, float(weight)

    alpha_per_label[name] = round(chosen_alpha, 4)
    blended_oof[:, col] = chosen_alpha * bert_col + (1.0 - chosen_alpha) * tfidf_col
    blended_test[:, col] = (
        chosen_alpha * test_prob_bert[:, col] + (1.0 - chosen_alpha) * test_prob_tfidf[:, col]
    )

print('Best alpha per label (DistilBERT weight):', alpha_per_label)

# Step B: tune one threshold per label on the blended OOF probabilities and
# binarize both the OOF and the test blends with them.
thresholds = tune_thresholds(y_true, blended_oof, THRESH_MIN, THRESH_MAX, THRESH_STEP)
oof_pred = apply_thresholds(blended_oof, thresholds)
test_pred = apply_thresholds(blended_test, thresholds)

macro_base, per_label_f1 = macro_f1(y_true, oof_pred)
print('Macro-F1 after blend + thresholds:', round(macro_base, 6))

# Step C: optional non_ESG rule search. The rule is kept only if some trigger
# strictly improves the OOF macro-F1 obtained in Step B.
rule_applied = False
best_rule_trigger = None
macro_after_rule = macro_base

if ENABLE_RULE_SEARCH:
    top_score = macro_base
    top_oof_pred = oof_pred

    for trig in RULE_GRID:
        trig = float(trig)
        candidate = apply_non_esg_rule_with_confidence(blended_oof, oof_pred, trigger=trig)
        candidate_score, _ = macro_f1(y_true, candidate)
        if candidate_score > top_score:
            top_score = candidate_score
            top_oof_pred = candidate
            best_rule_trigger = trig

    # A trigger is recorded only when it beat the Step B score.
    if best_rule_trigger is not None:
        rule_applied = True
        macro_after_rule = top_score
        oof_pred = top_oof_pred
        test_pred = apply_non_esg_rule_with_confidence(
            blended_test, test_pred, trigger=best_rule_trigger
        )

print('Rule applied:', rule_applied, '| best trigger:', best_rule_trigger)
print('Final OOF macro-F1:', round(macro_after_rule, 6))

## 5) Run Validation Checks

In [None]:
# Shape and key sanity checks on the ensemble outputs.
for blended, frame in ((blended_oof, train_df), (blended_test, test_df)):
    assert blended.shape == (len(frame), len(LABELS))
for per_label_map in (alpha_per_label, thresholds):
    assert set(per_label_map.keys()) == set(LABELS)

# Re-score the final OOF predictions (after the optional rule) for reporting.
final_macro, final_label_scores = macro_f1(y_true, oof_pred)
print('Checks passed.')
print('Final macro-F1:', round(final_macro, 6))
print('Final per-label F1:', {name: round(float(final_label_scores[k]), 6) for k, name in enumerate(LABELS)})

## 6) Visualize Key Outputs

In [None]:
# One row per label: blend weight, tuned threshold and final OOF F1.
summary_df = pd.DataFrame(
    [
        {
            'label': name,
            'alpha_distilbert': alpha_per_label[name],
            'threshold': thresholds[name],
            'f1_oof': float(final_label_scores[k]),
        }
        for k, name in enumerate(LABELS)
    ],
    columns=['label', 'alpha_distilbert', 'threshold', 'f1_oof'],
)

display(summary_df)

# Side-by-side bar charts of the two tuned quantities.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [('alpha_distilbert', 'Blend Weights'), ('threshold', 'Tuned Thresholds')]
for ax, (column, title) in zip(axes, panels):
    summary_df.plot(kind='bar', x='label', y=column, ax=ax, legend=False, title=title)
plt.tight_layout()
plt.show()

## 7) Export Results

In [None]:
def _id_only_frame(source_df):
    """Start an output frame holding the id column of ``source_df`` when it has
    one, otherwise an empty frame indexed 0..len(source_df)-1."""
    if ID_COL in source_df.columns:
        return pd.DataFrame({ID_COL: source_df[ID_COL].values})
    return pd.DataFrame(index=np.arange(len(source_df)))


# Submission: sample_submission layout, test ids, binary predictions per label.
submission = sample_sub_df.copy()
if ID_COL in submission.columns and ID_COL in test_df.columns:
    submission[ID_COL] = test_df[ID_COL].values
for col, name in enumerate(LABELS):
    submission[name] = test_pred[:, col].astype(int)

# Blended probabilities, saved for inspection or later stacking.
oof_prob_out = _id_only_frame(train_df)
test_prob_out = _id_only_frame(test_df)
for col, name in enumerate(LABELS):
    prob_col = f'{name}_prob'
    oof_prob_out[prob_col] = blended_oof[:, col]
    test_prob_out[prob_col] = blended_test[:, col]

# JSON report describing what was searched and what was chosen.
search_info = {
    'alpha_grid': [float(a) for a in ALPHA_GRID.tolist()],
    'threshold_grid': [THRESH_MIN, THRESH_MAX, THRESH_STEP],
    'rule_grid': [float(r) for r in RULE_GRID.tolist()] if ENABLE_RULE_SEARCH else None,
}
source_info = {
    'oof_distilbert': str(OOF_BERT_PATH),
    'oof_tfidf': str(OOF_TFIDF_PATH),
    'test_distilbert': str(TEST_BERT_PATH),
    'test_tfidf': str(TEST_TFIDF_PATH),
}
report = {
    'labels': LABELS,
    'alpha_per_label_distilbert_weight': {k: float(v) for k, v in alpha_per_label.items()},
    'thresholds': {k: float(v) for k, v in thresholds.items()},
    'macro_f1_oof_before_rule': float(macro_base),
    'macro_f1_oof_final': float(final_macro),
    'per_label_f1_oof_final': {name: float(final_label_scores[k]) for k, name in enumerate(LABELS)},
    'non_esg_rule_applied': bool(rule_applied),
    'non_esg_rule_trigger': float(best_rule_trigger) if best_rule_trigger is not None else None,
    'search': search_info,
    'sources': source_info,
}

submission_path = OUT_DIR / 'submission_ensemble_phase3.csv'
report_path = OUT_DIR / 'ensemble_phase3_report.json'
oof_path = OUT_DIR / 'oof_probs_ensemble_phase3.csv'
test_prob_path = OUT_DIR / 'test_probs_ensemble_phase3.csv'

for frame, target in (
    (submission, submission_path),
    (oof_prob_out, oof_path),
    (test_prob_out, test_prob_path),
):
    frame.to_csv(target, index=False)
report_path.write_text(json.dumps(report, indent=2), encoding='utf-8')

print('Saved:')
for written in (submission_path, report_path, oof_path, test_prob_path):
    print(written)
submission.head()