# Phase 4: 3-Model Ensemble (TF-IDF + DistilBERT + RoBERTa)

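This notebook searches per-label blend weights for the three models and tunes per-label decision thresholds on out-of-fold (OOF) probabilities, optionally applies a confidence-based `non_ESG` override rule, and then exports the final submission.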

## 1) Import Dependencies

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

## 2) Configure Parameters

In [None]:
# Target label columns; every probability matrix in this notebook uses this column order.
LABELS = ['E', 'S', 'G', 'non_ESG']
ID_COL = 'id'

# Project root: the first of '.', '..' that contains a `data_set` directory,
# falling back to the working directory when neither does.
ROOT = next(
    (Path(base) for base in ('.', '..') if (Path(base) / 'data_set').exists()),
    Path('.'),
)

DATA_DIR = ROOT / 'data_set'
OUT_DIR = ROOT / 'outputs'
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Competition inputs.
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH = DATA_DIR / 'test.csv'
SAMPLE_SUB_PATH = DATA_DIR / 'sample_submission.csv'

# Per-model probability files (out-of-fold on train, plus test) read from OUT_DIR.
OOF_TFIDF = OUT_DIR / 'oof_probs.csv'
TEST_TFIDF = OUT_DIR / 'test_probs.csv'
OOF_DISTIL = OUT_DIR / 'oof_probs_distilbert.csv'
TEST_DISTIL = OUT_DIR / 'test_probs_distilbert.csv'
OOF_ROBERTA = OUT_DIR / 'oof_probs_roberta.csv'
TEST_ROBERTA = OUT_DIR / 'test_probs_roberta.csv'

# Search grids: blend-weight step, decision-threshold range, rule-trigger candidates.
WEIGHT_STEP = 0.1
THRESH_MIN = 0.05
THRESH_MAX = 0.95
THRESH_STEP = 0.01
RULE_GRID = np.round(np.arange(0.30, 0.9001, 0.02), 4)
ENABLE_RULE_SEARCH = True

## 3) Load Input Data

In [None]:
# Print the presence of every required input first, so a failing assert below
# can be diagnosed from the listing.
required_inputs = (
    TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH,
    OOF_TFIDF, TEST_TFIDF,
    OOF_DISTIL, TEST_DISTIL,
    OOF_ROBERTA, TEST_ROBERTA,
)
for input_path in required_inputs:
    print(f'{input_path}:', input_path.exists())

assert all(path.exists() for path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH))
assert all(path.exists() for path in (OOF_TFIDF, TEST_TFIDF)), 'Run Phase 1 first'
assert all(path.exists() for path in (OOF_DISTIL, TEST_DISTIL)), 'Run DistilBERT Phase 2 first'
assert all(path.exists() for path in (OOF_ROBERTA, TEST_ROBERTA)), 'Run RoBERTa Phase 2 first'

train_df, test_df, sample_sub_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)
)
# Ground-truth label matrix, columns in LABELS order.
y_true = train_df[LABELS].values.astype(int)


def load_prob_pair(oof_path, test_path):
    """Load one model's OOF and test probabilities as arrays aligned to train/test.

    When both the reference frame (``train_df`` / ``test_df``) and the
    probability file carry ``ID_COL``, rows are re-ordered to the reference
    frame by a left merge on that column; otherwise the file's row order is
    used as-is.

    Parameters
    ----------
    oof_path, test_path : path-like
        CSV files with one ``<label>_prob`` column per entry of ``LABELS``
        and, optionally, an ``ID_COL`` column.

    Returns
    -------
    tuple of (ndarray, ndarray)
        ``(oof_prob, test_prob)`` as float arrays with columns in ``LABELS``
        order and one row per row of ``train_df`` / ``test_df``.

    Raises
    ------
    AssertionError
        If a required ``<label>_prob`` column is missing.
    ValueError
        If the aligned row count differs from the reference frame, or any
        probability is NaN (e.g. an id missing from the probability file).
    """
    prob_cols = [f'{label}_prob' for label in LABELS]

    def _aligned_probs(ref_df, prob_df, path):
        # Validate one probability frame against its reference frame and return the matrix.
        missing_cols = [col for col in prob_cols if col not in prob_df.columns]
        assert not missing_cols, f'{path}: missing probability columns {missing_cols}'

        if ID_COL in ref_df.columns and ID_COL in prob_df.columns:
            prob_df = ref_df[[ID_COL]].merge(prob_df, on=ID_COL, how='left')

        # Duplicate ids make the merge emit extra rows; without an id column the
        # file may simply have the wrong length. Either way the rows would no
        # longer line up with the reference frame.
        if len(prob_df) != len(ref_df):
            raise ValueError(
                f'{path}: {len(prob_df)} rows after alignment, expected {len(ref_df)} '
                f'(duplicate or mismatched {ID_COL!r} values?)'
            )

        probs = prob_df[prob_cols].values.astype(float)
        # A left merge fills ids absent from the file with NaN; NaN compares False
        # against every threshold and would silently turn into negative predictions.
        nan_rows = int(np.isnan(probs).any(axis=1).sum())
        if nan_rows:
            raise ValueError(
                f'{path}: {nan_rows} rows have NaN probabilities '
                f'(ids missing from the file, or NaN values stored in it)'
            )
        return probs

    oof_prob = _aligned_probs(train_df, pd.read_csv(oof_path), oof_path)
    test_prob = _aligned_probs(test_df, pd.read_csv(test_path), test_path)
    return oof_prob, test_prob


# Load the (OOF, test) probability matrices of each base model, in the order
# TF-IDF, DistilBERT, RoBERTa.
(oof_tfidf, test_tfidf), (oof_distil, test_distil), (oof_roberta, test_roberta) = [
    load_prob_pair(oof_file, test_file)
    for oof_file, test_file in (
        (OOF_TFIDF, TEST_TFIDF),
        (OOF_DISTIL, TEST_DISTIL),
        (OOF_ROBERTA, TEST_ROBERTA),
    )
]

print('All probability files loaded.')

## 4) Implement Core Logic

In [None]:
def macro_f1(y_true, y_pred):
    """Return ``(macro_f1, per_label_f1)`` for column-wise binary predictions.

    Each column of ``y_true`` / ``y_pred`` is scored as an independent binary
    problem (F1 is 0 when undefined, via ``zero_division=0``); the macro score
    is the unweighted mean of the per-column scores.
    """
    per_label = []
    for col in range(y_true.shape[1]):
        per_label.append(f1_score(y_true[:, col], y_pred[:, col], zero_division=0))
    return float(np.mean(per_label)), per_label


def tune_thresholds(y_true, y_prob, tmin=0.05, tmax=0.95, step=0.01):
    """Choose, per label, the probability cut-off that maximises binary F1.

    Parameters
    ----------
    y_true : ndarray
        Binary ground truth, one column per entry of ``LABELS``.
    y_prob : ndarray
        Probabilities with the same layout as ``y_true``.
    tmin, tmax, step : float
        Inclusive candidate range and spacing of the threshold grid.

    Returns
    -------
    dict
        ``{label: threshold}`` with thresholds rounded to 4 decimals. Ties go
        to the lowest threshold; 0.5 is used if the grid is empty.
    """
    candidates = np.arange(tmin, tmax + 1e-12, step)
    chosen = {}
    for col, label in enumerate(LABELS):
        scores = [
            f1_score(y_true[:, col], (y_prob[:, col] >= cutoff).astype(int), zero_division=0)
            for cutoff in candidates
        ]
        # argmax returns the first maximum, i.e. the lowest threshold among ties.
        top_cutoff = float(candidates[int(np.argmax(scores))]) if scores else 0.5
        chosen[label] = round(top_cutoff, 4)
    return chosen


def apply_thresholds(y_prob, thresholds):
    """Binarise ``y_prob`` column by column using the per-label ``thresholds``.

    Column ``k`` is compared (``>=``) against ``thresholds[LABELS[k]]``; the
    result is an int array of 0/1 with the same shape as ``y_prob``.
    """
    decisions = np.zeros_like(y_prob, dtype=int)
    for col, label in enumerate(LABELS):
        cutoff = thresholds[label]
        decisions[:, col] = np.where(y_prob[:, col] >= cutoff, 1, 0)
    return decisions


def apply_non_esg_rule_with_conf(prob_matrix, pred_matrix, trigger):
    """Zero the column-3 prediction on rows where columns 0-2 are confident.

    A row is overridden when the largest probability among the first three
    columns of ``prob_matrix`` is ``>= trigger``. With this notebook's
    ``LABELS`` order, columns 0-2 are E/S/G and column 3 is ``non_ESG``.

    Returns a modified copy; ``pred_matrix`` itself is left untouched.
    """
    top_esg_prob = np.max(prob_matrix[:, :3], axis=1)
    override = top_esg_prob >= trigger
    adjusted = pred_matrix.copy()
    adjusted[:, 3] = np.where(override, 0, adjusted[:, 3])
    return adjusted


# Per-label weight search over the simplex w_tfidf + w_distil + w_roberta = 1.
# For each label, every grid combination is scored by OOF F1 at a fixed 0.5
# cut-off and the best one is kept (first best wins on ties). Thresholds are
# tuned afterwards on the blended probabilities.
weights_per_label = {}
blend_oof = np.zeros_like(oof_tfidf)
blend_test = np.zeros_like(test_tfidf)

wvals = np.round(np.arange(0.0, 1.0001, WEIGHT_STEP), 4)
# Tolerance for float error in `1 - wd - wr`: e.g. 1.0 - 0.9 - 0.1 evaluates to
# about -2.8e-17, so a strict `wt < 0` test would drop valid combinations that
# sit on the edge of the simplex (TF-IDF weight exactly 0).
weight_tol = 1e-9

for i, label in enumerate(LABELS):
    best = (-1.0, (0.33, 0.33, 0.34))
    for wd in wvals:
        for wr in wvals:
            wt = 1.0 - wd - wr
            if wt < -weight_tol or wt > 1.0 + weight_tol:
                continue
            # Snap the remainder back onto the 4-decimal grid and into [0, 1].
            wt = min(1.0, max(0.0, round(float(wt), 4)))
            p = wt * oof_tfidf[:, i] + wd * oof_distil[:, i] + wr * oof_roberta[:, i]
            pred = (p >= 0.5).astype(int)
            f1 = f1_score(y_true[:, i], pred, zero_division=0)
            if f1 > best[0]:
                best = (f1, (wt, float(wd), float(wr)))

    wt, wd, wr = best[1]
    weights_per_label[label] = {
        'tfidf': round(float(wt), 4),
        'distilbert': round(float(wd), 4),
        'roberta': round(float(wr), 4),
    }
    # Apply the winning weights to both the OOF and the test probabilities.
    blend_oof[:, i] = wt * oof_tfidf[:, i] + wd * oof_distil[:, i] + wr * oof_roberta[:, i]
    blend_test[:, i] = wt * test_tfidf[:, i] + wd * test_distil[:, i] + wr * test_roberta[:, i]

# Tune one decision threshold per label on the blended OOF probabilities, then
# binarise both the OOF and the test blend with those thresholds.
thresholds = tune_thresholds(
    y_true, blend_oof, tmin=THRESH_MIN, tmax=THRESH_MAX, step=THRESH_STEP
)
oof_pred, test_pred = (
    apply_thresholds(probs, thresholds) for probs in (blend_oof, blend_test)
)

macro_before_rule, label_scores = macro_f1(y_true, oof_pred)
print('OOF macro-F1 after 3-model blend + thresholds:', round(macro_before_rule, 6))

# Optional post-processing: look for a confidence trigger at which clearing
# non_ESG improves OOF macro-F1. The rule is adopted only when some trigger
# strictly beats the score obtained without it; the first best trigger wins.
best_rule_trigger = None
best_macro = macro_before_rule
if ENABLE_RULE_SEARCH:
    for candidate_trigger in map(float, RULE_GRID):
        candidate_pred = apply_non_esg_rule_with_conf(blend_oof, oof_pred, candidate_trigger)
        candidate_macro = macro_f1(y_true, candidate_pred)[0]
        if candidate_macro > best_macro:
            best_macro, best_rule_trigger = candidate_macro, candidate_trigger

rule_applied = best_rule_trigger is not None
if rule_applied:
    # Apply the same trigger to OOF and test predictions.
    oof_pred = apply_non_esg_rule_with_conf(blend_oof, oof_pred, best_rule_trigger)
    test_pred = apply_non_esg_rule_with_conf(blend_test, test_pred, best_rule_trigger)

final_macro, final_label_scores = macro_f1(y_true, oof_pred)
print('Rule applied:', rule_applied, '| trigger:', best_rule_trigger)
print('Final OOF macro-F1:', round(final_macro, 6))

## 5) Run Validation Checks

In [None]:
# Shape and key checks on the blend outputs before visualising and exporting.
n_labels = len(LABELS)
assert blend_oof.shape == (len(train_df), n_labels)
assert blend_test.shape == (len(test_df), n_labels)

label_set = set(LABELS)
assert set(weights_per_label) == label_set
assert set(thresholds) == label_set

print('Validation checks passed.')
per_label_f1 = {
    label: round(float(final_label_scores[pos]), 6)
    for pos, label in enumerate(LABELS)
}
print('Final per-label F1:', per_label_f1)

## 6) Visualize Key Outputs

In [None]:
# One row per label: the blend weights chosen for each base model.
weight_rows = []
for label_name, model_weights in weights_per_label.items():
    weight_rows.append({'label': label_name, **model_weights})
weights_df = pd.DataFrame(weight_rows)

# One row per label: the tuned threshold and the final OOF F1.
metrics_df = pd.DataFrame({
    'label': LABELS,
    'threshold': [thresholds[label_name] for label_name in LABELS],
    'f1_oof': [float(final_label_scores[pos]) for pos, _label in enumerate(LABELS)],
})

display(weights_df)
display(metrics_df)

# Left panel: stacked blend weights per label; right panel: tuned thresholds.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_weights, ax_thresholds = axes
stacked_weights = weights_df.set_index('label')[['tfidf', 'distilbert', 'roberta']]
stacked_weights.plot.bar(stacked=True, ax=ax_weights, title='Blend Weights')
metrics_df.plot.bar(x='label', y='threshold', ax=ax_thresholds, legend=False, title='Tuned Thresholds')
plt.tight_layout()
plt.show()

## 7) Export Results

In [None]:
# Build the submission from the sample layout, overwriting ids (when both
# frames have them) and the label columns with the final test predictions.
submission = sample_sub_df.copy()
if all(ID_COL in frame.columns for frame in (submission, test_df)):
    submission[ID_COL] = test_df[ID_COL].values
for col_idx, label in enumerate(LABELS):
    submission[label] = test_pred[:, col_idx].astype(int)

# Blended probabilities, keyed by id when available, otherwise by row position.
if ID_COL in train_df.columns:
    oof_prob_out = pd.DataFrame({ID_COL: train_df[ID_COL].values})
else:
    oof_prob_out = pd.DataFrame(index=np.arange(len(train_df)))
if ID_COL in test_df.columns:
    test_prob_out = pd.DataFrame({ID_COL: test_df[ID_COL].values})
else:
    test_prob_out = pd.DataFrame(index=np.arange(len(test_df)))
for col_idx, label in enumerate(LABELS):
    prob_col = f'{label}_prob'
    oof_prob_out[prob_col] = blend_oof[:, col_idx]
    test_prob_out[prob_col] = blend_test[:, col_idx]

# JSON-serialisable summary of the chosen weights, thresholds and scores.
report = {
    'labels': LABELS,
    'weights_per_label': weights_per_label,
    'thresholds': {label: float(value) for label, value in thresholds.items()},
    'macro_f1_oof_before_rule': float(macro_before_rule),
    'macro_f1_oof_final': float(final_macro),
    'non_esg_rule_applied': bool(rule_applied),
    'non_esg_rule_trigger': None if best_rule_trigger is None else float(best_rule_trigger),
}

submission_path, report_path, oof_path, test_path = (
    OUT_DIR / filename
    for filename in (
        'submission_ensemble_3models.csv',
        'ensemble_3models_report.json',
        'oof_probs_ensemble_3models.csv',
        'test_probs_ensemble_3models.csv',
    )
)

submission.to_csv(submission_path, index=False)
oof_prob_out.to_csv(oof_path, index=False)
test_prob_out.to_csv(test_path, index=False)
with report_path.open('w', encoding='utf-8') as report_file:
    json.dump(report, report_file, indent=2)

print('Saved:')
for saved_path in (submission_path, report_path, oof_path, test_path):
    print(saved_path)
submission.head()