# Phase 6: Two-Stage ESG Gate + Retrieval Stacking (Macro-F1)

A completely different approach from transformer swapping:
1) Stage-1 binary ESG gate (`is_esg`)
2) Stage-2 conditional E/S/G classifier
3) kNN retrieval priors from TF-IDF space
4) OOF threshold optimization + final submission

## 1) Import Dependencies

In [None]:
# If needed in Colab, uncomment:
# !pip install -q pandas numpy scikit-learn iterative-stratification matplotlib

from pathlib import Path
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import NearestNeighbors

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

## 2) Configure Parameters

In [None]:
# Target columns: the three ESG pillars, followed by the explicit non-ESG flag.
ESG_LABELS = ['E', 'S', 'G']
LABELS = [*ESG_LABELS, 'non_ESG']
ID_COL = 'id'
TEXT_COL = 'text'

# Cross-validation layout and random seed.
SEED = 42
N_SPLITS = 5

# TF-IDF vocabulary limits.
MAX_FEATURES = 120_000
MIN_DF = 2

# Retrieval settings: neighbour count and the weight given to the retrieval
# prior when it is blended with the stage-2 probabilities.
K_NEIGHBORS = 20
ALPHA_RETRIEVAL = 0.35

# Threshold search grids (min, max, step) for the gate and for the E/S/G labels.
GATE_TMIN = 0.05
GATE_TMAX = 0.95
GATE_STEP = 0.01
LBL_TMIN = 0.05
LBL_TMAX = 0.95
LBL_STEP = 0.01

# Project root: the first of "." and ".." that contains data_set/; falls back
# to "." when neither does.
ROOT = next(
    (base for base in (Path('.'), Path('..')) if (base / 'data_set').exists()),
    Path('.'),
)

DATA_DIR = ROOT / 'data_set'
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH = DATA_DIR / 'test.csv'
SAMPLE_SUB_PATH = DATA_DIR / 'sample_submission.csv'

# Output directory is created eagerly so later cells can write into it.
OUTPUT_DIR = ROOT / 'outputs'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print('ROOT:', ROOT.resolve())

## 3) Load Input Data

In [None]:
# Read the three competition files.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample_sub_df = pd.read_csv(SAMPLE_SUB_PATH)

# Validate the schema explicitly (not with `assert`, which is stripped under
# `python -O`) and report exactly which columns are absent.
for frame_name, frame, required_cols in (
    ('train', train_df, [ID_COL, TEXT_COL] + LABELS),
    ('test', test_df, [ID_COL, TEXT_COL]),
):
    missing_cols = [c for c in required_cols if c not in frame.columns]
    if missing_cols:
        raise ValueError(f'{frame_name} data is missing required column(s): {missing_cols}')

# Missing texts become empty strings so the vectorizer always receives str.
train_df[TEXT_COL] = train_df[TEXT_COL].fillna('').astype(str)
test_df[TEXT_COL] = test_df[TEXT_COL].fillna('').astype(str)

# Label matrices: all four labels, the E/S/G subset, and the binary gate
# target (1 when at least one of E/S/G is set).
y_full = train_df[LABELS].values.astype(int)
y_esg = train_df[ESG_LABELS].values.astype(int)
y_gate = (y_esg.sum(axis=1) > 0).astype(int)

print('Train:', train_df.shape, 'Test:', test_df.shape)
print('ESG rate:', y_gate.mean())
print(train_df[LABELS].mean())

## 4) Implement Core Logic

In [None]:
def macro_f1(y_true, y_pred):
    """Compute binary F1 for every label column and their unweighted mean.

    Args:
        y_true: 2-D indicator array; one column per label.
        y_pred: Predicted indicator array, indexed column-wise like ``y_true``.

    Returns:
        ``(mean_f1, per_label)`` where ``per_label`` lists the F1 of each
        column in order and ``mean_f1`` is their average as a Python float.
    """
    per_label = []
    for col in range(y_true.shape[1]):
        # zero_division=0 scores a column with no positives as 0.
        col_score = f1_score(y_true[:, col], y_pred[:, col], zero_division=0)
        per_label.append(col_score)
    mean_f1 = float(np.mean(per_label))
    return mean_f1, per_label


def best_threshold_binary(y_true, prob, tmin=0.05, tmax=0.95, step=0.01):
    """Grid-search the probability cut-off that maximises binary F1.

    Args:
        y_true: Binary ground-truth labels.
        prob: Scores compared against each candidate with ``>=``.
        tmin: First candidate threshold.
        tmax: Last candidate threshold (the tiny epsilon keeps it in the grid).
        step: Spacing between candidates.

    Returns:
        ``(threshold, f1)`` for the best candidate. Ties keep the lowest
        threshold; an empty grid yields ``(0.5, -1.0)``.
    """
    top_cut = 0.5
    top_score = -1.0
    for cut in np.arange(tmin, tmax + 1e-12, step):
        decisions = (prob >= cut).astype(int)
        current = f1_score(y_true, decisions, zero_division=0)
        if not current > top_score:
            # Only a strict improvement replaces the incumbent.
            continue
        top_cut = float(cut)
        top_score = current
    return top_cut, top_score


def best_thresholds_multilabel(y_true, prob, labels, tmin=0.05, tmax=0.95, step=0.01):
    """Pick an independent F1-optimal threshold for every label column.

    Args:
        y_true: 2-D indicator array; column ``i`` belongs to ``labels[i]``.
        prob: 2-D score array with the same column layout.
        labels: Label names, in column order.
        tmin: First candidate threshold.
        tmax: Last candidate threshold.
        step: Spacing between candidates.

    Returns:
        Dict mapping each label name to its best threshold, rounded to four
        decimals.
    """
    chosen = {}
    for col, name in enumerate(labels):
        # Each column is an ordinary binary problem, so reuse the binary search.
        cut, _ = best_threshold_binary(y_true[:, col], prob[:, col], tmin, tmax, step)
        chosen[name] = round(cut, 4)
    return chosen


# Shared text arrays (raw strings; every fold fits its own TF-IDF vocabulary).
x_train = train_df[TEXT_COL].values
x_test = test_df[TEXT_COL].values

# Number of stage-2 targets, derived from the config instead of hard-coded.
n_esg = len(ESG_LABELS)


def _make_tfidf():
    """Return a fresh word uni/bi-gram TF-IDF vectorizer used by both stages."""
    return TfidfVectorizer(
        lowercase=True,
        strip_accents='unicode',
        ngram_range=(1, 2),
        min_df=MIN_DF,
        max_features=MAX_FEATURES,
        sublinear_tf=True,
    )


def _make_logreg():
    """Return a fresh class-balanced logistic regression used by both stages."""
    return LogisticRegression(C=2.0, max_iter=2500, class_weight='balanced', solver='liblinear')


# --- Stage 1: ESG Gate (binary) ---
skf_gate = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
oof_gate_prob = np.zeros(len(train_df), dtype=float)
test_gate_folds = []

for tr_idx, va_idx in skf_gate.split(x_train, y_gate):
    # Fit the vocabulary on the training fold only to keep the OOF scores clean.
    vec = _make_tfidf()
    xtr = vec.fit_transform(x_train[tr_idx])
    xva = vec.transform(x_train[va_idx])
    xte = vec.transform(x_test)

    gate_clf = _make_logreg()
    gate_clf.fit(xtr, y_gate[tr_idx])

    oof_gate_prob[va_idx] = gate_clf.predict_proba(xva)[:, 1]
    test_gate_folds.append(gate_clf.predict_proba(xte)[:, 1])

# Test gate probability is the mean over the fold models.
test_gate_prob = np.mean(np.vstack(test_gate_folds), axis=0)
gate_t, gate_f1 = best_threshold_binary(y_gate, oof_gate_prob, GATE_TMIN, GATE_TMAX, GATE_STEP)
print('Best gate threshold:', gate_t, '| Gate F1:', round(gate_f1, 6))

# --- Stage 2: ESG labels (E/S/G) with retrieval priors ---
mskf = MultilabelStratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
oof_esg_prob = np.zeros((len(train_df), n_esg), dtype=float)
oof_retrieval = np.zeros((len(train_df), n_esg), dtype=float)
test_esg_folds, test_ret_folds = [], []

for tr_idx, va_idx in mskf.split(x_train, y_esg):
    # Train stage-2 only on ESG-positive samples in train fold
    tr_pos_mask = y_gate[tr_idx] == 1
    tr_pos_idx = tr_idx[tr_pos_mask]

    vec = _make_tfidf()
    xtr_pos = vec.fit_transform(x_train[tr_pos_idx])
    xva = vec.transform(x_train[va_idx])
    xte = vec.transform(x_test)

    ytr_pos = y_esg[tr_pos_idx]

    clf_esg = OneVsRestClassifier(_make_logreg())
    clf_esg.fit(xtr_pos, ytr_pos)

    va_prob = clf_esg.predict_proba(xva)
    te_prob = clf_esg.predict_proba(xte)

    # Retrieval priors from nearest ESG-positive train docs in fold.
    # k is capped at the number of indexed docs and floored at 1.
    k = min(K_NEIGHBORS, xtr_pos.shape[0])
    nn = NearestNeighbors(n_neighbors=max(1, k), metric='cosine')
    nn.fit(xtr_pos)

    va_idx_nn = nn.kneighbors(xva, return_distance=False)
    te_idx_nn = nn.kneighbors(xte, return_distance=False)

    # Prior per label = share of the retrieved neighbours carrying that label.
    va_ret = ytr_pos[va_idx_nn].mean(axis=1)
    te_ret = ytr_pos[te_idx_nn].mean(axis=1)

    oof_esg_prob[va_idx] = va_prob
    oof_retrieval[va_idx] = va_ret
    test_esg_folds.append(te_prob)
    test_ret_folds.append(te_ret)

# Blend stage-2 probs with retrieval priors
blend_oof_esg = (1 - ALPHA_RETRIEVAL) * oof_esg_prob + ALPHA_RETRIEVAL * oof_retrieval
blend_test_esg = (
    (1 - ALPHA_RETRIEVAL) * np.mean(np.stack(test_esg_folds), axis=0)
    + ALPHA_RETRIEVAL * np.mean(np.stack(test_ret_folds), axis=0)
)

# Gate decisions come first because the final output zeroes E/S/G wherever the
# gate says non-ESG.
oof_gate_pred = (oof_gate_prob >= gate_t).astype(int)
test_gate_pred = (test_gate_prob >= gate_t).astype(int)

# Tune the E/S/G thresholds on the *gated* scores so the search optimises the
# same decision rule that is scored and submitted. Rows rejected by the gate
# get -inf, which is below every candidate threshold, so they count as
# negatives during the search exactly as they do in the final predictions.
gated_oof_esg = np.where(oof_gate_pred[:, None] == 1, blend_oof_esg, -np.inf)
esg_thresholds = best_thresholds_multilabel(y_esg, gated_oof_esg, ESG_LABELS, LBL_TMIN, LBL_TMAX, LBL_STEP)
print('ESG thresholds:', esg_thresholds)

# Convert to final 4-label predictions: one threshold per E/S/G column,
# broadcast across rows.
esg_threshold_vec = np.array([esg_thresholds[label] for label in ESG_LABELS])
oof_esg_pred = (blend_oof_esg >= esg_threshold_vec).astype(int)
test_esg_pred = (blend_test_esg >= esg_threshold_vec).astype(int)

# Gate enforcement: if gate says non-ESG, force E/S/G to 0
oof_esg_pred[oof_gate_pred == 0] = 0
test_esg_pred[test_gate_pred == 0] = 0

# non_ESG derived from gate
oof_non_esg_pred = (oof_gate_pred == 0).astype(int)
test_non_esg_pred = (test_gate_pred == 0).astype(int)

# Column order matches LABELS: E, S, G, non_ESG.
oof_pred = np.column_stack([oof_esg_pred, oof_non_esg_pred])
test_pred = np.column_stack([test_esg_pred, test_non_esg_pred])

oof_macro, oof_per_label = macro_f1(y_full, oof_pred)
print('Final OOF Macro-F1:', round(oof_macro, 6))

## 5) Run Validation Checks

In [None]:
# Sanity checks on the pipeline outputs. The expected width follows LABELS
# rather than a hard-coded 4, and each check states what went wrong.
n_labels = len(LABELS)
assert oof_pred.shape == (len(train_df), n_labels), f'unexpected OOF prediction shape: {oof_pred.shape}'
assert test_pred.shape == (len(test_df), n_labels), f'unexpected test prediction shape: {test_pred.shape}'
assert len(oof_per_label) == n_labels, f'expected {n_labels} per-label scores, got {len(oof_per_label)}'
assert all(lbl in esg_thresholds for lbl in ESG_LABELS), 'missing threshold for at least one ESG label'
assert 0 <= gate_t <= 1, f'gate threshold out of range: {gate_t}'

print('Validation checks passed.')
print('Per-label OOF F1:', {label: round(float(score), 6) for label, score in zip(LABELS, oof_per_label)})

## 6) Visualize Key Outputs

In [None]:
# Tabulate the per-label out-of-fold F1 scores.
summary_df = pd.DataFrame({'label': LABELS, 'f1_oof': list(map(float, oof_per_label))})

display(summary_df)

# Bar chart of the same scores on a fixed 0-1 axis.
fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(summary_df['label'], summary_df['f1_oof'])
ax.set_ylim(0, 1)
ax.set_title('OOF F1 per Label (Two-Stage + Retrieval)')
fig.tight_layout()
plt.show()

## 7) Export Results

In [None]:
# Predictions are written into the sample-submission frame positionally, so
# fail early with a clear message if the row counts disagree.
if len(sample_sub_df) != len(test_pred):
    raise ValueError(
        f'sample submission has {len(sample_sub_df)} rows but there are {len(test_pred)} test predictions'
    )

submission = sample_sub_df.copy()
if ID_COL in submission.columns and ID_COL in test_df.columns:
    # Predictions follow test_df row order, so take the ids from there too.
    submission[ID_COL] = test_df[ID_COL].values
for col, label in enumerate(LABELS):
    submission[label] = test_pred[:, col].astype(int)

# Run summary; every value is cast to a plain Python type so json can
# serialise it.
report = {
    'approach': 'two_stage_gate_plus_retrieval_stacking',
    'gate_threshold': float(gate_t),
    'gate_f1_oof': float(gate_f1),
    'esg_thresholds': {k: float(v) for k, v in esg_thresholds.items()},
    'alpha_retrieval': float(ALPHA_RETRIEVAL),
    'k_neighbors': int(K_NEIGHBORS),
    'oof_macro_f1': float(oof_macro),
    # Paired with LABELS directly instead of a hard-coded range(4).
    'oof_f1_per_label': {label: float(score) for label, score in zip(LABELS, oof_per_label)},
}

submission_path = OUTPUT_DIR / 'submission_two_stage_retrieval.csv'
report_path = OUTPUT_DIR / 'two_stage_retrieval_report.json'

submission.to_csv(submission_path, index=False)
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)

print('Saved:')
print(submission_path)
print(report_path)
submission.head()