# Phase 2 (RoBERTa): Multi-Label Pipeline (Macro-F1)

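Standalone RoBERTa version of Phase 2: fine-tunes `roberta-base` with multilabel-stratified K-fold cross-validation, tunes one decision threshold per label on the out-of-fold probabilities, and exports the submission together with the probability files and a threshold report.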

Outputs:
- `outputs/submission_roberta.csv`
- `outputs/roberta_threshold_report.json`
- `outputs/oof_probs_roberta.csv`
- `outputs/test_probs_roberta.csv`

## 1) Import Dependencies

In [None]:
# If needed in Colab, uncomment:
# !pip install -q transformers datasets accelerate iterative-stratification scikit-learn pandas numpy tqdm matplotlib

import gc
import json
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

import matplotlib.pyplot as plt

# Fix one seed for every random number generator used in this notebook
# (Python's `random`, NumPy and PyTorch) so that repeated runs start from the
# same state.
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if torch.cuda.is_available():
    # Also seed the generators of all visible CUDA devices.
    torch.cuda.manual_seed_all(SEED)

# Report the runtime so that logs show which torch build / device was used.
print('Torch:', torch.__version__)
print('CUDA available:', torch.cuda.is_available())

## 2) Configure Parameters

In [None]:
# --- Dataset schema -------------------------------------------------------
ID_COL, TEXT_COL = 'id', 'text'
LABELS = ['E', 'S', 'G', 'non_ESG']

# --- Model / training hyper-parameters ------------------------------------
MODEL_NAME = 'roberta-base'
MAX_LENGTH = 256        # token length every text is truncated / padded to
N_SPLITS = 5            # number of cross-validation folds
NUM_EPOCHS = 3
LR = 2e-5
WEIGHT_DECAY = 0.01
TRAIN_BS, EVAL_BS = 12, 24

# --- Post-processing ------------------------------------------------------
# Per-label threshold search grid: THRESHOLD_MIN .. THRESHOLD_MAX in steps of
# THRESHOLD_STEP.
THRESHOLD_MIN, THRESHOLD_MAX, THRESHOLD_STEP = 0.05, 0.95, 0.01
ENABLE_NON_ESG_RULE = True

# --- Paths ----------------------------------------------------------------
# Use the first of '.', '..' that contains a `data_set` directory; fall back
# to the current directory when neither does.
ROOT = next(
    (base for base in (Path('.'), Path('..')) if (base / 'data_set').exists()),
    Path('.'),
)

TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH = (
    ROOT / 'data_set' / file_name
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
OUTPUT_DIR = ROOT / 'outputs'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print('ROOT:', ROOT.resolve())
print('TRAIN_PATH exists:', TRAIN_PATH.exists())
print('TEST_PATH exists:', TEST_PATH.exists())

## 3) Load Input Data

In [None]:
# Load the competition files and build the label matrix used for training.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample_sub_df = pd.read_csv(SAMPLE_SUB_PATH)

print('Train shape:', train_df.shape)
print('Test shape:', test_df.shape)
print('Sample submission shape:', sample_sub_df.shape)

# Validate the schema with explicit exceptions rather than `assert`:
# assertions are removed when Python runs with -O, which would let a malformed
# file slip through to a much less readable failure later on.
required_train_cols = [ID_COL, TEXT_COL] + LABELS
missing_cols = [c for c in required_train_cols if c not in train_df.columns]
if missing_cols:
    raise ValueError(f'Missing train columns: {missing_cols}')
missing_test_cols = [c for c in (ID_COL, TEXT_COL) if c not in test_df.columns]
if missing_test_cols:
    raise ValueError(f'Missing test columns: {missing_test_cols}')

# Missing texts become empty strings so the tokenizer always receives a str.
train_df[TEXT_COL] = train_df[TEXT_COL].fillna('').astype(str)
test_df[TEXT_COL] = test_df[TEXT_COL].fillna('').astype(str)

# Label matrix with one column per entry of LABELS, in LABELS order.
y = train_df[LABELS].values.astype(np.float32)
# Mean of each label column, sorted from most to least frequent.
prevalence = train_df[LABELS].mean().sort_values(ascending=False)
print('\nLabel prevalence:')
print(prevalence)
train_df.head(3)

## 4) Implement Core Logic

In [None]:
# Load the tokenizer belonging to MODEL_NAME once; the same instance is passed
# to every ESGDataset and to every fold's Trainer below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class ESGDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per ``__getitem__`` call.

    Args:
        texts: Iterable of input strings; it is materialised into a list.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            expected to return a mapping of tensors.
        max_length: Length passed to the tokenizer for truncation / padding.
        labels: Optional indexable of per-sample label vectors. When omitted
            (e.g. for the test set) the items carry no ``'labels'`` entry.
    """

    def __init__(self, texts, tokenizer, max_length, labels=None):
        self.texts = list(texts)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Squeeze axis 0 of every returned tensor so that each item is a
        # single un-batched sample.
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)

        if self.labels is None:
            return sample

        # Labels are always emitted as float32 tensors.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample


def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows for large
    negative inputs (float32 overflows near ``x < -88``) and emits a
    RuntimeWarning. This version only ever exponentiates ``-|x|`` and is
    therefore overflow-free while giving the same values.

    Args:
        x: Scalar or array-like of logits.

    Returns:
        Probabilities with the same shape as ``x``. Floating-point inputs keep
        their dtype; a scalar input yields a NumPy scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in [0, 1], so it cannot overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0:  1 / (1 + e^-x)            with e^-x == decay
    # x <  0:  e^x / (1 + e^x)           with e^x  == decay
    stable = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # `[()]` turns a 0-d result back into a scalar and is a no-op otherwise.
    return stable[()]


def macro_f1(y_true, y_pred):
    """Compute the unweighted mean of the per-column F1 scores.

    Args:
        y_true: 2-D array of ground-truth labels, one column per label.
        y_pred: 2-D array of predicted labels, indexed with the same columns.

    Returns:
        A ``(macro, per_label)`` tuple: the mean as a Python float and the
        list of per-column scores in column order. A column whose F1 is
        undefined scores 0 (``zero_division=0``).
    """
    per_label = []
    for col in range(y_true.shape[1]):
        per_label.append(f1_score(y_true[:, col], y_pred[:, col], zero_division=0))
    macro = float(np.mean(per_label))
    return macro, per_label


def tune_thresholds(y_true, y_prob, tmin=0.05, tmax=0.95, step=0.01):
    """Grid-search the F1-maximising decision threshold for each label.

    Column ``k`` of ``y_true`` / ``y_prob`` is taken to belong to
    ``LABELS[k]``.

    Args:
        y_true: 2-D array of ground-truth labels.
        y_prob: 2-D array of predicted probabilities.
        tmin: First threshold of the grid.
        tmax: Last threshold of the grid (a tiny epsilon is added so that it
            is not lost to floating-point error).
        step: Grid spacing.

    Returns:
        Dict mapping each label name to its best threshold rounded to four
        decimals. On ties the earliest grid value wins; if the grid is empty
        the threshold stays at 0.5.
    """
    candidates = np.arange(tmin, tmax + 1e-12, step)
    chosen = {}
    for col, name in enumerate(LABELS):
        top_cut = 0.5
        top_score = -1.0
        for cut in candidates:
            hard = (y_prob[:, col] >= cut).astype(int)
            score = f1_score(y_true[:, col], hard, zero_division=0)
            # Strict comparison: a later threshold must beat, not merely
            # match, the current best.
            if score > top_score:
                top_score, top_cut = score, float(cut)
        chosen[name] = round(top_cut, 4)
    return chosen


def apply_thresholds(y_prob, thresholds):
    """Binarise probabilities with one threshold per label.

    Args:
        y_prob: 2-D array of probabilities; column ``k`` is compared against
            the threshold of ``LABELS[k]``.
        thresholds: Mapping from label name to its decision threshold.

    Returns:
        Integer array shaped like ``y_prob`` holding 1 where the probability
        is greater than or equal to the label's threshold, else 0.
    """
    decisions = np.zeros_like(y_prob, dtype=int)
    for col, name in enumerate(LABELS):
        cutoff = thresholds[name]
        # Assigning the boolean mask into the int array stores it as 0 / 1.
        decisions[:, col] = y_prob[:, col] >= cutoff
    return decisions


def apply_non_esg_rule(y_pred):
    """Clear column 3 on every row where columns 0-2 sum to more than zero.

    With the notebook's label order (E, S, G, non_ESG) this means a row
    predicted as E, S or G can never also be predicted as non_ESG.

    Args:
        y_pred: 2-D array of binary predictions with at least four columns.

    Returns:
        A modified copy; the input array is left untouched.
    """
    first, second, third = (y_pred[:, col] for col in range(3))
    any_esg = (first + second + third) > 0

    result = y_pred.copy()
    result[any_esg, 3] = 0
    return result

In [None]:
# Cross-validated fine-tuning.
#
# For every fold a fresh MODEL_NAME checkpoint is fine-tuned on the training
# split, then used to predict (a) the held-out split, whose probabilities fill
# the matching rows of `oof_prob`, and (b) the full test set. The per-fold test
# probabilities are averaged into `test_prob` after the loop.
import inspect

mskf = MultilabelStratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

oof_prob = np.zeros((len(train_df), len(LABELS)), dtype=np.float32)
test_prob_folds = []
fold_metrics = []

# Newer transformers releases renamed `TrainingArguments(evaluation_strategy=)`
# to `eval_strategy=` and `Trainer(tokenizer=)` to `processing_class=`. Pick
# whichever keyword the installed version actually accepts so the notebook
# runs on both old and new releases.
eval_strategy_kw = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)
tokenizer_kw = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)

# The test set is the same for every fold, so build its dataset only once.
test_ds = ESGDataset(test_df[TEXT_COL].tolist(), tokenizer, MAX_LENGTH, labels=None)

for fold, (tr_idx, va_idx) in enumerate(mskf.split(train_df[TEXT_COL].values, y), start=1):
    print(f'\n===== Fold {fold}/{N_SPLITS} =====')

    x_tr = train_df.iloc[tr_idx][TEXT_COL].tolist()
    x_va = train_df.iloc[va_idx][TEXT_COL].tolist()
    y_tr = y[tr_idx]
    y_va = y[va_idx]

    train_ds = ESGDataset(x_tr, tokenizer, MAX_LENGTH, y_tr)
    valid_ds = ESGDataset(x_va, tokenizer, MAX_LENGTH, y_va)

    # Every fold starts from the pretrained checkpoint, never from the
    # previous fold's weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(LABELS),
        problem_type='multi_label_classification'
    )

    args = TrainingArguments(
        output_dir=str(OUTPUT_DIR / f'roberta_fold_{fold}'),
        learning_rate=LR,
        per_device_train_batch_size=TRAIN_BS,
        per_device_eval_batch_size=EVAL_BS,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        save_strategy='no',
        logging_steps=50,
        report_to='none',
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=2,
        remove_unused_columns=False,
        seed=SEED,
        **{eval_strategy_kw: 'epoch'},
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        **{tokenizer_kw: tokenizer},
    )

    trainer.train()

    va_logits = trainer.predict(valid_ds).predictions
    te_logits = trainer.predict(test_ds).predictions

    # Multi-label head: each logit is turned into an independent probability.
    va_prob = sigmoid(va_logits)
    te_prob = sigmoid(te_logits)

    oof_prob[va_idx] = va_prob
    test_prob_folds.append(te_prob)

    # Reference score at the default 0.5 cut-off; tuned thresholds are only
    # computed later, on the complete out-of-fold matrix.
    fold_pred_default = (va_prob >= 0.5).astype(int)
    fold_macro, fold_label_scores = macro_f1(y_va.astype(int), fold_pred_default)
    fold_metrics.append({
        'fold': fold,
        'macro_f1@0.5': round(fold_macro, 6),
        'label_f1@0.5': {
            name: round(float(score), 6) for name, score in zip(LABELS, fold_label_scores)
        }
    })
    print('Fold macro-F1 @0.5:', round(fold_macro, 6))

    # Drop this fold's model and datasets before the next fold allocates its
    # own, to keep peak (GPU) memory down.
    del model, trainer, train_ds, valid_ds
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Average the test probabilities of all fold models.
test_prob = np.mean(np.stack(test_prob_folds, axis=0), axis=0)
print('\nFinished CV. OOF shape:', oof_prob.shape, ' Test prob shape:', test_prob.shape)

## 5) Run Validation Checks

In [None]:
# Tune one decision threshold per label on the out-of-fold probabilities and
# binarise both the OOF and the fold-averaged test probabilities with them.
y_true_int = y.astype(int)
thresholds = tune_thresholds(
    y_true_int,
    oof_prob,
    tmin=THRESHOLD_MIN,
    tmax=THRESHOLD_MAX,
    step=THRESHOLD_STEP,
)

oof_pred, test_pred = (apply_thresholds(probs, thresholds) for probs in (oof_prob, test_prob))

macro_base, per_label_base = macro_f1(y_true_int, oof_pred)
print('Macro-F1 after threshold tuning:', round(macro_base, 6))

# Optional post-processing: the non_ESG rule is kept only when it does not
# lower the out-of-fold macro-F1; in that case it is applied to the test
# predictions as well.
rule_applied, macro_after_rule = False, macro_base
if ENABLE_NON_ESG_RULE:
    oof_pred_rule = apply_non_esg_rule(oof_pred)
    macro_rule = macro_f1(y_true_int, oof_pred_rule)[0]
    print('Macro-F1 after non_ESG rule:', round(macro_rule, 6))
    if macro_rule >= macro_base:
        rule_applied = True
        macro_after_rule = macro_rule
        oof_pred = oof_pred_rule
        test_pred = apply_non_esg_rule(test_pred)

# Sanity checks: one probability row per sample, one column and one threshold
# per label.
assert oof_prob.shape == (len(train_df), len(LABELS))
assert test_prob.shape == (len(test_df), len(LABELS))
assert set(thresholds) == set(LABELS)
print('Validation checks passed.')

## 6) Visualize Key Outputs

In [None]:
# Per-label summary table, ordered from the most to the least frequent label.
# `f1_oof` holds the per-label scores returned alongside `macro_base`, i.e.
# measured after threshold tuning and before the optional non_ESG rule.
summary_columns = {
    'label': LABELS,
    'prevalence': [float(prevalence[name]) for name in LABELS],
    'threshold': [float(thresholds[name]) for name in LABELS],
    'f1_oof': [float(per_label_base[pos]) for pos, _ in enumerate(LABELS)],
}
summary_df = pd.DataFrame(summary_columns).sort_values('prevalence', ascending=False)

display(summary_df)

# Side-by-side bar charts: label prevalence (left) and tuned threshold (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (('prevalence', 'Label Prevalence'), ('threshold', 'Tuned Thresholds'))
for axis, (column, title) in zip(axes, panels):
    summary_df.plot(kind='bar', x='label', y=column, ax=axis, legend=False, title=title)
plt.tight_layout()
plt.show()

## 7) Export Results

In [None]:
# --- Output locations -------------------------------------------------------
submission_path = OUTPUT_DIR / 'submission_roberta.csv'
report_path = OUTPUT_DIR / 'roberta_threshold_report.json'
oof_path = OUTPUT_DIR / 'oof_probs_roberta.csv'
test_prob_path = OUTPUT_DIR / 'test_probs_roberta.csv'

# --- Submission: sample file layout, test ids, binary label columns ---------
submission = sample_sub_df.copy()
if ID_COL in submission.columns and ID_COL in test_df.columns:
    submission[ID_COL] = test_df[ID_COL].values
for col, label in enumerate(LABELS):
    submission[label] = test_pred[:, col].astype(int)

# --- Probability tables: id column followed by one `<label>_prob` column ----
prob_columns = [f'{label}_prob' for label in LABELS]

oof_prob_df = pd.DataFrame(oof_prob, columns=prob_columns)
oof_prob_df.insert(0, ID_COL, train_df[ID_COL].values)

test_prob_df = pd.DataFrame(test_prob, columns=prob_columns)
test_prob_df.insert(0, ID_COL, test_df[ID_COL].values)

# --- JSON report: thresholds and out-of-fold scores --------------------------
report = {
    'model_name': MODEL_NAME,
    'labels': LABELS,
    'thresholds': {name: float(value) for name, value in thresholds.items()},
    'macro_f1_oof_threshold_tuned': float(macro_base),
    'macro_f1_oof_after_optional_non_esg_rule': float(macro_after_rule),
    'non_esg_rule_applied': bool(rule_applied),
    'fold_metrics_default_threshold_0_5': fold_metrics,
}

# --- Write everything to OUTPUT_DIR ------------------------------------------
submission.to_csv(submission_path, index=False)
oof_prob_df.to_csv(oof_path, index=False)
test_prob_df.to_csv(test_prob_path, index=False)
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report, f, indent=2)

print('Saved:')
for saved_path in (submission_path, report_path, oof_path, test_prob_path):
    print(saved_path)
submission.head()