# Phase 0: Preprocessing Ablation for Macro-F1

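Goal: quantify whether text preprocessing improves macro-F1 before retraining heavy transformer models. Each preprocessing configuration is scored with a lightweight TF-IDF + logistic-regression baseline under multilabel-stratified cross-validation.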

Outputs:
- `outputs/preprocessing_ablation_report.csv`
- `outputs/preprocessing_examples.csv`
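- `outputs/train_cleaned_<best_config>.csv` (named after the best-scoring configuration, e.g. `train_cleaned_esg_normalized.csv` if a configuration called `esg_normalized` wins)
- `outputs/test_cleaned_<best_config>.csv` (e.g. `test_cleaned_esg_normalized.csv`)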

## 1) Import Dependencies

In [None]:
# If needed in Colab, uncomment:
# !pip install -q pandas numpy scikit-learn iterative-stratification matplotlib

from pathlib import Path
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Make the project root importable: when the working directory is notebooks/,
# the root is its parent; otherwise the working directory itself is used.
_cwd = Path('.').resolve()
sys.path.append(str(_cwd.parent if _cwd.name == 'notebooks' else _cwd))

from src.text_preprocessing import PreprocessConfig, clean_text, build_ablation_configs

## 2) Configure Parameters

In [None]:
# Target label columns, plus the identifier and raw-text columns of the CSVs.
LABELS = ['E', 'S', 'G', 'non_ESG']
ID_COL, TEXT_COL = 'id', 'text'

# Cross-validation and TF-IDF settings.
N_SPLITS, SEED = 5, 42
MAX_FEATURES, MIN_DF = 120_000, 2

# Project root: the first of '.' and '..' that contains data_set/;
# falls back to '.' when neither does.
ROOT = next(
    (base for base in (Path('.'), Path('..')) if (base / 'data_set').exists()),
    Path('.'),
)

TRAIN_PATH = ROOT / 'data_set' / 'train.csv'
TEST_PATH = ROOT / 'data_set' / 'test.csv'

# Create the output directory up front so later writes cannot fail on it.
OUTPUT_DIR = ROOT / 'outputs'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f'ROOT: {ROOT.resolve()}')
print(f'TRAIN_PATH exists: {TRAIN_PATH.exists()}')
print(f'TEST_PATH exists: {TEST_PATH.exists()}')

## 3) Load Input Data

In [None]:
# Read the raw train/test splits.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Fail early and name the offending columns. A bare `assert all(...)` gives
# no hint about which column is absent and is skipped under `python -O`.
missing_train = [col for col in [ID_COL, TEXT_COL] + LABELS if col not in train_df.columns]
if missing_train:
    raise ValueError(f'{TRAIN_PATH} is missing required columns: {missing_train}')

# The test split only needs the id and text columns (no labels).
missing_test = [col for col in [ID_COL, TEXT_COL] if col not in test_df.columns]
if missing_test:
    raise ValueError(f'{TEST_PATH} is missing required columns: {missing_test}')

# Missing texts become empty strings so every value in the column is a str.
train_df[TEXT_COL] = train_df[TEXT_COL].fillna('').astype(str)
test_df[TEXT_COL] = test_df[TEXT_COL].fillna('').astype(str)

print('Train shape:', train_df.shape)
print('Test shape:', test_df.shape)
print('\nLabel prevalence:')
# Column means of the label columns.
print(train_df[LABELS].mean())

train_df.head(3)

## 4) Implement Core Logic

In [None]:
def macro_f1(y_true, y_pred):
    """Score each label column separately and average the results.

    Args:
        y_true: 2-D array of true labels, one column per label.
        y_pred: 2-D array of predicted labels, indexed the same way.

    Returns:
        A ``(macro, per_label)`` tuple: the unweighted mean of the per-column
        F1 scores as a float, and the list of those per-column scores.
    """
    per_label = []
    for col in range(y_true.shape[1]):
        # zero_division=0 is passed through to f1_score for the undefined case.
        per_label.append(f1_score(y_true[:, col], y_pred[:, col], zero_division=0))
    return float(np.mean(per_label)), per_label


def run_tfidf_cv(text_series, y, n_splits=5, seed=42):
    """Cross-validate a TF-IDF + one-vs-rest logistic-regression baseline.

    A fresh vectorizer and classifier are fitted on the training part of every
    fold, so the vocabulary never sees the held-out texts. The held-out
    probabilities are collected into one out-of-fold matrix and thresholded
    at 0.5 before scoring.

    Args:
        text_series: pandas Series of documents, positionally aligned with ``y``.
        y: 2-D label array with one column per label.
        n_splits: number of multilabel-stratified folds.
        seed: random state for the fold shuffling.

    Returns:
        ``(macro, per_label, oof_prob)``: the macro-F1, the per-label F1 list,
        and the out-of-fold probability matrix of shape
        ``(len(text_series), y.shape[1])``.
    """
    splitter = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    texts = text_series.values
    oof_prob = np.zeros((len(text_series), y.shape[1]), dtype=float)

    for train_rows, valid_rows in splitter.split(texts, y):
        # Vectorizer limits come from the notebook-level MIN_DF / MAX_FEATURES.
        vectorizer = TfidfVectorizer(
            lowercase=True,
            strip_accents='unicode',
            ngram_range=(1, 2),
            min_df=MIN_DF,
            max_features=MAX_FEATURES,
            sublinear_tf=True,
        )
        train_matrix = vectorizer.fit_transform(texts[train_rows])
        valid_matrix = vectorizer.transform(texts[valid_rows])

        # One independent binary logistic regression per label column.
        base_estimator = LogisticRegression(
            C=2.0,
            max_iter=2500,
            class_weight='balanced',
            solver='liblinear',
        )
        model = OneVsRestClassifier(base_estimator)
        model.fit(train_matrix, y[train_rows])

        oof_prob[valid_rows] = model.predict_proba(valid_matrix)

    hard_predictions = (oof_prob >= 0.5).astype(int)
    macro, per_label = macro_f1(y, hard_predictions)
    return macro, per_label, oof_prob


# Label matrix with one column per entry in LABELS.
y = train_df[LABELS].values.astype(int)
# Mapping of configuration name -> preprocessing config to evaluate.
configs = build_ablation_configs()
rows = []
examples = []

# Keep up to three before/after samples per configuration; the cap avoids an
# IndexError when the training set has fewer than three rows.
n_examples = min(3, len(train_df))

for name, cfg in configs.items():
    # Clean the training texts with this configuration, then score the result
    # with the cross-validated TF-IDF baseline.
    cleaned = train_df[TEXT_COL].apply(lambda t: clean_text(t, cfg))
    macro, per_label, _ = run_tfidf_cv(cleaned, y, N_SPLITS, SEED)
    rows.append({
        'config': name,
        'macro_f1': round(macro, 6),
        **{f'f1_{label}': round(float(score), 6) for label, score in zip(LABELS, per_label)},
    })

    for i in range(n_examples):
        examples.append({
            'config': name,
            'id': int(train_df[ID_COL].iloc[i]),
            'original': train_df[TEXT_COL].iloc[i],
            'cleaned': cleaned.iloc[i],
        })

# Best configuration first. A stable sort keeps tied scores in a deterministic
# order, so the configuration picked as "best" is reproducible.
results_df = (
    pd.DataFrame(rows)
    .sort_values('macro_f1', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)
examples_df = pd.DataFrame(examples)

results_df

## 5) Run Validation Checks

In [None]:
# Sanity checks on the ablation report: it has rows, carries the key columns,
# and every macro-F1 lies in the closed interval [0, 1].
assert not results_df.empty
assert {'config', 'macro_f1'} <= set(results_df.columns)
macro_scores = results_df['macro_f1']
assert ((macro_scores >= 0) & (macro_scores <= 1)).all()

# results_df is sorted by descending macro-F1, so the first row is the winner.
best_config_name = results_df['config'].iloc[0]
best_cfg = configs[best_config_name]
print('Best preprocessing config:', best_config_name)
print(results_df[['config', 'macro_f1']])

## 6) Visualize Key Outputs

In [None]:
# Full score table, followed by the first nine before/after text examples.
display(results_df)

display(examples_df.head(9))

# Bar chart of macro-F1 per configuration, in the table's row order.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(results_df['config'], results_df['macro_f1'])
ax.set_title('Macro-F1 by Preprocessing Configuration')
ax.set_ylabel('Macro-F1')
ax.set_ylim(0, 1)  # F1 is bounded, so fix the axis to the full range
plt.xticks(rotation=20)
fig.tight_layout()
plt.show()

## 7) Export Results

In [None]:
# Re-clean both splits with the winning configuration. assign() returns new
# frames, so train_df and test_df keep their original text.
best_train = train_df.assign(
    **{TEXT_COL: train_df[TEXT_COL].apply(lambda t: clean_text(t, best_cfg))}
)
best_test = test_df.assign(
    **{TEXT_COL: test_df[TEXT_COL].apply(lambda t: clean_text(t, best_cfg))}
)

# The cleaned-data file names carry the name of the winning configuration.
report_path = OUTPUT_DIR / 'preprocessing_ablation_report.csv'
examples_path = OUTPUT_DIR / 'preprocessing_examples.csv'
train_clean_path = OUTPUT_DIR / f'train_cleaned_{best_config_name}.csv'
test_clean_path = OUTPUT_DIR / f'test_cleaned_{best_config_name}.csv'

exports = [
    (results_df, report_path),
    (examples_df, examples_path),
    (best_train, train_clean_path),
    (best_test, test_clean_path),
]

# Write every table without its index column.
for out_frame, out_path in exports:
    out_frame.to_csv(out_path, index=False)

print('Saved:')
for _, out_path in exports:
    print(out_path)