<a href="https://colab.research.google.com/github/cec70/HEARTS-French-Adaptation/blob/main/HEARTS_EMGSD_ALBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HEARTS Baseline: ALBERT-V2 (EMGSD Dataset Only)

## 0. Installation

In [None]:
# Clone the HEARTS repository and make its root the working directory.
# NOTE(review): the relative CSV paths used later in this notebook presumably
# resolve inside this checkout — confirm.
!git clone https://github.com/holistic-ai/HEARTS-Text-Stereotype-Detection.git
%cd HEARTS-Text-Stereotype-Detection
# codecarbon provides the EmissionsTracker imported in the next section.
!pip install codecarbon --quiet


## 1. Setup and imports

In [None]:
import pandas as pd
import numpy as np
import os
import logging

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support, balanced_accuracy_score

from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline

from codecarbon import EmissionsTracker

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)


## 2. Data loading

In [None]:
def data_loader(csv_file_path, labelling_criteria, dataset_name, sample_size=1000000, num_examples=5):
    """
    Load one dataset, binarise its labels, optionally subsample it,
    and return a stratified 80/20 train/test split.

    Args:
        csv_file_path: Path to a CSV file that contains at least the
            columns 'text', 'label' and 'group'.
        labelling_criteria: Value of the 'label' column that becomes
            class 1; every other value becomes class 0.
        dataset_name: Name written to the added 'data_name' column and
            shown in the printed summary.
        sample_size: Maximum number of rows to keep. A larger dataset is
            reduced to exactly this many rows by a stratified sample.
        num_examples: Not used by this function; kept so that existing
            callers keep working.

    Returns:
        Tuple (train_df, test_df) of pandas DataFrames.
    """

    df = pd.read_csv(csv_file_path, usecols=['text', 'label', 'group'])

    # Binary label mapping: 1 for the target class, 0 for everything else.
    df['label'] = (df['label'] == labelling_criteria).astype('int64')
    if not df['label'].any():
        # A misspelt criterion would otherwise silently yield a one-class dataset.
        logging.warning(
            "No rows in %s carry the label %r; every example is labelled 0.",
            dataset_name, labelling_criteria
        )

    df['data_name'] = dataset_name

    # Sampling: pass the absolute row count so that float rounding of a
    # fraction cannot drop a row from the requested sample size.
    if sample_size < len(df):
        df, _ = train_test_split(
            df,
            train_size=int(sample_size),
            stratify=df['label'],
            random_state=42
        )

    # Train/test split (fixed random_state keeps the split repeatable)
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        stratify=df['label'],
        random_state=42
    )

    print(f"\nLoaded {dataset_name}")
    print(f"Train size: {len(train_df)}")
    print(f"Test size: {len(test_df)}\n")

    return train_df, test_df

## 3. Merge datasets

In [None]:
def merge_datasets(train_a, test_a, train_b, test_b, num_examples=5):
    """
    Stack two train/test pairs row-wise and print the combined sizes.

    The train frames are concatenated with each other, as are the test
    frames; both results get a fresh 0..n-1 index. `num_examples` is
    accepted but not used.

    Returns:
        Tuple (merged_train, merged_test).
    """
    train_parts = [train_a, train_b]
    test_parts = [test_a, test_b]

    combined_train, combined_test = (
        pd.concat(parts, ignore_index=True)
        for parts in (train_parts, test_parts)
    )

    n_train = len(combined_train)
    n_test = len(combined_test)

    print("\nMerged Dataset:")
    print(f"Train size: {n_train}")
    print(f"Test size: {n_test}\n")

    return combined_train, combined_test

## 4. Training function (ALBERT-V2)

In [None]:
def train_model(train_df, model_path, batch_size, epochs, lr, output_base, dataset_name, seed):
    """
    Fine-tune a sequence-classification model and save it to disk.

    The incoming data is split 80/20 (stratified on 'label') into training
    and validation parts. The model is trained with the Hugging Face
    Trainer, evaluated and checkpointed once per epoch, and the model left
    in the trainer afterwards (load_best_model_at_end=True) is saved to
    `output_base/dataset_name`. CodeCarbon tracks emissions from before the
    model is loaded until training has finished or failed.

    Args:
        train_df: DataFrame with at least the columns 'text' and 'label'.
        model_path: Model identifier or path given to `from_pretrained`.
        batch_size: Per-device batch size for training and evaluation.
        epochs: Number of training epochs.
        lr: Learning rate.
        output_base: Parent directory for model outputs.
        dataset_name: Sub-directory name inside `output_base`.
        seed: Seed applied to the Python, NumPy and PyTorch RNGs and passed
            to the Trainer.

    Returns:
        Path of the directory the model was saved to.
    """
    # Imported here because the notebook's import cell does not bring in set_seed.
    from transformers import set_seed

    # Seed every RNG *before* the model is created: the classification head
    # is freshly initialised, so seeding NumPy alone is not reproducible.
    set_seed(seed)
    num_labels = len(train_df['label'].unique())

    print(f"\nNumber of labels: {num_labels}")

    tracker = EmissionsTracker()
    tracker.start()
    emissions = None

    try:
        # Load model + tokenizer
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            num_labels=num_labels,
            ignore_mismatched_sizes=True
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Special GPT fix: reuse the EOS token for padding
        if model_path.startswith("gpt"):
            tokenizer.pad_token = tokenizer.eos_token
            model.config.pad_token_id = tokenizer.pad_token_id

        def tokenize(batch):
            return tokenizer(batch["text"], padding=True, truncation=True, max_length=512)

        # Train/val split (random_state is fixed at 42, independent of `seed`)
        train_df, val_df = train_test_split(
            train_df,
            test_size=0.2,
            stratify=train_df['label'],
            random_state=42
        )

        train_ds = Dataset.from_pandas(train_df).map(tokenize, batched=True)
        train_ds = train_ds.map(lambda x: {"labels": x["label"]})

        val_ds = Dataset.from_pandas(val_df).map(tokenize, batched=True)
        val_ds = val_ds.map(lambda x: {"labels": x["label"]})

        # Metrics reported after every evaluation pass
        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            preds = np.argmax(logits, axis=1)
            precision, recall, f1, _ = precision_recall_fscore_support(
                labels, preds, average='macro'
            )
            bal_acc = balanced_accuracy_score(labels, preds)
            return {
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "balanced_accuracy": bal_acc
            }

        # Output folder
        output_dir = os.path.join(output_base, dataset_name)
        os.makedirs(output_dir, exist_ok=True)

        args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=lr,
            eval_strategy="epoch",
            save_strategy="epoch",
            weight_decay=0.01,
            load_best_model_at_end=True,
            save_total_limit=1,
            seed=seed
        )

        # NOTE(review): recent transformers releases deprecate `tokenizer=`
        # in favour of `processing_class=` — confirm against the installed version.
        trainer = Trainer(
            model=model,
            args=args,
            tokenizer=tokenizer,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics
        )

        trainer.train()
        trainer.save_model(output_dir)
    finally:
        # Always stop the tracker, even when loading or training fails.
        emissions = tracker.stop()

    # stop() may return None when no measurement could be taken.
    if emissions is None:
        print("\nEstimated emissions: unavailable")
    else:
        print(f"\nEstimated emissions: {emissions:.4f} kg")

    return output_dir

## 5. Evaluation function

In [None]:
def evaluate_model(test_df, model_dir, results_base, dataset_name, seed):
    """
    Run a saved classifier over a test set and store the results.

    Writes two files to `results_base/dataset_name`:
    'full_results.csv' (one row per example with the predicted label, the
    score of that label, the true label, group and source dataset) and
    'classification_report.csv' (scikit-learn classification report).

    Args:
        test_df: DataFrame with the columns 'text', 'label', 'group' and
            'data_name'.
        model_dir: Directory containing the saved model and tokenizer.
        results_base: Parent directory for result files.
        dataset_name: Sub-directory name inside `results_base`.
        seed: Seed applied to NumPy's global RNG.

    Returns:
        The classification report as a DataFrame.
    """

    np.random.seed(seed)
    num_labels = len(test_df['label'].unique())
    print(f"\nEvaluating - Number of labels: {num_labels}")

    # Load the model with the head it was saved with. Overriding num_labels
    # from the test data (with ignore_mismatched_sizes=True) would silently
    # replace the trained head by a random one if the counts ever differed.
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # device=-1 runs the pipeline on the CPU
    pipe = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=-1
    )

    # Predictions: request the scores of all classes and truncate long texts
    # to the same 512-token limit that was used during training.
    predictions = pipe(
        test_df['text'].tolist(),
        top_k=None,
        truncation=True,
        max_length=512
    )

    # Highest-scoring class per example, computed once and reused below.
    best = [max(scores, key=lambda s: s['score']) for scores in predictions]
    # Map label names back to ids through the model config instead of
    # parsing the "LABEL_<n>" naming pattern.
    label2id = model.config.label2id
    pred_labels = [int(label2id[b['label']]) for b in best]
    pred_probs = [b['score'] for b in best]

    # Save results
    out_dir = os.path.join(results_base, dataset_name)
    os.makedirs(out_dir, exist_ok=True)

    results = pd.DataFrame({
        "text": test_df['text'],
        "predicted_label": pred_labels,
        "predicted_probability": pred_probs,
        "actual_label": test_df['label'],
        "group": test_df['group'],
        "dataset_name": test_df['data_name']
    })

    results.to_csv(os.path.join(out_dir, "full_results.csv"), index=False)

    # Classification report
    report = classification_report(test_df['label'], pred_labels, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv(os.path.join(out_dir, "classification_report.csv"))

    print("\nClassification Report:")
    # display() is provided by the IPython/notebook environment.
    display(report_df)

    return report_df

## 6. Load and merge datasets

In [None]:
# Load the three source datasets; "stereotype" is the positive class in each.
train_wq, test_wq = data_loader(
    "Model Training and Evaluation/Winoqueer - GPT Augmentation.csv",
    "stereotype",
    "WinoQueer",
)
train_sg, test_sg = data_loader(
    "Model Training and Evaluation/SeeGULL - GPT Augmentation.csv",
    "stereotype",
    "SeeGULL",
)
train_mgsd, test_mgsd = data_loader(
    "Model Training and Evaluation/MGSD.csv",
    "stereotype",
    "MGSD",
)

# Combine in two steps: WinoQueer + SeeGULL first, then add MGSD.
train_wq_sg, test_wq_sg = merge_datasets(
    train_a=train_wq, test_a=test_wq, train_b=train_sg, test_b=test_sg
)
train_merged, test_merged = merge_datasets(
    train_a=train_wq_sg, test_a=test_wq_sg, train_b=train_mgsd, test_b=test_mgsd
)

# Report the (rows, columns) shape of the final frames.
print("\nFinal merged dataset shapes:")
print(f"Train: {train_merged.shape}")
print(f"Test: {test_merged.shape}")


## 7. Train ALBERT-V2 on the merged dataset

In [None]:
# Hyperparameters and output locations for the ALBERT baseline run.
albert_train_config = {
    "model_path": "albert/albert-base-v2",
    "batch_size": 64,
    "epochs": 6,
    "lr": 2e-5,
    "output_base": "model_output_albert_baseline",
    "dataset_name": "merged_wq_seegull_mgsd",
    "seed": 42,
}

# Fine-tune on the merged training data; the returned path is reused for evaluation.
model_output_dir = train_model(train_df=train_merged, **albert_train_config)

#### Results

| Epoch	| Training Loss	| Validation Loss	| Precision	| Recall | F1	|Balanced Accuracy |
| --- | --- | --- | --- | --- | --- | --- |
| 1	| 0.469300 | 0.415653	| 0.768429 | 0.795270 | 0.772058 | 0.795270 |
| 2	| 0.345400 | 0.365308	| 0.810502 | 0.817595 | 0.813748 | 0.817595 |
| 3	| 0.258900 | 0.394409	| 0.812467 | 0.824586 | 0.817594 | 0.824586 |
| 4	| 0.186800 | 0.458717	| 0.815537 | 0.818966 | 0.817186 | 0.818966 |
| 5	| 0.114500 | 0.574235	| 0.817568 | 0.816777 | 0.817169 | 0.816777 |
| 6	| 0.061600 | 0.732173	| 0.817217 | 0.814263 | 0.815695 | 0.814263 |

Estimated emissions: 0.0318 kg


## 8. Evaluation on test set

In [None]:
# Where the evaluation outputs for this run are written.
albert_eval_config = {
    "results_base": "results_albert_baseline",
    "dataset_name": "merged_wq_seegull_mgsd",
    "seed": 42,
}

# Score the saved model on the merged test split.
results = evaluate_model(
    test_df=test_merged,
    model_dir=model_output_dir,
    **albert_eval_config,
)

#### Results

Classification report:

|index|precision|recall|f1-score|support|
|---|---|---|---|---|
|0|0\.8779|0\.8601|0\.8689|7540\.0|
|1|0\.7398|0\.7688|0\.7540|3901\.0|
|accuracy|0\.8289|0\.8289|0\.8289|0\.8289|
|macro avg|0\.8088|0\.8144|0\.8114|11441\.0|
|weighted avg|0\.8308|0\.8289|0\.8297|11441\.0|

Macro F1: **81.14%**

#### Discussion & Conclusion

The ALBERT baseline trained and tested on the EMGSD dataset achieved a macro-F1 score of **81.14%**, which is highly consistent with the **81.5%** macro-F1 reported in the original HEARTS paper. The difference of **0.36 percentage points** is well within the requirement of reproducing results within **±5%** of the paper's baseline performance.

Overall, the successful replication of the baseline demonstrates both the stability of the initial approach and the robustness of the pipeline implemented.