<h1 align="center">Lab 2: Sexism Identification in Twitter</h1>
<h2 align="center">Session 3. Transformers: Fine-tuning for multi-label classification</h2>
<h3 style="display:block; margin-top:5px;" align="center">Natural Language and Information Retrieval</h3>
<h3 style="display:block; margin-top:5px;" align="center">Degree in Data Science</h3>
<h3 style="display:block; margin-top:5px;" align="center">2024-2025</h3>    
<h3 style="display:block; margin-top:5px;" align="center">ETSInf. Universitat Politècnica de València</h3>
<br>

### Put your names here

- Kacper Multan
- Filip Polacik

In [None]:
# Install required libraries
# NOTE(review): no versions are pinned, so the installed releases depend on when this cell is run;
# consider pinning them and using `%pip` so the install targets the running kernel's environment.
!pip install transformers --upgrade
!pip install datasets accelerate
!pip install PyEvALL
!pip install scikit-learn
!pip install peft
# !pip install jupyter --upgrade
# !pip install ipywidgets --upgrade

## Import libraries

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import random
import os
import pandas as pd
import sys
import tempfile
import time
import matplotlib.pyplot as plt
import seaborn as sns
from peft import LoraConfig, get_peft_model

# Importing the required modules to use the ICM measure
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory
from pyevall.reports.reports import PyEvALLReport
from pyevall.utils.utils import PyEvALLUtils

from functools import partial

# Seed every random number generator used by the notebook so that runs are repeatable
SEED = 42
for _seed_generator in (random.seed, np.random.seed, torch.manual_seed):
    _seed_generator(SEED)
# CUDA generators are only seeded when a GPU is actually present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [None]:
# IF YOU USE GOOGLE COLAB -> COLAB=True
# Leave it as False to run locally with paths relative to this notebook.
COLAB = False

In [None]:
# Choose the root folder under which the lab material is looked up
if COLAB is not True:
    # Local run: resolve everything relative to the parent directory
    base_path = "../"
else:
    # Colab run: mount Google Drive and read the material from there
    from google.colab import drive
    drive.mount('/content/drive')
    base_path = "/content/drive/MyDrive/LNR/"

## Import readerEXIST2025 library

In [None]:
# Add the "Lab2-S1" folder under base_path to the module search path,
# then import the dataset reader from the readerEXIST2025 module.
library_path = os.path.join(base_path, "Lab2-S1")
sys.path.append(library_path)
from readerEXIST2025 import EXISTReader

In [None]:
# Folder containing the EXIST 2025 dataset; change it to match where the dataset is stored
dataset_path = "EXIST_2025_Dataset_V0.2/"

# The training and development partitions are read from that same folder
file_train, file_dev = (
    os.path.join(dataset_path, partition_file)
    for partition_file in ("EXIST2025_training.json", "EXIST2025_dev.json")
)

# One reader per partition
reader_train = EXISTReader(file_train)
reader_dev = EXISTReader(file_dev)

# Subtask 3 data for English ...
EnTrainTask3 = reader_train.get(lang="EN", subtask="3")
EnDevTask3 = reader_dev.get(lang="EN", subtask="3")
# ... and for Spanish
SpTrainTask3 = reader_train.get(lang="ES", subtask="3")
SpDevTask3 = reader_dev.get(lang="ES", subtask="3")

## Wrapper to compute the ICM measure

In [None]:
def ICMWrapper(pred, labels, multi=False, ids=None):
    """Compute the ICM score of `pred` against `labels` with PyEvALL.

    Predictions and gold labels are written to two temporary JSON files keyed by
    instance id, evaluated with PyEvALL, and the temporary files are removed
    afterwards, also when writing or evaluating fails.

    Args:
        pred: Iterable with one prediction per instance.
        labels: Iterable with one gold label per instance, aligned with `pred`.
        multi: When True, the multi-label format parameter is set and "O" is
            passed to PyEvALL as the fill label.
        ids: Optional sequence of instance ids indexed by position. When None,
            the positional index (as a string) is used as id.

    Returns:
        The entry for the ICM metric in the metrics of the PyEvALL report.

    NOTE(review): the PyEvALL calls used here (`PyEvALLUtils.write_json_file`,
    `PARAM_FORMAT`, `report.get_metrics()`) are kept exactly as written; confirm
    them against the installed PyEvALL version.
    """
    test = PyEvALLEvaluation()
    metrics = [MetricFactory.ICM.value]
    params = {}
    fillLabel = None
    if multi:
        params[PyEvALLUtils.PARAM_FORMAT] = PyEvALLUtils.PARAM_VALUE_FORMAT_MULTILABEL
        fillLabel = "O"

    # Pair every prediction with its gold label under a shared id
    pred_dict = {}
    gold_dict = {}
    for i, (p, l) in enumerate(zip(pred, labels)):
        curr_id = ids[i] if ids is not None else str(i)
        pred_dict[curr_id] = p
        gold_dict[curr_id] = l

    tmp_paths = []
    try:
        # Reserve the two file names and close the handles right away, so the files
        # are not written through a second handle while the first one is still open.
        for _ in range(2):
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as handle:
                tmp_paths.append(handle.name)
        pred_path, gold_path = tmp_paths

        PyEvALLUtils.write_json_file(pred_path, pred_dict)
        PyEvALLUtils.write_json_file(gold_path, gold_dict)

        report = test.evaluate(gold_path, pred_path, metrics, params, fillLabel)
    finally:
        # delete=False means nothing removes these files for us; clean up even on errors
        for path in tmp_paths:
            if os.path.exists(path):
                os.unlink(path)

    icm_score = report.get_metrics()[MetricFactory.ICM.value]
    return icm_score

## Custom Dataset and Pipeline

In [None]:
# Custom Dataset for multi-label classification
class CustomDataset(Dataset):
    """Torch dataset that tokenizes texts and exposes multi-hot label vectors.

    Every element of `data` is accessed with the keys 'text', 'labels_task3'
    and 'id'. The labels are binarized over the five fixed categories listed in
    `__init__`, always in that column order.

    Args:
        data: Indexable collection of examples (see the keys above).
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'
            tensors with a leading batch axis when called with return_tensors='pt'.
        max_length: Length every text is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Fixed class list so that the label columns are identical for every split
        self.mlb = MultiLabelBinarizer(classes=[
            'IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE', 'OBJECTIFICATION',
            'SEXUAL-VIOLENCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE'
        ])
        self.labels = self.mlb.fit_transform([d['labels_task3'] for d in data])

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized text, the float label vector and the id of example `idx`."""
        text = self.data[idx]['text']
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # Drop only the batch axis: a bare squeeze() would also remove the
            # sequence axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Float targets with an explicit dtype instead of the legacy constructor
            'labels': torch.tensor(self.labels[idx], dtype=torch.float32),
            # NOTE(review): the id travels with the tensors; confirm the collator in use
            # tolerates this extra, non-tensor entry.
            'id': self.data[idx]['id']
        }

# Compute metrics function
def compute_metrics(pred):
    """Compute ICM and macro-averaged precision/recall/F1 for a multi-label evaluation.

    Relies on the module-level `mlb` (set by the training pipeline) to map the
    multi-hot rows back to class names, and on `ICMWrapper` for the ICM score.

    Args:
        pred: Object exposing `predictions` (logits, or a tuple whose first
            element holds the logits) and `label_ids` (multi-hot gold labels).

    Returns:
        dict with the keys 'icm', 'macro_f1', 'precision' and 'recall'.
    """
    logits = pred.predictions
    # When several outputs are returned they arrive as a tuple; the logits come first
    if isinstance(logits, tuple):
        logits = logits[0]
    labels = pred.label_ids
    # Apply sigmoid and threshold at 0.5 for multi-label
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    # Compute ICM on class names; the name array is built once instead of once per row
    class_names = np.array(mlb.classes_)
    pred_labels = [class_names[p.astype(bool)].tolist() for p in preds]
    true_labels = [class_names[l.astype(bool)].tolist() for l in labels]
    icm = ICMWrapper(pred_labels, true_labels, multi=True, ids=None)
    # Compute Macro-F1; zero_division=0 keeps the score of never-predicted classes at 0
    # (the value already used by default) without a warning on every evaluation
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    return {
        'icm': icm,
        'macro_f1': f1,
        'precision': precision,
        'recall': recall
    }

# Pipeline for training and evaluation
def sexism_classification_pipeline_task3(train_data, dev_data, model_name, technique='fine-tune', config='conservative'):
    """Train a 5-label multi-label classifier on `train_data` and evaluate it on `dev_data`.

    Args:
        train_data: Training examples, handed unchanged to CustomDataset.
        dev_data: Development examples, handed unchanged to CustomDataset.
        model_name: Hugging Face identifier used for both the tokenizer and the model.
        technique: 'lora' wraps the model with LoRA adapters on the "query" and
            "value" modules; any other value leaves the loaded model as it is.
        config: 'conservative' selects learning rate 2e-5, batch size 16, weight
            decay 0.01, LoRA r=8 / alpha=16 and early-stopping patience 3. Any
            other value selects 5e-5, 8, 0.1, r=16 / alpha=32 and patience 2.
            'aggressive' additionally accumulates gradients over 2 steps.

    Returns:
        (model, results). `results` holds the metrics returned by
        `trainer.evaluate()` (keys prefixed with "eval_"), plain-name copies
        'icm', 'macro_f1', 'precision' and 'recall', and the extra keys
        'training_time' (seconds spent in `trainer.train()`), 'per_label_f1'
        (class name -> F1), 'epochs' and 'time_per_epoch'.

    Side effects:
        Rebinds the module-level `mlb` used by `compute_metrics`, and writes
        checkpoints under "./results_<model>_<technique>_<config>" and logs
        under "./logs".
    """
    import inspect  # local import: only needed to probe the TrainingArguments signature

    global mlb  # For compute_metrics
    is_conservative = config == 'conservative'

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=5,
        problem_type="multi_label_classification"
    )

    # Apply LoRA if specified
    if technique == 'lora':
        lora_config = LoraConfig(
            r=8 if is_conservative else 16,
            lora_alpha=16 if is_conservative else 32,
            target_modules=["query", "value"],
            lora_dropout=0.1,
            bias="none",
            task_type="SEQ_CLS"
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    # Create datasets
    train_dataset = CustomDataset(train_data, tokenizer)
    dev_dataset = CustomDataset(dev_data, tokenizer)
    mlb = train_dataset.mlb  # For compute_metrics

    # `evaluation_strategy` was renamed to `eval_strategy` and the old name is rejected by
    # recent transformers releases; use whichever name the installed version accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = 'eval_strategy' if 'eval_strategy' in accepted_args else 'evaluation_strategy'

    # Define training arguments
    training_kwargs = dict(
        output_dir=f"./results_{model_name.split('/')[-1]}_{technique}_{config}",
        learning_rate=2e-5 if is_conservative else 5e-5,
        per_device_train_batch_size=16 if is_conservative else 8,
        per_device_eval_batch_size=16 if is_conservative else 8,
        num_train_epochs=10,
        weight_decay=0.01 if is_conservative else 0.1,
        save_strategy="epoch",
        # A checkpoint is written every epoch for every run; keep only a couple of them
        # on disk so that many consecutive runs do not fill it up.
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        logging_dir='./logs',
        logging_steps=10,
        fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
        gradient_accumulation_steps=2 if config == 'aggressive' else 1
    )
    training_kwargs[eval_strategy_key] = "epoch"
    training_args = TrainingArguments(**training_kwargs)

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3 if is_conservative else 2)]
    )

    # Train model; only this call is timed, so that model download/loading and the final
    # evaluation passes do not inflate the reported training time
    train_start = time.time()
    trainer.train()
    training_time = time.time() - train_start

    # Evaluate model
    results = trainer.evaluate()
    # The Trainer prefixes every metric with "eval_"; also expose the metrics produced by
    # compute_metrics under their plain names, which is how the rest of the notebook
    # looks them up.
    for metric_name in ('icm', 'macro_f1', 'precision', 'recall'):
        prefixed_name = f'eval_{metric_name}'
        if prefixed_name in results and metric_name not in results:
            results[metric_name] = results[prefixed_name]

    # Compute per-label F1 scores
    predictions = trainer.predict(dev_dataset)
    logits = predictions.predictions
    labels = predictions.label_ids
    preds = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    per_label_f1 = precision_recall_fscore_support(labels, preds, average=None, zero_division=0)[2]
    per_label_f1_dict = {label: float(f1) for label, f1 in zip(mlb.classes_, per_label_f1)}

    # Store training time
    epochs_run = trainer.state.epoch or 0
    results['training_time'] = training_time
    results['per_label_f1'] = per_label_f1_dict
    results['epochs'] = trainer.state.epoch
    results['time_per_epoch'] = training_time / epochs_run if epochs_run > 0 else training_time

    return model, results

## Train English Models

In [None]:
# Store all results
all_results = []


def _train_english(model_label, model_name, technique_label, config_label):
    """Train one English configuration, append its metrics to `all_results` and return (model, results)."""
    technique_arg = {'Fine-Tune': "fine-tune", 'LoRA': "lora"}[technique_label]
    config_arg = {'Conservative': "conservative", 'Aggressive': "aggressive"}[config_label]
    print(f"Training {model_label} English ({technique_label}, {config_label})")
    trained_model, metrics = sexism_classification_pipeline_task3(
        EnTrainTask3, EnDevTask3, model_name=model_name, technique=technique_arg, config=config_arg
    )
    all_results.append({
        'language': 'English', 'model': model_label, 'technique': technique_label, 'config': config_label,
        **metrics
    })
    return trained_model, metrics


# NOTE(review): every trained model stays bound to its own variable; if memory becomes a
# problem, consider dropping the models that are not needed afterwards.

# English: BERT
bert_en_ft_cons_model, bert_en_ft_cons_results = _train_english(
    'BERT', "bert-base-uncased", 'Fine-Tune', 'Conservative')
bert_en_ft_aggr_model, bert_en_ft_aggr_results = _train_english(
    'BERT', "bert-base-uncased", 'Fine-Tune', 'Aggressive')
bert_en_lora_cons_model, bert_en_lora_cons_results = _train_english(
    'BERT', "bert-base-uncased", 'LoRA', 'Conservative')
bert_en_lora_aggr_model, bert_en_lora_aggr_results = _train_english(
    'BERT', "bert-base-uncased", 'LoRA', 'Aggressive')

# English: RoBERTa
roberta_en_ft_cons_model, roberta_en_ft_cons_results = _train_english(
    'RoBERTa', "roberta-base", 'Fine-Tune', 'Conservative')
roberta_en_ft_aggr_model, roberta_en_ft_aggr_results = _train_english(
    'RoBERTa', "roberta-base", 'Fine-Tune', 'Aggressive')
roberta_en_lora_cons_model, roberta_en_lora_cons_results = _train_english(
    'RoBERTa', "roberta-base", 'LoRA', 'Conservative')
roberta_en_lora_aggr_model, roberta_en_lora_aggr_results = _train_english(
    'RoBERTa', "roberta-base", 'LoRA', 'Aggressive')

## Train Spanish Models

In [None]:
def _train_spanish(model_label, model_name, technique_label, config_label):
    """Train one Spanish configuration, append its metrics to `all_results` and return (model, results)."""
    technique_arg = {'Fine-Tune': "fine-tune", 'LoRA': "lora"}[technique_label]
    config_arg = {'Conservative': "conservative", 'Aggressive': "aggressive"}[config_label]
    print(f"Training {model_label} Spanish ({technique_label}, {config_label})")
    trained_model, metrics = sexism_classification_pipeline_task3(
        SpTrainTask3, SpDevTask3, model_name=model_name, technique=technique_arg, config=config_arg
    )
    all_results.append({
        'language': 'Spanish', 'model': model_label, 'technique': technique_label, 'config': config_label,
        **metrics
    })
    return trained_model, metrics


# Spanish: BETO
beto_es_ft_cons_model, beto_es_ft_cons_results = _train_spanish(
    'BETO', "dccuchile/bert-base-spanish-wwm-uncased", 'Fine-Tune', 'Conservative')
beto_es_ft_aggr_model, beto_es_ft_aggr_results = _train_spanish(
    'BETO', "dccuchile/bert-base-spanish-wwm-uncased", 'Fine-Tune', 'Aggressive')
beto_es_lora_cons_model, beto_es_lora_cons_results = _train_spanish(
    'BETO', "dccuchile/bert-base-spanish-wwm-uncased", 'LoRA', 'Conservative')
beto_es_lora_aggr_model, beto_es_lora_aggr_results = _train_spanish(
    'BETO', "dccuchile/bert-base-spanish-wwm-uncased", 'LoRA', 'Aggressive')

# Spanish: RoBERTa-BNE
roberta_bne_es_ft_cons_model, roberta_bne_es_ft_cons_results = _train_spanish(
    'RoBERTa-BNE', "PlanTL-GOB-ES/roberta-base-bne", 'Fine-Tune', 'Conservative')
roberta_bne_es_ft_aggr_model, roberta_bne_es_ft_aggr_results = _train_spanish(
    'RoBERTa-BNE', "PlanTL-GOB-ES/roberta-base-bne", 'Fine-Tune', 'Aggressive')
roberta_bne_es_lora_cons_model, roberta_bne_es_lora_cons_results = _train_spanish(
    'RoBERTa-BNE', "PlanTL-GOB-ES/roberta-base-bne", 'LoRA', 'Conservative')
roberta_bne_es_lora_aggr_model, roberta_bne_es_lora_aggr_results = _train_spanish(
    'RoBERTa-BNE', "PlanTL-GOB-ES/roberta-base-bne", 'LoRA', 'Aggressive')

## Show Results

In [None]:
# Convert results to DataFrame
results_df = pd.DataFrame(all_results)

# Trainer.evaluate() reports its metrics with an "eval_" prefix (eval_icm, eval_macro_f1, ...).
# Make them available under the plain names used below when only the prefixed ones exist,
# instead of failing with a KeyError when the summary columns are selected.
for metric in ('icm', 'macro_f1', 'precision', 'recall'):
    prefixed = f'eval_{metric}'
    if metric not in results_df.columns and prefixed in results_df.columns:
        results_df[metric] = results_df[prefixed]

# Simplify per-label F1 into separate columns
for label in ['IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE', 'OBJECTIFICATION',
              'SEXUAL-VIOLENCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE']:
    results_df[f'f1_{label}'] = results_df['per_label_f1'].apply(lambda x: x.get(label, 0.0))

# Save results to CSV
results_df.to_csv('sexism_classification_results.csv', index=False)

# Display summary table
summary_df = results_df[['language', 'model', 'technique', 'config', 'icm', 'macro_f1', 'time_per_epoch']]
print("\nResults Summary:\n")
print(summary_df.to_string(index=False))

# Group by language, then by technique, and print one line per trained configuration
for lang in ['English', 'Spanish']:
    print(f"\n{lang}\n")
    for tech in ['Fine-Tune', 'LoRA']:
        print(f"{tech}")
        lang_tech_df = summary_df[(summary_df['language'] == lang) & (summary_df['technique'] == tech)]
        for _, row in lang_tech_df.iterrows():
            print(f"\t{row['model']} ({row['config']}): ICM:{row['icm']:.4f}  Macro-F1:{row['macro_f1']:.4f} ({row['time_per_epoch']:.2f}s per epoch)")

## Visualizations

In [None]:
def _metric_column(frame, metric):
    """Return the column of `frame` holding `metric`, accepting the "eval_"-prefixed name as well."""
    return metric if metric in frame.columns else f'eval_{metric}'


def _plot_metric_bars(frame, metric, xlabel, title, palette, filename):
    """Draw one horizontal bar per (model, technique / config) for `metric`, save it and show it."""
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=_metric_column(frame, metric), y='model', hue='variant',
                palette=palette, data=frame, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Model')
    ax.legend(title='Technique / Config')
    fig.tight_layout()
    fig.savefig(filename)
    plt.show()


# sns.barplot has no `style` / `size` / `sizes` parameters (unknown keywords are forwarded to
# matplotlib and rejected), so technique and configuration are combined into a single hue level.
plot_df = results_df.assign(variant=results_df['technique'] + ' / ' + results_df['config'])

# Bar Plot: Macro-F1 by Model, Technique, and Language
_plot_metric_bars(plot_df, 'macro_f1', 'Macro-F1',
                  'Macro-F1 Comparison by Model, Technique, and Configuration',
                  'viridis', 'macro_f1_comparison.png')

# Bar Plot: ICM by Model, Technique, and Language
_plot_metric_bars(plot_df, 'icm', 'ICM',
                  'ICM Comparison by Model, Technique, and Configuration',
                  'magma', 'icm_comparison.png')

# Per-Label F1 Scores Heatmap
per_label_df = results_df[['language', 'model', 'technique', 'config',
                          'f1_IDEOLOGICAL-INEQUALITY', 'f1_STEREOTYPING-DOMINANCE',
                          'f1_OBJECTIFICATION', 'f1_SEXUAL-VIOLENCE',
                          'f1_MISOGYNY-NON-SEXUAL-VIOLENCE']]
per_label_df = per_label_df.melt(id_vars=['language', 'model', 'technique', 'config'],
                                 var_name='label', value_name='f1')
# Literal (non-regex) replacement, so the result does not depend on the pandas default
per_label_df['label'] = per_label_df['label'].str.replace('f1_', '', regex=False)

plt.figure(figsize=(10, 8))
pivot_table = per_label_df.pivot_table(values='f1', index=['language', 'model', 'technique', 'config'],
                                      columns='label')
sns.heatmap(pivot_table, annot=True, cmap='YlGnBu', fmt='.3f')
plt.title('Per-Label F1 Scores')
plt.tight_layout()
plt.savefig('per_label_f1_heatmap.png')
plt.show()

# Note: Learning curves require trainer logs, which are not easily accessible here.
# To add learning curves, modify the pipeline to save trainer.state.log_history to a file.
# Example code (if logs are available):
# plt.plot(logs['epoch'], logs['eval_loss'], label='Validation Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# plt.show()