<h1 align="center">Lab 2:  Sexism Identification in Twitter</h1>
<h2 align="center">Session 3. Transformers: Fine-tuning for multi-label classification
<h3 style="display:block; margin-top:5px;" align="center">Natural Language and Information Retrieval</h3>
<h3 style="display:block; margin-top:5px;" align="center">Degree in Data Science</h3>
<h3 style="display:block; margin-top:5px;" align="center">2024-2025</h3>    
<h3 style="display:block; margin-top:5px;" align="center">ETSInf. Universitat Politècnica de València</h3>
<br>

### Put your names here

- Kacper Multan
- Filip Polacik

In [1]:
 !pip install transformers --upgrade
 !pip  install datasets accelerate
 !pip install PyEvALL
 !pip install scikit-learn
 !pip install peft
 #!pip install jupyter --upgrade
 #!pip install ipywidgets --upgrade

Collecting transformers
  Downloading transformers-4.51.2-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.2-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.1
    Uninstalling transformers-4.51.1:
      Successfully uninstalled transformers-4.51.1
Successfully installed transformers-4.51.2


## Many libraries

In [2]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import  AutoTokenizer, AutoModelForSequenceClassification,  Trainer, TrainingArguments,  EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import random
import os
import pandas as pd
import sys
import tempfile
import time

#Importing the required modules to use the ICM measure

from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory
from pyevall.reports.reports import PyEvALLReport
from pyevall.utils.utils import PyEvALLUtils

from functools import partial

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# IF YOU USE GOOGLE COLAB -> COLAB=True
COLAB = False

In [4]:
if COLAB is True:
  from google.colab import drive
  drive.mount('/content/drive')
  base_path = "/content/drive/MyDrive/LNR/"
else:
  base_path = "../"

## Import readerEXIST2025 library

In [5]:
library_path = os.path.join(base_path, "Lab2-S1")
sys.path.append(library_path)
from readerEXIST2025 import EXISTReader

In [6]:
# path to the dataset, adapt this path wherever you have the dataset
dataset_path = "EXIST_2025_Dataset_V0.2/"

file_train = os.path.join(dataset_path, "EXIST2025_training.json")
file_dev = os.path.join(dataset_path, "EXIST2025_dev.json")

reader_train = EXISTReader(file_train)
reader_dev = EXISTReader(file_dev)

EnTrainTask3, EnDevTask3 = reader_train.get(lang="EN", subtask="3"), reader_dev.get(lang="EN", subtask="3")
SpTrainTask3, SpDevTask3 = reader_train.get(lang="ES", subtask="3"), reader_dev.get(lang="ES", subtask="3")

# Wrapper to compute ICM measure

In [7]:
def ICMWrapper(pred, labels, multi=False,ids=None):
    test = PyEvALLEvaluation()
    metrics=[MetricFactory.ICM.value]
    params= dict()
    fillLabel=None
    if multi:
        params[PyEvALLUtils.PARAM_REPORT]="embedded"
        hierarchy={"True":['IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'OBJECTIFICATION', 'SEXUAL-VIOLENCE'],
        "False":[]}
        params[PyEvALLUtils.PARAM_HIERARCHY]=hierarchy
        fillLabel = lambda x: ["False"] if len(x)== 0 else x
    else:
        params[PyEvALLUtils.PARAM_REPORT]="simple"
        fillLabel = lambda x: str(x)


    truth_name, predict_name=None, None
    if ids is None:
        ids=list(range(len(labels)))

    with tempfile.NamedTemporaryFile(mode='w', delete=False, encoding='utf-8') as truth:
        truth_name=truth.name
        truth_df=pd.DataFrame({'test_case': ['EXIST2025']*len(labels),
                        'id': [str(x) for x in ids],
                        'value': [fillLabel(x) for x in labels]})
        if multi==True:
            truth_df=truth_df.astype('object')
        truth.write(truth_df.to_json(orient="records"))

    with  tempfile.NamedTemporaryFile(mode='w', delete=False) as predict:
        predict_name=predict.name
        predict_df=pd.DataFrame({'test_case': ['EXIST2025']*len(pred),
                        'id': [str(x) for x in ids],
                        'value': [fillLabel(x) for x in pred]})
        if multi==True:
            predict_df=predict_df.astype('object')
        predict.write(predict_df.to_json(orient="records"))

    report = test.evaluate(predict_name, truth_name, metrics, **params)
    os.unlink(truth_name)
    os.unlink(predict_name)

    icm = None
    if 'metrics' in report.report:
        if 'ICM' in report.report["metrics"]: icm=float(report.report["metrics"]['ICM']["results"]["average_per_test_case"])
    return icm



## Set the seed

In [8]:
def set_seed(seed=1234):
    """
    Sets the seed to make everything deterministic, for reproducibility of experiments
    Parameters:
    seed: the number to set the seed to
    Return: None
    """
    # Random seed
    random.seed(seed)
    # Numpy seed
    np.random.seed(seed)
    # Torch seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    # os seed
    os.environ['PYTHONHASHSEED'] = str(seed)

## Dataset class

In [9]:
class SexismDatasetMulti(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_len=128, pad="max_length", trunc=True,rt='pt'):
        self.texts = texts.tolist()
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad = pad
        self.trunc = trunc
        self.rt = rt


    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,padding=self.pad, truncation=self.trunc,
            return_tensors=self.rt
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float),
            'id': torch.tensor(self.ids[idx], dtype=torch.long)}

# Metrics for subtask 3

In [10]:
def compute_metrics_3(pred, lencoder):
    labels = pred.label_ids
    #preds = pred.predictions.argmax(-1)
    preds = torch.sigmoid(torch.tensor(pred.predictions)).numpy()
    preds_binary = (preds >= 0.5).astype(int)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds_binary, average=None, zero_division=0
    )
    acc = accuracy_score(labels, preds_binary)
    icm= ICMWrapper(lencoder.inverse_transform(preds_binary), lencoder.inverse_transform(labels), multi=True)
    # Macro averages
    precision_macro = np.mean(precision)
    recall_macro = np.mean(recall)
    f1_macro = np.mean(f1)
    metrics = {}
    metrics.update({
        'precision_macro': precision_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'ICM': icm
    })
    return metrics

# Pipeline

In [11]:
def sexism_classification_pipeline_task3(trainInfo, devInfo, testInfo=None, model_name='roberta-base', nlabels=5, ptype="multi_label_classification", **args):
    # Model and Tokenizer
    labelEnc= MultiLabelBinarizer()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype)

    # Prepare datasets
    train_dataset = SexismDatasetMulti(trainInfo[1], labelEnc.fit_transform(trainInfo[2]),[int(x) for x in trainInfo[0]], tokenizer )
    val_dataset = SexismDatasetMulti(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    # Training Arguments
    training_args = TrainingArguments(
        report_to="none", # alt: "wandb", "tensorboard" "comet_ml" "mlflow" "clearml"
        output_dir= args.get('output_dir', './results'),
        num_train_epochs= args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay',0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        eval_strategy=args.get('eval_strategy','epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model',"ICM")
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        #compute_metrics=compute_metrics_3,
        compute_metrics = partial(compute_metrics_3, lencoder=labelEnc),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience",3))]
    )

    # Fine-tune the model
    trainer.train()

    # Evaluate on validation set
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    if testInfo is not None:
      # Prepare test dataset for prediction
      test_dataset = SexismDatasetMulti(testInfo[1], [[0,0,0,0,0]] * len(testInfo[1]),  [int(x) for x in testInfo[0]],   tokenizer)

      # Predict test set labels
      predictions = trainer.predict(test_dataset)
      #predicted_labels = np.argmax(predictions.predictions, axis=1)
      predicted_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
      predicted_labels = (predicted_probs >= 0.5).astype(int)

      # Create submission DataFrame
      submission_df = pd.DataFrame({
          'id': testInfo[0],
          'label': labelEnc.inverse_transform(predicted_labels),
          "test_case": ["EXIST2025"]*len(predicted_labels)

      })
      submission_df.to_csv('sexism_predictions_task3.csv', index=False)
      print("Prediction TASK3 completed. Results saved to sexism_predictions_task2.csv")
      return model, submission_df
    return model, eval_results

# LoRA pipeline

In [12]:
from peft import get_peft_model, LoraConfig, TaskType

def sexism_classification_pipeline_task3_lora(trainInfo, devInfo, testInfo=None, model_name='roberta-base', nlabels=5, ptype="multi_label_classification", **args):
    labelEnc = MultiLabelBinarizer()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=nlabels,
        problem_type=ptype
    )

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none"
    )

    model = get_peft_model(base_model, peft_config)

    train_dataset = SexismDatasetMulti(trainInfo[1], labelEnc.fit_transform(trainInfo[2]), [int(x) for x in trainInfo[0]], tokenizer)
    val_dataset = SexismDatasetMulti(devInfo[1], labelEnc.transform(devInfo[2]), [int(x) for x in devInfo[0]], tokenizer)

    training_args = TrainingArguments(
        report_to="none",
        output_dir=args.get('output_dir', './results'),
        num_train_epochs=args.get('num_train_epochs', 5),
        learning_rate=args.get('learning_rate', 5e-5),
        per_device_train_batch_size=args.get('per_device_train_batch_size', 16),
        per_device_eval_batch_size=args.get('per_device_eval_batch_size', 64),
        warmup_steps=args.get('warmup_steps', 500),
        weight_decay=args.get('weight_decay', 0.01),
        logging_dir=args.get('logging_dir', './logs'),
        logging_steps=args.get('logging_steps', 10),
        evaluation_strategy=args.get('eval_strategy', 'epoch'),
        save_strategy=args.get('save_strategy', "epoch"),
        save_total_limit=args.get('save_total_limit', 1),
        load_best_model_at_end=args.get('load_best_model_at_end', True),
        metric_for_best_model=args.get('metric_for_best_model', "ICM")
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=partial(compute_metrics_3, lencoder=labelEnc),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=args.get("early_stopping_patience", 3))]
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print("Validation Results:", eval_results)

    if testInfo is not None:
        test_dataset = SexismDatasetMulti(testInfo[1], [[0, 0, 0, 0, 0]] * len(testInfo[1]), [int(x) for x in testInfo[0]], tokenizer)
        predictions = trainer.predict(test_dataset)
        predicted_probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
        predicted_labels = (predicted_probs >= 0.5).astype(int)

        submission_df = pd.DataFrame({
            'id': testInfo[0],
            'label': labelEnc.inverse_transform(predicted_labels),
            'test_case': ['EXIST2025'] * len(predicted_labels)
        })

        submission_df.to_csv('sexism_predictions_task3_lora.csv', index=False)
        print("Prediction TASK3 with LoRA completed. Results saved to sexism_predictions_task3_lora.csv")
        return model, submission_df

    return model, eval_results


# Experimentation

## Do it in English

In [None]:
bert_en_normal_model, bert_en_normal_results = sexism_classification_pipeline_task3(EnTrainTask3, EnDevTask3, model_name="bert-base-uncased")
roberta_en_normal_model, roberta_en_normal_results = sexism_classification_pipeline_task3(EnTrainTask3, EnDevTask3, model_name="roberta-base")

bert_en_lora_model, bert_en_lora_results = sexism_classification_pipeline_task3_lora(EnTrainTask3, EnDevTask3, model_name="bert-base-uncased")
roberta_en_lora_model, roberta_en_lora_results = sexism_classification_pipeline_task3_lora(EnTrainTask3, EnDevTask3, model_name="roberta-base")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Do it in Spanish

In [None]:
bert_es_normal_model, bert_es_normal_results = sexism_classification_pipeline_task3(EnTrainTask3, EnDevTask3, model_name="dccuchile/bert-base-spanish-wwm-uncased")
roberta_es_normal_model, roberta_es_normal_results = sexism_classification_pipeline_task3(EnTrainTask3, EnDevTask3, model_name="PlanTL-GOB-ES/roberta-base-bne")

bert_es_lora_model, bert_es_normal_results = sexism_classification_pipeline_task3_lora(EnTrainTask3, EnDevTask3, model_name="dccuchile/bert-base-spanish-wwm-uncased")
roberta_es_lora_model, roberta_es_lora_results = sexism_classification_pipeline_task3_lora(EnTrainTask3, EnDevTask3, model_name="PlanTL-GOB-ES/roberta-base-bne")

# Show Results

In [None]:
# COMPLETE
# write code here

English
fine-tuning
	subtask3: ICM:0.4042  macro-f1:0.7222 (25.4 per epoch)
	subtask3: ICM:0.3341  macro-f1:0.7181 (27.18 per epoch)
LoRA
	subtask3: ICM:-0.2474  macro-f1:0.6803 (17.48 per epoch)
	subtask3: ICM:-0.3825  macro-f1:0.5285 (17.37 per epoch)
Spanish
fine-tuning
	subtask3: ICM:0.1132  macro-f1:0.7109 (27.33 per epoch)
	subtask3: ICM:0.03221  macro-f1:0.7181 (28.55 per epoch)
LoRA
	subtask3: ICM:0.2726  macro-f1:0.7175 (21.51 per epoch)
	subtask3: ICM:0.2246  macro-f1:0.712 (21.84 per epoch)
