In [1]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m163.8/302.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [7]:
%cd /content/drive/MyDrive/University/Research/EXIST 2024

/content/drive/MyDrive/University/Research/EXIST 2024


In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import pipeline, XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
from tqdm import tqdm
import numpy as np
import random

In [4]:
GLOBAL_SEED = 255

np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)
torch.use_deterministic_algorithms(True)
%env CUBLAS_WORKSPACE_CONFIG=:4096:8

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


In [8]:
train_df = pd.read_json('./Data/EXIST 2024 Tweets Dataset/training/EXIST2024_training.json').transpose()
dev_df = pd.read_json('./Data/EXIST 2024 Tweets Dataset/dev/EXIST2024_dev.json').transpose()
test_df = pd.read_json('./Data/EXIST 2024 Tweets Dataset/test/EXIST2023_test_clean.json').transpose()

In [9]:
label2id = {'NO': 0, 'DIRECT': 1, 'REPORTED': 2, 'JUDGEMENTAL': 3}
id2label = {0: 'NO', 1: 'DIRECT', 2: 'REPORTED', 3: 'JUDGEMENTAL'}

def preprocess(df):
  if 'predicted_label' not in df.columns:
    df['predicted_label'] = df['labels_task2'].apply(lambda x: np.argmax([x.count('-'), x.count('DIRECT'), x.count('REPORTED'), x.count('JUDGEMENTAL')]))
  for column_name in ['number_annotators', 'annotators', 'gender_annotators', 'age_annotators', 'ethnicities_annotators', 'study_levels_annotators', 'countries_annotators', 'split']:
    if column_name in df.columns:
      df.drop(column_name, axis=1, inplace=True)
  return df


In [10]:
train_df = preprocess(train_df)
dev_df = preprocess(dev_df)

In [11]:
train_df

Unnamed: 0,id_EXIST,lang,tweet,labels_task1,labels_task2,labels_task3,predicted_label
100001,100001,es,"@TheChiflis Ignora al otro, es un capullo.El p...","[YES, YES, NO, YES, YES, YES]","[REPORTED, JUDGEMENTAL, -, REPORTED, JUDGEMENT...","[[OBJECTIFICATION], [OBJECTIFICATION, SEXUAL-V...",2
100002,100002,es,@ultimonomada_ Si comicsgate se parece en algo...,"[NO, NO, NO, NO, YES, NO]","[-, -, -, -, DIRECT, -]","[[-], [-], [-], [-], [OBJECTIFICATION], [-]]",0
100003,100003,es,"@Steven2897 Lee sobre Gamergate, y como eso ha...","[NO, NO, NO, NO, NO, NO]","[-, -, -, -, -, -]","[[-], [-], [-], [-], [-], [-]]",0
100004,100004,es,@Lunariita7 Un retraso social bastante lamenta...,"[NO, NO, YES, NO, YES, YES]","[-, -, DIRECT, -, REPORTED, REPORTED]","[[-], [-], [IDEOLOGICAL-INEQUALITY], [-], [IDE...",0
100005,100005,es,@novadragon21 @icep4ck @TvDannyZ Entonces como...,"[YES, NO, YES, NO, YES, YES]","[REPORTED, -, JUDGEMENTAL, -, JUDGEMENTAL, DIR...","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION], [-...",0
...,...,...,...,...,...,...,...
203256,203256,en,idk why y’all bitches think having half your a...,"[YES, YES, YES, YES, YES, YES]","[JUDGEMENTAL, DIRECT, DIRECT, DIRECT, JUDGEMEN...","[[OBJECTIFICATION], [STEREOTYPING-DOMINANCE, S...",1
203257,203257,en,This has been a part of an experiment with @Wo...,"[YES, YES, YES, YES, YES, YES]","[JUDGEMENTAL, REPORTED, JUDGEMENTAL, DIRECT, J...","[[OBJECTIFICATION], [OBJECTIFICATION], [OBJECT...",3
203258,203258,en,"""Take me already"" ""Not yet. You gotta be ready...","[NO, YES, NO, YES, YES, YES]","[-, DIRECT, -, DIRECT, DIRECT, JUDGEMENTAL]","[[-], [OBJECTIFICATION], [-], [SEXUAL-VIOLENCE...",1
203259,203259,en,@clintneedcoffee why do you look like a whore?...,"[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, JUDGEMENTAL, ...","[[OBJECTIFICATION, SEXUAL-VIOLENCE, MISOGYNY-N...",1


In [12]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [18]:
def load_model_and_tokenizer(model_name, number_of_labels):
  # Initialize tokenizer and model
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  #model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = number_of_labels, device_map = 'auto')
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = number_of_labels)

  return model, tokenizer

def train_model(model, tokenizer, train_df, val_df, id2label, batch_size = 8, number_of_epochs = 3, learning_rates = [1e-5, 1e-5, 1e-5], weight_decay = 0.01):

  #save the best model based on accuracy on dev set
  best_model = model
  best_dev_loss = np.inf

  # Move the model to GPU
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  # Create datasets and dataloaders
  train_dataset = CustomDataset(train_df['tweet'].values, train_df['predicted_label'].values, tokenizer)
  val_dataset = CustomDataset(val_df['tweet'].values, val_df['predicted_label'].values, tokenizer)

  train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  # Training loop (optional if you have labeled data)
  # Skip this part if you're doing zero-shot learning

  # Define early stopping
  best_val_loss = np.Inf
  patience = 3
  early_stopping_counter = 0

  # Train the model
  for epoch in range(number_of_epochs):
      # Define optimizer and loss function
      optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rates[epoch], weight_decay=weight_decay)
      total_steps = len(train_dataloader) * number_of_epochs
      scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
      model.train()
      total_train_loss = 0
      total_train_correct = 0
      total_train_samples = 0

      for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        preds = torch.argmax(outputs.logits, dim=1)
        total_train_correct += (preds == labels).sum().item()
        total_train_samples += labels.shape[0]

        loss.backward()
        optimizer.step()
        scheduler.step()

      train_loss = total_train_loss / len(train_dataloader)
      train_acc = total_train_correct / total_train_samples


      # Evaluation for this epoch
      model.eval()
      val_losses = []
      total_dev_loss = 0
      total_dev_correct = 0
      total_dev_samples = 0

      for batch in tqdm(val_dataloader):
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['labels'].to(device)

          with torch.no_grad():
              outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
              val_losses.append(outputs.loss.item())

          loss = outputs.loss
          total_dev_loss += loss.item()

          preds = torch.argmax(outputs.logits, dim=1)
          total_dev_correct += (preds == labels).sum().item()
          total_dev_samples += labels.shape[0]

      avg_val_loss = np.mean(val_losses)

      dev_loss = total_dev_loss / len(val_dataloader)
      dev_acc = total_dev_correct / total_dev_samples

      if dev_loss < best_dev_loss:
        best_model = model
        best_dev_loss = dev_loss

      print(f'Epoch {epoch+1}/{number_of_epochs}')
      print(f'Train loss {train_loss}, accuracy {train_acc}')
      print(f'Dev loss {dev_loss}, accuracy {dev_acc}')

  # Load the best model
  return best_model, model, tokenizer

def Evaluate(model, tokenizer, val_df, id2label, batch_size = 8):

  # Move the model to GPU
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  # Create datasets and dataloaders
  val_dataset = CustomDataset(val_df['tweet'].values, val_df['predicted_label'].values, tokenizer)

  val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  # Evaluation
  model.eval()
  all_preds = []
  all_labels = []

  for batch in val_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)

      with torch.no_grad():
          outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

      logits = outputs.logits
      preds = torch.argmax(logits, dim=1).cpu().numpy()

      all_preds.extend(preds)
      all_labels.extend(labels.cpu().numpy())

  # Convert numerical labels back to text labels
  predicted_labels = [id2label[label] for label in all_preds]
  true_labels = [id2label[label] for label in all_labels]

  # Print classification report
  print(classification_report(true_labels, predicted_labels))
  return predicted_labels


In [14]:
from google.colab import userdata

def login2HF():
  !huggingface-cli login --token '{userdata.get('HF_token')}'

In [15]:
login2HF()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [16]:
def save_and_upload(model, tokenizer, model_name2save):

  model.save_pretrained(model_name2save)
  tokenizer.save_pretrained(model_name2save)

## RoBerta

In [19]:
roberta_large_model, roberta_large_tokenizer = load_model_and_tokenizer('xlm-roberta-large', 4)
best_roberta_large_model, roberta_large_model, roberta_large_tokenizer = train_model(roberta_large_model, roberta_large_tokenizer, train_df, dev_df, id2label, batch_size=16, number_of_epochs=3, learning_rates=[2e-5, 1e-5, 1e-5], weight_decay=0.03)
predicted_labels = Evaluate(best_roberta_large_model, roberta_large_tokenizer, dev_df, id2label, batch_size = 16)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 433/433 [08:47<00:00,  1.22s/it]
100%|██████████| 65/65 [00:23<00:00,  2.72it/s]


Epoch 1/3
Train loss 0.8651107886234002, accuracy 0.6708092485549133
Dev loss 0.8275826793450576, accuracy 0.6714836223506744


100%|██████████| 433/433 [08:46<00:00,  1.22s/it]
100%|██████████| 65/65 [00:23<00:00,  2.72it/s]


Epoch 2/3
Train loss 0.686354171080347, accuracy 0.7263005780346821
Dev loss 0.7184075181300823, accuracy 0.7061657032755299


100%|██████████| 433/433 [08:46<00:00,  1.22s/it]
100%|██████████| 65/65 [00:23<00:00,  2.71it/s]


Epoch 3/3
Train loss 0.598318569723508, accuracy 0.761849710982659
Dev loss 0.747743614591085, accuracy 0.720616570327553
              precision    recall  f1-score   support

      DIRECT       0.62      0.57      0.59       230
 JUDGEMENTAL       0.62      0.13      0.21        79
          NO       0.80      0.89      0.84       646
    REPORTED       0.35      0.37      0.36        83

    accuracy                           0.72      1038
   macro avg       0.60      0.49      0.50      1038
weighted avg       0.71      0.72      0.70      1038



In [20]:
save_and_upload(roberta_large_model, roberta_large_tokenizer, 'EXIST2024_Task2_xlmRoberta_large_3_16')
!huggingface-cli upload 'EXIST2024_Task2_xlmRoberta_large_3_16'

Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
Upload 3 LFS files:   0% 0/3 [00:00<?, ?it/s]
sentencepiece.bpe.model:   0% 0.00/5.07M [00:00<?, ?B/s][A

tokenizer.json:   0% 0.00/17.1M [00:00<?, ?B/s][A[A
sentencepiece.bpe.model:   0% 16.4k/5.07M [00:00<01:12, 70.0kB/s][A

tokenizer.json:   0% 16.4k/17.1M [00:00<03:51, 73.6kB/s][A[A


model.safetensors:   0% 0.00/2.24G [00:00<?, ?B/s][A[A[A
sentencepiece.bpe.model:  29% 1.47M/5.07M [00:00<00:00, 5.28MB/s][A

tokenizer.json:   9% 1.52M/17.1M [00:00<00:02, 5.56MB/s][A[A
sentencepiece.bpe.model:  62% 3.16M/5.07M [00:00<00:00, 8.07MB/s][A

tokenizer.json:  16% 2.69M/17.1M [00:00<00:02, 6.40MB/s][A[A


model.safetensors:   0% 16.4k/2.24G [00:00<6:59:31, 89.0kB/s][A[A[A


model.safetensors:   0% 999k/2.24G [00:00<08:47, 4.25MB/s]   [A[A[A

tokenizer.json:  29% 4.95M/17.1M [00:00<00:01, 10.0MB/s][A[

In [22]:
roberta_large_model, roberta_large_tokenizer = load_model_and_tokenizer('am-azadi/EXIST2024_Task2_xlmRoberta_large_3_16', 4)
best_roberta_large_model, roberta_large_model, roberta_large_tokenizer = train_model(roberta_large_model, roberta_large_tokenizer, train_df, dev_df, id2label, batch_size=16, number_of_epochs=3, learning_rates=[5e-6, 2e-6, 1e-6], weight_decay=0.07)
predicted_labels = Evaluate(best_roberta_large_model, roberta_large_tokenizer, dev_df, id2label, batch_size = 16)

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

  0%|          | 0/433 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 17.06 MiB is free. Process 4616 has 14.73 GiB memory in use. Of the allocated memory 14.53 GiB is allocated by PyTorch, and 25.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## mbert

In [None]:
mBert_model, mBert_tokenizer = train_model('bert-base-multilingual-cased', train_df, dev_df, 2, id2label, False, 3, 16)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 433/433 [02:45<00:00,  2.62it/s]
100%|██████████| 65/65 [00:08<00:00,  8.06it/s]


Epoch 1/3
Train loss 0.5581313246392212, accuracy 0.7031791907514451
Dev loss 0.4620477463190372, accuracy 0.7832369942196532


100%|██████████| 433/433 [02:45<00:00,  2.62it/s]
100%|██████████| 65/65 [00:08<00:00,  8.05it/s]


Epoch 2/3
Train loss 0.41873579580706755, accuracy 0.8145953757225434
Dev loss 0.4259020508481906, accuracy 0.8140655105973025


100%|██████████| 433/433 [02:45<00:00,  2.61it/s]
100%|██████████| 65/65 [00:08<00:00,  8.08it/s]


Epoch 3/3
Train loss 0.3016430305243915, accuracy 0.8754335260115607
Dev loss 0.49505373457303414, accuracy 0.7967244701348748
              precision    recall  f1-score   support

          NO       0.76      0.82      0.79       479
         YES       0.83      0.78      0.80       559

    accuracy                           0.80      1038
   macro avg       0.80      0.80      0.80      1038
weighted avg       0.80      0.80      0.80      1038



# deberta model

In [None]:
deberta_v3_model, deberta_v3_tokenizer = train_model('microsoft/deberta-v3-base', train_df, dev_df, 2, id2label, False, 3, 16)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 433/433 [05:39<00:00,  1.28it/s]
100%|██████████| 65/65 [00:10<00:00,  6.40it/s]


Epoch 1/3
Train loss 0.5708562995352315, accuracy 0.7010115606936416
Dev loss 0.4414301775968992, accuracy 0.7928709055876686


100%|██████████| 433/433 [05:38<00:00,  1.28it/s]
100%|██████████| 65/65 [00:10<00:00,  6.40it/s]


Epoch 2/3
Train loss 0.41849323024857127, accuracy 0.8210982658959538
Dev loss 0.44537427213329533, accuracy 0.7996146435452793


100%|██████████| 433/433 [05:38<00:00,  1.28it/s]
100%|██████████| 65/65 [00:10<00:00,  6.39it/s]


Epoch 3/3
Train loss 0.33082303958869697, accuracy 0.8647398843930636
Dev loss 0.49292246636289816, accuracy 0.8111753371868978
              precision    recall  f1-score   support

          NO       0.78      0.82      0.80       479
         YES       0.84      0.81      0.82       559

    accuracy                           0.81      1038
   macro avg       0.81      0.81      0.81      1038
weighted avg       0.81      0.81      0.81      1038



In [None]:
save_and_upload(deberta_v3_model, deberta_v3_tokenizer, 'EXIST2024_Task1_deberta_v3_3_16')
!huggingface-cli upload 'EXIST2024_Task1_deberta_v3_3_16'

Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
Upload 2 LFS files:   0% 0/2 [00:00<?, ?it/s]
spm.model:   0% 0.00/2.46M [00:00<?, ?B/s][A
spm.model:   1% 16.4k/2.46M [00:00<00:18, 132kB/s][A

model.safetensors:   0% 0.00/738M [00:00<?, ?B/s][A[A

model.safetensors:   0% 360k/738M [00:00<03:24, 3.60MB/s][A[A

spm.model: 100% 2.46M/2.46M [00:00<00:00, 6.32MB/s]


model.safetensors:   2% 17.4M/738M [00:00<00:21, 33.1MB/s][A[A

model.safetensors:   4% 31.6M/738M [00:00<00:11, 59.5MB/s][A[A

model.safetensors:   5% 39.4M/738M [00:00<00:18, 38.2MB/s][A[A

model.safetensors:   7% 48.0M/738M [00:01<00:20, 33.9MB/s][A[A

model.safetensors:   9% 63.0M/738M [00:01<00:13, 51.7MB/s][A[A

model.safetensors:  10% 70.8M/738M [00:01<00:15, 42.6MB/s][A[A

model.safetensors:  10% 77.0M/738M [00:01<00:16, 39.0MB/s][A[A

model.safetensors:  11% 82.2M/738M [00:02<00:

In [None]:
deberta_v3_model, deberta_v3_tokenizer = train_model('am-azadi/EXIST2024_Task1_deberta_v3_3_16', train_df, dev_df, 2, id2label, False, 2, 16)

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/738M [00:00<?, ?B/s]

100%|██████████| 433/433 [05:38<00:00,  1.28it/s]
100%|██████████| 65/65 [00:10<00:00,  6.38it/s]


Epoch 1/2
Train loss 0.25462271668464287, accuracy 0.901878612716763
Dev loss 0.47036931423040534, accuracy 0.8044315992292871


100%|██████████| 433/433 [05:38<00:00,  1.28it/s]
100%|██████████| 65/65 [00:10<00:00,  6.45it/s]


Epoch 2/2
Train loss 0.1887398541197353, accuracy 0.9296242774566474
Dev loss 0.5458899609457988, accuracy 0.8140655105973025
              precision    recall  f1-score   support

          NO       0.79      0.82      0.80       479
         YES       0.84      0.81      0.82       559

    accuracy                           0.81      1038
   macro avg       0.81      0.81      0.81      1038
weighted avg       0.82      0.81      0.81      1038



# PyEvALL

In [2]:
!pip install PyEvALL

Collecting PyEvALL
  Downloading PyEvALL-0.1.63.tar.gz (35 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsbeautifier==1.14.9 (from PyEvALL)
  Downloading jsbeautifier-1.14.9.tar.gz (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jsonschema==4.21.1 (from PyEvALL)
  Downloading jsonschema-4.21.1-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.26.4 (from PyEvALL)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "./Data/evaluation/baselines/EXIST2024_dev_task1_majority_class_hard.json"
gold = "./Data/evaluation/golds/EXIST2024_dev_task1_gold_hard.json"
test = PyEvALLEvaluation()
params= dict()
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED
metrics=["ICM", "ICMNorm" ,"FMeasure"]
report= test.evaluate(predictions, gold, metrics, **params)
report.print_report()

2024-05-09 09:27:02,724 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2024-05-09 09:27:03,301 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-09 09:27:04,006 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2024-05-09 09:27:04,011 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-09 09:27:04,769 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-09 09:27:05,466 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2024",
          "average": -0.48069