# Main imports and code

In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

2025-03-01 16:10:13.049019: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740845413.071411  577787 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740845413.078050  577787 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-01 16:10:13.100681: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

Cuda available?  True


In [3]:
if cuda_available:
  import tensorflow as tf
  # Get the GPU device name.
  device_name = tf.test.gpu_device_name()
  # The device name should look like the following:
  if device_name == '/device:GPU:0':
      print('Found GPU at: {}'.format(device_name))
  else:
      raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


I0000 00:00:1740845439.701704  577787 gpu_device.cc:2022] Created device /device:GPU:0 with 7894 MB memory:  -> device: 0, name: NVIDIA A100 80GB PCIe MIG 1g.10gb, pci bus id: 0000:01:00.0, compute capability: 8.0


# Fetch Don't Patronize Me! data manager module

In [4]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [5]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

In [6]:
from dont_patronize_me import DontPatronizeMe

In [7]:
dpm = DontPatronizeMe('.', '.')

In [8]:
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


# Load paragraph IDs

In [9]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

In [10]:
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)

In [11]:
data=dpm.train_task1_df

In [12]:
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4




# Rebuild training set (Task 1)

In [13]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [14]:
import random

In [15]:
trdf1 = pd.DataFrame(rows)

In [16]:
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


# Rebuild test set (Task 1)

In [17]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })


In [18]:
len(rows)

2094

In [19]:
tedf1 = pd.DataFrame(rows)

In [20]:
tedf1

Unnamed: 0,par_id,community,text,label
0,4046,hopeless,We also know that they can benefit by receivin...,1
1,1279,refugee,Pope Francis washed and kissed the feet of Mus...,1
2,8330,refugee,Many refugees do n't want to be resettled anyw...,1
3,4063,in-need,"""Budding chefs , like """" Fred """" , """" Winston ...",1
4,4089,homeless,"""In a 90-degree view of his constituency , one...",1
...,...,...,...,...
2089,10462,homeless,"The sad spectacle , which occurred on Saturday...",0
2090,10463,refugee,""""""" The Pakistani police came to our house and...",0
2091,10464,disabled,"""When Marie O'Donoghue went looking for a spec...",0
2092,10465,women,"""Sri Lankan norms and culture inhibit women fr...",0


In [21]:
# random.shuffle(tedf1)
tedf1 = tedf1.sample(frac=1, random_state=42).reset_index(drop=True)


# RoBERTa Baseline for Task 1

In [22]:
# downsample negative instances
pcldf = trdf1[trdf1.label==1]
npos = len(pcldf)

training_set1 = pd.concat([pcldf,trdf1[trdf1.label==0][:npos*2]])

In [23]:
training_set1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
2377,1775,refugee,Last but not the least element of culpability ...,0
2378,1776,refugee,"Then , taking the art of counter-intuitive non...",0
2379,1777,refugee,Kagunga village was reported to lack necessary...,0
2380,1778,vulnerable,"""After her parents high-profile divorce after ...",0


In [None]:

task1_model_args = ClassificationArgs(num_train_epochs=1,
                                      no_save=True,
                                      no_cache=True,
                                      overwrite_output_dir=True)
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args = task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available)
# train model
task1_model.train_model(training_set1[['text', 'label']])
# run predictions
preds_task1, _ = task1_model.predict(tedf1.text.tolist())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
5it [00:00,  9.17it/s]                       
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/1. Running Loss:    0.2181: 100%|██████████| 298/298 [00:43<00:00,  6.91it/s]
Epoch 1 of 1: 100%|██████████| 1/1 [00:43<00:00, 43.11s/it]
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
  0%|          | 0/4 [00:00<?, ?it/s]Process ForkPoolWo

In [None]:
Counter(preds_task1)

NameError: name 'preds_task1' is not defined

In [None]:
labels2file([[k] for k in preds_task1], 'task1.txt')

NameError: name 'preds_task1' is not defined

In [24]:
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments

In [25]:
MODEL_NAME = "microsoft/deberta-v3-base"
custom_model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
custom_tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:

# task1_model_deberta_args = ClassificationArgs(num_train_epochs=1,
#                                       no_save=True,
#                                       no_cache=True,
#                                       overwrite_output_dir=True)

# task1_model_deberta = ClassificationModel(
#                                         model_type="deberta",
#                                         model_name=MODEL_NAME,
#                                         args=task1_model_deberta_args,
#                                         num_labels=2,
#                                         use_cuda=cuda_available,
#                                         tokenizer=custom_tokenizer,
#                                         model=custom_model
#                                     )
# # train model
# task1_model_deberta.train_model(training_set1[['text', 'label']])
# # run predictions
# preds_task1_deberta, _ = task1_model_deberta.predict(tedf1.text.tolist())

In [30]:
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd
from datasets import Dataset
import numpy as np
from sklearn.metrics import f1_score

# Load tokenizer & model (Use DeBERTaV2Tokenizer for v3 models)
MODEL_NAME = "microsoft/deberta-v3-small"
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Convert Pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(training_set1)
dev_dataset = Dataset.from_pandas(tedf1)

train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,  # More epochs for better fine-tuning
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",  # ✅ Logs training progress
    logging_steps=10,  # ✅ Prints progress every 10 steps
    report_to="none"  # ✅ Avoids unnecessary logging to WandB/Hugging Face
)

# Define evaluation metrics
def compute_metrics(pred):
    predictions = np.argmax(pred.predictions, axis=1)
    return {"f1": f1_score(pred.label_ids, predictions)}

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

# Train DeBERTa
trainer.train()

# Evaluate on dev set
results = trainer.evaluate()
print(f"✅ DeBERTa Dev F1 Score: {results['eval_f1']:.4f}")

# Get predictions
predictions = trainer.predict(dev_dataset)
y_pred = np.argmax(predictions.predictions, axis=1).tolist()

# Save predictions in a new file

with open("task1_deberta.txt", "w") as f:
    for pred in y_pred:
        f.write(str(pred) + "\n")

print("✅ DeBERTa predictions saved in task1_deberta.txt")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2382/2382 [00:01<00:00, 2230.61 examples/s]
Map: 100%|██████████| 2094/2094 [00:00<00:00, 2342.92 examples/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 376.00 MiB. GPU 0 has a total capacity of 9.50 GiB of which 94.94 MiB is free. Process 447856 has 2.69 GiB memory in use. Process 465534 has 5.26 GiB memory in use. Process 567164 has 3.97 GiB memory in use. Process 568359 has 76.00 MiB memory in use. Process 576101 has 198.00 MiB memory in use. Including non-PyTorch memory, this process has 9.19 GiB memory in use. Of the allocated memory 9.08 GiB is allocated by PyTorch, and 19.86 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Rebuild training set (Task 2)

## Prepare submission

In [29]:
!cat task1.txt | head -n 10

0
0
0
1
0
0
1
1
0
1


In [None]:
!zip submission.zip task1.txt

  adding: task1.txt (deflated 92%)
  adding: task2.txt (deflated 97%)
