In [1]:
!pip install transformers
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

^C
[31mERROR: Operation cancelled by user[0m[31m
[0mEnabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
#!pip install protobuf==3.20.*
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1
  Downloading safetensors-0.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: safetensors, accelerate
Successfully installed accelerate-0.27.2 safetensors-0.4.2
[0m

In [3]:
import torch
import transformers
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

from transformers import Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.metrics import classification_report

if not torch.cuda.is_available():
    if torch.backends.mps.is_available():
        DEVICE = 'mps'
    else:
        DEVICE = 'cpu'
else:
    DEVICE = 'cuda:0'
print("Device:", DEVICE)
device = torch.device(DEVICE)

Device: cuda:0


In [4]:
# Initialization Cell
WORKING_ENV = 'PAPERSPACE' # Can be LABS, COLAB, PAPERSPACE, SAGEMAKER
USERNAME = '' # If working on Lab Machines - Your college username
assert WORKING_ENV in ['LABS', 'COLAB', 'PAPERSPACE', 'SAGEMAKER']

if WORKING_ENV == 'PAPERSPACE': # Using Paperspace
    !pip install ipywidgets
    content_path = '/notebooks/'
    data_path = './data/'
    
else:
  raise NotImplementedError()

content_path = Path(content_path)

[0m

In [None]:
# running locally
import os
content_path = os.getcwd()
data_path = f'{content_path}/data/'
content_path = Path(content_path)

# Setting up data and utils

In [5]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

# Defining dataset class

In [6]:
from torch.utils.data import Dataset

# Define the custom dataset class
class PCLDataset(Dataset):
    def __init__(self, tokenizer, dataframe, is_multiclass=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.is_multiclass = is_multiclass

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        # Use 'orig_label' for multi-class and 'label' for binary
        if self.is_multiclass:
            dict_item = {'text': item['text'], 'label': item['label'], 'orig_label': item['orig_label']}
        else:
            dict_item = {'text': item['text'], 'label': item['label']}
        return dict_item

    def collate_fn(self, batch):
        texts = [item['text'] for item in batch]
        labels = torch.tensor([item['label'] for item in batch], dtype=torch.long)
        encodings = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
        encodings['labels'] = labels
        if self.is_multiclass:
            orig_labels = torch.tensor([item['orig_label'] for item in batch], dtype=torch.long)
            encodings['orig_labels'] = orig_labels
        return encodings


# Custom Roberta

In [7]:
from transformers import RobertaModel, RobertaPreTrainedModel

class RoBERTaForPCL(RobertaPreTrainedModel):
    def __init__(self, config, dropout_rate=0.1, num_frozen_layers=0, is_multiclass=False, extra_hidden_layer=False):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.is_multiclass = is_multiclass
        output_dim = 5 if is_multiclass else 1

        if extra_hidden_layer:
            self.classifier = torch.nn.Sequential(
                torch.nn.Linear(config.hidden_size, 256),
                torch.nn.ReLU(),
                torch.nn.Dropout(dropout_rate),
                torch.nn.Linear(256, output_dim)
            )
        else:
            self.classifier = torch.nn.Linear(config.hidden_size, output_dim)

        # Freeze specified bottom layers
        if num_frozen_layers > 0:
            # Freeze embeddings if num_frozen_layers includes them
            if num_frozen_layers >= 1:
                for param in self.roberta.embeddings.parameters():
                    param.requires_grad = False
            
            # Freeze bottom transformer layers as specified by num_frozen_layers
            for layer in self.roberta.encoder.layer[:num_frozen_layers]:
                for param in layer.parameters():
                    param.requires_grad = False

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                               position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds,
                               output_attentions=output_attentions, output_hidden_states=output_hidden_states,
                               return_dict=return_dict)
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)

        return logits


In [8]:
from transformers import AdamW
def get_groupwise_lr_decay_optimizer(model, learning_rate=1e-5, weight_decay=0.01, lr_decay=0.95, num_groups=3):

    n_layers = len(model.roberta.encoder.layer)  # Total number of layers
    layers_per_group = max(n_layers // num_groups, 1)  # Ensure at least one layer per group

    # Initialize grouped parameters list
    grouped_parameters = []

    # Embeddings parameters
    embedding_decayed_lr = learning_rate * (lr_decay ** num_groups)
    grouped_parameters.append({"params": model.roberta.embeddings.parameters(), 'lr': embedding_decayed_lr})

    # Encoder layers parameters
    for group_idx in range(num_groups):
        # Calculate decayed learning rate for this group
        decayed_lr = learning_rate * (lr_decay ** (num_groups - 1 - group_idx))
        
        # Calculate the start and end layer index for this group
        start_layer = group_idx * layers_per_group
        end_layer = (group_idx + 1) * layers_per_group if group_idx < num_groups - 1 else n_layers
        
        # Aggregate parameters from layers in this group
        group_params = []
        for layer_idx in range(start_layer, end_layer):
            group_params.extend(list(model.roberta.encoder.layer[layer_idx].parameters()))
        
        grouped_parameters.append({"params": group_params, 'lr': decayed_lr})

    # Pooler parameters
    pooled_decayed_lr = learning_rate 
    grouped_parameters.append({"params": model.roberta.pooler.parameters(), 'lr': pooled_decayed_lr})

    # Classifier parameters
    classifier_lr = learning_rate 
    grouped_parameters.append({"params": model.classifier.parameters(), 'lr': classifier_lr})
    
    optimizer = AdamW(grouped_parameters, lr=learning_rate, weight_decay=weight_decay)

    return optimizer


# Trainer and evaluation function definition

In [9]:
def evaluate(model, tokenizer, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True, max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = batch['label'].to(device)
            outputs = model(**inputs)
            
            if model.is_multiclass:
                # Convert logits to probabilities and then to multi-class predictions
                preds = outputs.argmax(dim=1)
                # Map multi-class labels to binary
                mapped_preds = preds.clone()
                mapped_preds[mapped_preds < 2] = 0  # Map 0,1 to 0
                mapped_preds[mapped_preds >= 2] = 1  # Map 2,3,4 to 1
                preds = mapped_preds

            else:
                # Convert logits to probabilities and then to binary predictions for binary classification
                probs = torch.sigmoid(outputs).squeeze()
                preds = (probs > 0.5).long()
            
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Compute classification report based on actual task type
    report = classification_report(all_labels, all_preds, target_names=["Not PCL", "PCL"], output_dict=True, zero_division=0)
    model.train()
    return report


In [10]:
import wandb

class PCLTrainer(Trainer):
    def __init__(self, *args, optimizer_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        if optimizer_function:
            self.optimizer = optimizer_function(self.model)
            
    def compute_loss(self, model, inputs, return_outputs=False):
    
        if model.is_multiclass:
            loss_labels = inputs.pop("orig_labels")
            outputs = model(**inputs)
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(outputs, loss_labels)
        else:
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(outputs.view(-1), labels.float().view(-1))
        return (loss, outputs) if return_outputs else loss
            

    def evaluate(self, ignore_keys=None):
        eval_results = evaluate(self.model, self.tokenizer, self.eval_dataset)
        f1_score = eval_results['PCL']['f1-score']
        precision = eval_results['PCL']['precision']
        recall = eval_results['PCL']['recall']
        accuracy = eval_results['accuracy']

        print(f"Accuracy: {accuracy}, F1 Score: {f1_score}, Precision: {precision}, Recall: {recall}")

        # Log the results with wandb
        wandb.log({"eval_f1": f1_score, "precision": precision, "recall": recall, "accuracy": accuracy})
        return {"eval_f1": f1_score}

## Weights and Biases Hyperparameter Search

In [11]:
from dont_patronize_me import DontPatronizeMe
from sklearn.utils import resample


def load_datasets():
    dpm = DontPatronizeMe('.', '.')
    dpm.load_task1()
    trids = pd.read_csv('internal_train_par_ids.csv')
    teids = pd.read_csv('internal_dev_par_ids.csv')

    trids.par_id = trids.par_id.astype(str)
    teids.par_id = teids.par_id.astype(str)

    data=dpm.train_task1_df

    rows = [] # will contain par_id, label and text
    for idx in range(len(trids)):
        parid = trids.par_id[idx]
        keyword = data.loc[data.par_id == parid].keyword.values[0]
        text = data.loc[data.par_id == parid].text.values[0]
        orig_label = int(data.loc[data.par_id == parid].orig_label.values[0])
        label = data.loc[data.par_id == parid].label.values[0]
        rows.append({
            'par_id':parid,
            'community':keyword,
            'text':text,
            'label':label,
            'orig_label':orig_label
        })

    trdf1 = pd.DataFrame(rows)


    traindf_majority = trdf1[trdf1['label'] == 0]
    traindf_minority = trdf1[trdf1['label'] == 1]
    traindf_minority_oversampled = resample(traindf_minority,
                                   replace=True,
                                   n_samples=len(traindf_majority),
                                   random_state=42)
    traindf_combined = pd.concat([traindf_majority, traindf_minority_oversampled])
    traindf_combined = traindf_combined.sample(frac=1, random_state=42).reset_index(drop=True)
    trdf1 = traindf_combined
    
    rows = [] # will contain par_id, label and text
    for idx in range(len(teids)):
        parid = teids.par_id[idx]
        #print(parid)
        # select row from original dataset
        keyword = data.loc[data.par_id == parid].keyword.values[0]
        text = data.loc[data.par_id == parid].text.values[0]
        orig_label = int(data.loc[data.par_id == parid].orig_label.values[0])
        label = data.loc[data.par_id == parid].label.values[0]
        rows.append({
            'par_id':parid,
            'community':keyword,
            'text':text,
            'label':label,
            'orig_label':orig_label
        })

    tedf1 = pd.DataFrame(rows)

    return trdf1, tedf1

In [61]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_df, dev_df = load_datasets()

In [13]:
import wandb
os.environ["WANDB_NOTEBOOK_NAME"] = "roberta_finetuning.ipynb"
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
sweep_config = {
    'method': 'bayes',
    'metric': {
      'name': 'f1',
      'goal': 'maximize'   
    },
    'parameters': {
        'num_train_epochs': {
            'values': [5]
        },
        'learning_rate': {
            'values': [5e-5, 1e-4]
        },
        'per_device_train_batch_size': {
            'values': [32, 64]
        },
        'frozen_layers': {
            'values': [0, 8]
        },
        'dropout_rate': {
            'values': [0, 0.1]
        },
        'weight_decay': {
            'values': [0, 0.01]
        },
        'scheduler': {
            'values': ['linear', 'cosine']
        },
        'lr_decay': {
            'values': [0.8, 0.9, 0.95]
        },
        'num_groups': {
            'values': [1, 2, 4, 12]
        },
        'is_multiclass': {
            'values': [False, True]
        },
        'extra_hidden_layer': {
            'values': [False, True]
        }
    }
}


In [None]:
def tune_hyperparameters(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config):
        config = wandb.config
        torch.manual_seed(6)

        # Load the datasets
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        train_set = PCLDataset(tokenizer, train_df, is_multiclass=config.is_multiclass)
        dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=config.is_multiclass)
        dev_set = DataLoader(dev_set_PCL, batch_size=32)

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=config.num_train_epochs,
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.per_device_train_batch_size,
            weight_decay=config.weight_decay,
            lr_scheduler_type=config.scheduler,
            overwrite_output_dir=True,
            evaluation_strategy="epoch",
            report_to="wandb",
            run_name="roberta-finetuning-test",
            remove_unused_columns=False,
            logging_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1",
            greater_is_better=True,
            save_strategy="epoch",
            save_total_limit=1
        )


        model = RoBERTaForPCL.from_pretrained('roberta-base', dropout_rate=config.dropout_rate, 
                                              num_frozen_layers=config.frozen_layers, is_multiclass=config.is_multiclass, 
                                              extra_hidden_layer=config.extra_hidden_layer).to(device)

        def optimizer_function(model):
            return get_groupwise_lr_decay_optimizer(
                model, 
                learning_rate=config.learning_rate, 
                weight_decay=config.weight_decay, 
                lr_decay=config.lr_decay,
                num_groups=config.num_groups
    )


        print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters")
        # Initialize Trainer
        trainer = PCLTrainer(
            model=model,
            args=training_args,
            train_dataset=train_set,
            eval_dataset=dev_set,
            data_collator=train_set.collate_fn,
            tokenizer=tokenizer,
            optimizer_function=optimizer_function
        )
        # Train the model
        trainer.train()
        # Evaluate the model
        results = trainer.evaluate()

        # Save the best model manually if it's better than the previous best
        if results["eval_f1"] > wandb.run.summary.get('best_f1', 0):
            wandb.run.summary['best_f1'] = results["eval_f1"]
            model_path = os.path.join('./best_model', wandb.run.name) 
            model.save_pretrained(model_path)

In [None]:
#sweep_id = wandb.sweep(sweep=sweep_config, project="NLP_CW_Final")

In [None]:
wandb.agent(sweep_id="6xm4y1gh", function=tune_hyperparameters, count=10, project="NLP_CW_Final")

# Training three best models and Ensembling

## Model 1

Final configuration 1:
- Number of training epochs: 20 (saving the best model along the way)
- Use multiclass labels: False
- Batch size: 64
- Learning rate: 1e-4
- Weight decay: 0.01
- Dropout rate in linear layers: 0.1
- LR scheduler: Cosine
- Number of frozen layers: 8
- Extra linear layer: True
- Number of layer groups: 1
- Layer-wise decay rate: 0.95

In [None]:
torch.manual_seed(6)

# Load the datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_set = PCLDataset(tokenizer, train_df, is_multiclass=False)
dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=False)
dev_set = DataLoader(dev_set_PCL, batch_size=32)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    learning_rate=1e-4,
    per_device_train_batch_size=64,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    report_to="wandb",
    run_name="roberta-finetuning-final-first",
    remove_unused_columns=False,
    logging_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    save_strategy="epoch",
    save_total_limit=1
)


model = RoBERTaForPCL.from_pretrained('roberta-base', dropout_rate=0.1, 
                                        num_frozen_layers=8, is_multiclass=False, 
                                        extra_hidden_layer=True).to(device)

def optimizer_function(model):
    return get_groupwise_lr_decay_optimizer(
        model, 
        learning_rate=1e-4, 
        weight_decay=0.01, 
        lr_decay=0.95,
        num_groups=1
)


print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters")
# Initialize Trainer
trainer = PCLTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=dev_set,
    data_collator=train_set.collate_fn,
    tokenizer=tokenizer,
    optimizer_function=optimizer_function
)
# Train the model
trainer.train()
# Evaluate the model
results = trainer.evaluate()

In [16]:
model_path = os.path.join('./best_model', 'final_model_1') 
trainer.model.save_pretrained(model_path)

Configuration saved in ./best_model/final_model_1/config.json
Model weights saved in ./best_model/final_model_1/pytorch_model.bin


## Model 2

Final configuration 2:
- Number of training epochs: 20 (saving the best model along the way)
- Use multiclass labels: False
- Batch size: 64
- Learning rate: 1e-4
- Weight decay: 0.01
- Dropout rate in linear layers: 0
- LR scheduler: Cosine
- Number of frozen layers: 0
- Extra linear layer: False
- Number of layer groups: 12
- Layer-wise decay rate: 0.8

In [62]:
torch.manual_seed(6)

# Load the datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_set = PCLDataset(tokenizer, train_df, is_multiclass=False)
dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=False)
dev_set = DataLoader(dev_set_PCL, batch_size=32)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=64,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    report_to="wandb",
    run_name="roberta-finetuning-final",
    remove_unused_columns=False,
    logging_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    save_strategy="epoch",
    save_total_limit=1
)


model = RoBERTaForPCL.from_pretrained('roberta-base', dropout_rate=0, 
                                        num_frozen_layers=0, is_multiclass=False, 
                                        extra_hidden_layer=False).to(device)

def optimizer_function(model):
    return get_groupwise_lr_decay_optimizer(
        model, 
        learning_rate=1e-4, 
        weight_decay=0.01, 
        lr_decay=0.8,
        num_groups=12
)


print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters")
# Initialize Trainer
trainer = PCLTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=dev_set,
    data_collator=train_set.collate_fn,
    tokenizer=tokenizer,
    optimizer_function=optimizer_function
)
# Train the model
trainer.train()
# Evaluate the model
results = trainer.evaluate()

Some weights of the model checkpoint at roberta-base were not used when initializing RoBERTaForPCL: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RoBERTaForPCL from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RoBERTaForPCL from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RoBERTaForPCL were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions an

The model has 124646401 trainable parameters


[34m[1mwandb[0m: Currently logged in as: [33malan-picucci[0m ([33malans-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Saving model checkpoint to ./results/checkpoint-178
Configuration saved in ./results/checkpoint-178/config.json


Accuracy: 0.8806112702960841, F1 Score: 0.5567375886524824, Precision: 0.4301369863013699, Recall: 0.7889447236180904


Model weights saved in ./results/checkpoint-178/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-178/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-178/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-1602] due to args.save_total_limit


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Saving model checkpoint to ./results/checkpoint-356
Configuration saved in ./results/checkpoint-356/config.json


Accuracy: 0.9173829990448902, F1 Score: 0.5435356200527706, Precision: 0.5722222222222222, Recall: 0.5175879396984925


Model weights saved in ./results/checkpoint-356/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-356/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-356/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-3026] due to args.save_total_limit


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Saving model checkpoint to ./results/checkpoint-534
Configuration saved in ./results/checkpoint-534/config.json


Accuracy: 0.9192932187201528, F1 Score: 0.5867970660146699, Precision: 0.5714285714285714, Recall: 0.6030150753768844


Model weights saved in ./results/checkpoint-534/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-534/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-534/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-178] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-534 (score: 0.5867970660146699).


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.9192932187201528, F1 Score: 0.5867970660146699, Precision: 0.5714285714285714, Recall: 0.6030150753768844


In [None]:
model_path = os.path.join('./best_model', 'final_model_2') 
trainer.model.save_pretrained(model_path)

## Model 3

Final configuration 3:
- Number of training epochs: 20 (saving the best model along the way)
- Use multiclass labels: True
- Batch size: 64
- Learning rate: 5e-5
- Weight decay: 0
- Dropout rate in linear layers: 0.1
- LR scheduler: Linear
- Number of frozen layers: 8
- Extra linear layer: True
- Number of layer groups: 12
- Layer-wise decay rate: 0.9

In [None]:
torch.manual_seed(6)

# Load the datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_set = PCLDataset(tokenizer, train_df, is_multiclass=True)
dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=True)
dev_set = DataLoader(dev_set_PCL, batch_size=32)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    weight_decay=0,
    lr_scheduler_type='linear',
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    report_to="wandb",
    run_name="roberta-finetuning-final-third",
    remove_unused_columns=False,
    logging_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    save_strategy="epoch",
    save_total_limit=1
)


model = RoBERTaForPCL.from_pretrained('roberta-base', dropout_rate=0.1, 
                                        num_frozen_layers=8, is_multiclass=True, 
                                        extra_hidden_layer=True).to(device)

def optimizer_function(model):
    return get_groupwise_lr_decay_optimizer(
        model, 
        learning_rate=5e-5, 
        weight_decay=0, 
        lr_decay=0.9,
        num_groups=12
)


print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters")
# Initialize Trainer
trainer = PCLTrainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=dev_set,
    data_collator=train_set.collate_fn,
    tokenizer=tokenizer,
    optimizer_function=optimizer_function
)
# Train the model
trainer.train()
# Evaluate the model
results = trainer.evaluate()

In [None]:
model_path = os.path.join('./best_model', 'final_model_3') 
trainer.save_pretrained(model_path)

# Ensembling

In [29]:
def ensemble(models, tokenizer, data_loader):
    all_preds = []
    all_labels = []
    
    for batch in tqdm(data_loader, desc="Evaluating"):
        inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = batch['label'].to(device)
        batch_probs = []  # Store probabilities for this batch

        for model in models:
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs)
                
                if model.is_multiclass:
                    # Convert logits to probabilities and then to multi-class predictions
                    probs_multiclass = torch.softmax(outputs, dim=1)
                    #Map classes [0,1] to 0 and classes [2,3,4] to 1
                    probs = probs_multiclass[:, 2] + probs_multiclass[:, 3] + probs_multiclass[:, 4]  # Probability of label 1
                    batch_probs.append(probs.cpu().numpy())  # Store binary probabilities

                else:
                    # Convert logits to probabilities and then to binary predictions for binary classification
                    probs = torch.sigmoid(outputs).squeeze()
                    batch_probs.append(probs.cpu().numpy())

        # Average the probabilities across models for this batch
        avg_probs = np.mean(batch_probs, axis=0)
        # Convert averaged probabilities to binary predictions
        aggregated_preds = (avg_probs > 0.5).astype(int)
        
        all_preds.extend(aggregated_preds)
        all_labels.extend(labels.cpu().tolist())

    # Compute classification report based on actual task type
    report = classification_report(all_labels, all_preds, target_names=["Not PCL", "PCL"], output_dict=True, zero_division=0)
    return report

In [None]:
from transformers import RobertaConfig


model_path = './best_model/final_model_1'
config = RobertaConfig.from_pretrained(model_path)
model = RoBERTaForPCL.from_pretrained(model_path, config=config, dropout_rate=0.1, num_frozen_layers=8, is_multiclass=False, extra_hidden_layer=True).to(device)

model_path2 = './best_model/final_model_2'
config2 = RobertaConfig.from_pretrained(model_path2)
model2 = RoBERTaForPCL.from_pretrained(model_path2, config=config2, dropout_rate=0, num_frozen_layers=0, is_multiclass=False, extra_hidden_layer=False).to(device)

model_path3 = './best_model/final_model_3'
config3 = RobertaConfig.from_pretrained(model_path3)
model3 = RoBERTaForPCL.from_pretrained(model_path, config=config3, dropout_rate=0.1, num_frozen_layers=8, is_multiclass=True, extra_hidden_layer=True).to(device)

models = [model, model2, model3]

In [23]:
torch.manual_seed(6)

# Load the datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_set = PCLDataset(tokenizer, train_df, is_multiclass=True)
dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=True)
dev_set = DataLoader(dev_set_PCL, batch_size=32)

report = ensemble(models, tokenizer, dev_set)

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

In [25]:
report

{'Not PCL': {'precision': 0.9564761405348715,
  'recall': 0.9625329815303431,
  'f1-score': 0.9594950026301946,
  'support': 1895},
 'PCL': {'precision': 0.6203208556149733,
  'recall': 0.5829145728643216,
  'f1-score': 0.6010362694300517,
  'support': 199},
 'accuracy': 0.9264565425023877,
 'macro avg': {'precision': 0.7883984980749223,
  'recall': 0.7727237771973323,
  'f1-score': 0.7802656360301232,
  'support': 2094},
 'weighted avg': {'precision': 0.9245301511847953,
  'recall': 0.9264565425023877,
  'f1-score': 0.9254294401149948,
  'support': 2094}}

# Predicting Dev and Test Set

In [12]:
from dont_patronize_me import DontPatronizeMe
from sklearn.utils import resample

def load_datasets_official():
    dpm = DontPatronizeMe('.', '.')
    dpm.load_task1()
    trids = pd.read_csv('train_semeval_parids-labels.csv')
    teids = pd.read_csv('dev_semeval_parids-labels.csv')

    trids.par_id = trids.par_id.astype(str)
    teids.par_id = teids.par_id.astype(str)

    data=dpm.train_task1_df

    rows = [] # will contain par_id, label and text
    for idx in range(len(trids)):
        parid = trids.par_id[idx]
        keyword = data.loc[data.par_id == parid].keyword.values[0]
        text = data.loc[data.par_id == parid].text.values[0]
        orig_label = int(data.loc[data.par_id == parid].orig_label.values[0])
        label = data.loc[data.par_id == parid].label.values[0]
        rows.append({
            'par_id':parid,
            'community':keyword,
            'text':text,
            'label':label,
            'orig_label':orig_label
        })

    trdf1 = pd.DataFrame(rows)


    traindf_majority = trdf1[trdf1['label'] == 0]
    traindf_minority = trdf1[trdf1['label'] == 1]
    traindf_minority_oversampled = resample(traindf_minority,
                                   replace=True,
                                   n_samples=len(traindf_majority),
                                   random_state=42)
    traindf_combined = pd.concat([traindf_majority, traindf_minority_oversampled])
    traindf_combined = traindf_combined.sample(frac=1, random_state=42).reset_index(drop=True)
    trdf1 = traindf_combined
    
    rows = [] # will contain par_id, label and text
    for idx in range(len(teids)):
        parid = teids.par_id[idx]
        #print(parid)
        # select row from original dataset
        keyword = data.loc[data.par_id == parid].keyword.values[0]
        text = data.loc[data.par_id == parid].text.values[0]
        orig_label = int(data.loc[data.par_id == parid].orig_label.values[0])
        label = data.loc[data.par_id == parid].label.values[0]
        rows.append({
            'par_id':parid,
            'community':keyword,
            'text':text,
            'label':label,
            'orig_label':orig_label
        })

    tedf1 = pd.DataFrame(rows)

    return trdf1, tedf1

In [13]:
train_df, dev_df = load_datasets_official()

## Individual model predictions

In [19]:
model_path1 = './best_model/final_model_1'

config1 = RobertaConfig.from_pretrained(model_pat1)
model1 = RoBERTaForPCL.from_pretrained(model_path1, config=config1, dropout_rate=0.1, num_frozen_layers=8, is_multiclass=False, extra_hidden_layer=True).to(device)


In [20]:
torch.manual_seed(6)

# Load the datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_set = PCLDataset(tokenizer, train_df, is_multiclass=False)
dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=False)
dev_set = DataLoader(dev_set_PCL, batch_size=32)

report = evaluate(model1, tokenizer, dev_set)
report

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

{'Not PCL': {'precision': 0.9486514522821576,
  'recall': 0.9651715039577836,
  'f1-score': 0.9568401778707821,
  'support': 1895},
 'PCL': {'precision': 0.6024096385542169,
  'recall': 0.5025125628140703,
  'f1-score': 0.547945205479452,
  'support': 199},
 'accuracy': 0.9212034383954155,
 'macro avg': {'precision': 0.7755305454181873,
  'recall': 0.733842033385927,
  'f1-score': 0.7523926916751171,
  'support': 2094},
 'weighted avg': {'precision': 0.915746905514316,
  'recall': 0.9212034383954155,
  'f1-score': 0.9179814866072317,
  'support': 2094}}

In [None]:
model_path2 = './best_model/final_model_2'

config2 = RobertaConfig.from_pretrained(model_path2)
model2 = RoBERTaForPCL.from_pretrained(model_path2, config=config2, dropout_rate=0, num_frozen_layers=0, is_multiclass=False, extra_hidden_layer=False).to(device)

In [22]:
torch.manual_seed(6)

# Load the datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_set = PCLDataset(tokenizer, train_df, is_multiclass=False)
dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=False)
dev_set = DataLoader(dev_set_PCL, batch_size=32)

report = evaluate(model2, tokenizer, dev_set)
report

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

{'Not PCL': {'precision': 0.9483737738771296,
  'recall': 0.9693931398416886,
  'f1-score': 0.958768267223382,
  'support': 1895},
 'PCL': {'precision': 0.6305732484076433,
  'recall': 0.49748743718592964,
  'f1-score': 0.5561797752808989,
  'support': 199},
 'accuracy': 0.9245463228271251,
 'macro avg': {'precision': 0.7894735111423865,
  'recall': 0.7334402885138092,
  'f1-score': 0.7574740212521405,
  'support': 2094},
 'weighted avg': {'precision': 0.9181721002532387,
  'recall': 0.9245463228271251,
  'f1-score': 0.9205089024208251,
  'support': 2094}}

In [None]:
model_path3 = './best_model/final_model_3'
config3 = RobertaConfig.from_pretrained(model_path3)
model3 = RoBERTaForPCL.from_pretrained(model_path3, config=config3, dropout_rate=0.1, num_frozen_layers=8, is_multiclass=True, extra_hidden_layer=True).to(device)

In [24]:
torch.manual_seed(6)

# Load the datasets
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_set = PCLDataset(tokenizer, train_df, is_multiclass=True)
dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=True)
dev_set = DataLoader(dev_set_PCL, batch_size=32)

report = evaluate(model3, tokenizer, dev_set)

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

In [25]:
report

{'Not PCL': {'precision': 0.9627192982456141,
  'recall': 0.9266490765171504,
  'f1-score': 0.9443398763108363,
  'support': 1895},
 'PCL': {'precision': 0.48518518518518516,
  'recall': 0.6582914572864321,
  'f1-score': 0.5586353944562898,
  'support': 199},
 'accuracy': 0.9011461318051576,
 'macro avg': {'precision': 0.7239522417153996,
  'recall': 0.7924702669017913,
  'f1-score': 0.7514876353835631,
  'support': 2094},
 'weighted avg': {'precision': 0.9173375940913517,
  'recall': 0.9011461318051576,
  'f1-score': 0.9076850568795781,
  'support': 2094}}

## Ensembled model predictions

In [30]:
from transformers import RobertaConfig


model_path = './best_model/final_model_1'
config = RobertaConfig.from_pretrained(model_path)
model = RoBERTaForPCL.from_pretrained(model_path, config=config, dropout_rate=0.1, num_frozen_layers=8, is_multiclass=False, extra_hidden_layer=True).to(device)

model_path2 = './best_model/final_model_2'
config2 = RobertaConfig.from_pretrained(model_path2)
model2 = RoBERTaForPCL.from_pretrained(model_path2, config=config2, dropout_rate=0, num_frozen_layers=0, is_multiclass=False, extra_hidden_layer=False).to(device)

model_path3 = './best_model/final_model_3'
config3 = RobertaConfig.from_pretrained(model_path3)
model3 = RoBERTaForPCL.from_pretrained(model_path, config=config3, dropout_rate=0.1, num_frozen_layers=8, is_multiclass=True, extra_hidden_layer=True).to(device)

models = [model, model2, model3]
results = ensemble(models, tokenizer, dev_set)
results

Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

{'Not PCL': {'precision': 0.9514613778705637,
  'recall': 0.9620052770448548,
  'f1-score': 0.9567042770926267,
  'support': 1895},
 'PCL': {'precision': 0.5955056179775281,
  'recall': 0.5326633165829145,
  'f1-score': 0.5623342175066314,
  'support': 199},
 'accuracy': 0.9212034383954155,
 'macro avg': {'precision': 0.7734834979240459,
  'recall': 0.7473342968138847,
  'f1-score': 0.759519247299629,
  'support': 2094},
 'weighted avg': {'precision': 0.9176336814910442,
  'recall': 0.9212034383954155,
  'f1-score': 0.9192259380966319,
  'support': 2094}}

In [71]:
def predict(model, tokenizer, data_loader):
    model.eval()
    all_preds = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True, max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            
            if model.is_multiclass:
                # Convert logits to probabilities and then to multi-class predictions
                preds = outputs.argmax(dim=1)
                # Map multi-class labels to binary
                mapped_preds = preds.clone()
                mapped_preds[mapped_preds < 2] = 0  # Map 0,1 to 0
                mapped_preds[mapped_preds >= 2] = 1  # Map 2,3,4 to 1
                preds = mapped_preds

            else:
                # Convert logits to probabilities and then to binary predictions for binary classification
                probs = torch.sigmoid(outputs).squeeze()
                preds = (probs > 0.5).long()
            
            all_preds.extend(preds.cpu().tolist())

    return all_preds


In [41]:
def predict_ensemble(models, tokenizer, data_loader):
    all_preds = []
    
    for batch in tqdm(data_loader, desc="Evaluating"):
        inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        batch_probs = []  # Store probabilities for this batch

        for model in models:
            model.eval()
            with torch.no_grad():
                outputs = model(**inputs)
                
                if model.is_multiclass:
                    # Convert logits to probabilities and then to multi-class predictions
                    probs_multiclass = torch.softmax(outputs, dim=1)
                    #Map classes [0,1] to 0 and classes [2,3,4] to 1
                    probs = probs_multiclass[:, 2] + probs_multiclass[:, 3] + probs_multiclass[:, 4]  # Probability of label 1
                    batch_probs.append(probs.cpu().numpy())  # Store binary probabilities

                else:
                    # Convert logits to probabilities and then to binary predictions for binary classification
                    probs = torch.sigmoid(outputs).squeeze()
                    batch_probs.append(probs.cpu().numpy())

        # Average the probabilities across models for this batch
        avg_probs = np.mean(batch_probs, axis=0)
        # Convert averaged probabilities to binary predictions
        aggregated_preds = (avg_probs > 0.5).astype(int)
        
        all_preds.extend(aggregated_preds.tolist())

    return all_preds

In [None]:
models = [model, model2, model3]
dev_predictions = predict_ensemble(models, tokenizer, dev_set)

In [46]:
labels2file([[k] for k in dev_predictions], 'dev.txt')

In [47]:
labels2file(dev_df.label.apply(lambda x:[x]).tolist(), os.path.join('ref/', 'task1.txt'))

In [48]:
!python3 evaluation.py . .

# Predicting on the test set

In [51]:
test_df = pd.read_csv('task4_test.tsv', delimiter='\t', header=None, names=['par_id', 'art_id', 'community', 'country', 'text'])

In [54]:
# add a empty label column to the test data
test_df['label'] = np.nan
test_df['orig_label'] = np.nan

In [55]:
test_set = PCLDataset(tokenizer, test_df, is_multiclass=True)
test_loader = DataLoader(test_set, batch_size=32)

test_predictions = predict_ensemble(models, tokenizer, test_loader)

Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]

In [58]:
labels2file([[k] for k in test_predictions], 'test.txt')