# Pre-finetuning CamemBERT on a Sentence Similarity Task Using the PAWS-C French Dataset

In this notebook, we pre-finetune CamemBERT on a sentence similarity task using the PAWS-C French dataset. The goal is to pre-finetune CamemBERT on a French dataset before fine-tuning it on a French keyword extraction task. We use the PyTorch Lightning framework to train the model, and the HuggingFace Transformers library to load the model and tokenizer. This notebook is for testing purposes only; the actual pre-finetuning is run with the script `run_task.py`.

## Loading and Preprocessing

In [1]:
# pip install torch transformers lightning datasets seaborn plotly pandas

In [1]:
from pprint import pprint
import functools
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import lightning.pytorch as pl

from transformers import AutoModelForSequenceClassification, CamembertForMaskedLM, AutoTokenizer, AutoConfig
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm.notebook import tqdm
import pandas as pd
from datasets import Dataset
import numpy as np
import random

# Set the random seed for reproducibility.
# `pl.seed_everything` is Lightning's documented entry point (the same function
# that was previously reached through `pl.trainer`). It seeds Python's `random`,
# NumPy and PyTorch in one call, so separate per-library seed calls placed
# before it were redundant: it overwrote their state anyway.
pl.seed_everything(42)

import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation and numerical warnings from the libraries used below.
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 42


In [3]:
# Root folder of the PAWS-C French files; split file names are appended to it.
dataset = "../data/PAWS-C-FR/"

# Map each split name to the path of its TSV file.
data = {
    split: dataset + file_name
    for split, file_name in (
        ("train", "translated_train.tsv"),
        ("dev", "dev_2k.tsv"),
        ("test", "test_2k.tsv"),
    )
}


def load_process():
    """Read the train/dev/test TSV files and return them as cleaned dataframes.

    Paths are taken from the module-level ``data`` mapping. For every split the
    ``id`` column is dropped, rows containing any missing value are removed and
    ``label`` is cast to ``int``. Lines pandas cannot parse are skipped
    (``on_bad_lines='skip'``).

    Returns:
        tuple: ``(train, dev, test)`` dataframes, in that order.
    """

    def _read_split(split):
        # One split: parse, drop the identifier column, drop incomplete rows.
        frame = pd.read_csv(data[split], delimiter='\t', on_bad_lines='skip')
        frame = frame.drop(columns=['id']).dropna()
        # `assign` returns a new frame, so the cast never touches a view.
        return frame.assign(label=frame['label'].astype(int))

    return tuple(_read_split(split) for split in ('train', 'dev', 'test'))


train, dev, test = load_process()

# Wrap each dataframe in a Hugging Face `datasets.Dataset` (this is the class
# imported from `datasets`, not a torch Dataset). These objects are what the
# DataLoaders below iterate over.

train_dataset = Dataset.from_pandas(train)
dev_dataset = Dataset.from_pandas(dev)
test_dataset = Dataset.from_pandas(test)

In [4]:
# Number of rows kept in each split after cleaning

print(f"Total train samples : {len(train)}")
print(f"Total validation samples: {len(dev)}")
print(f"Total test samples: {len(test)}")

Total train samples : 49127
Total validation samples: 1988
Total test samples: 2000


### Dataloaders

In [5]:
# Tokenizer of the 'camembert-base' checkpoint, the same checkpoint name the
# model is built from in the training cell.
tokenizer = AutoTokenizer.from_pretrained('camembert-base')
# Number of sentence pairs per batch; shared by the train/dev/test DataLoaders.
batch_size = 16

def tokenize_batch(samples, tokenizer, max_length=128):
    """Collate a list of samples into one tokenized sentence-pair batch.

    Args:
        samples: iterable of mappings with the keys ``sentence1``,
            ``sentence2`` and ``label``.
        tokenizer: callable invoked as ``tokenizer(pairs, return_tensors="pt",
            padding='max_length', max_length=..., truncation=True)`` whose
            result exposes ``input_ids`` and ``attention_mask``.
        max_length: length every pair is padded or truncated to. Defaults to
            128, the value that used to be hard-coded.

    Returns:
        dict with
        ``input_ids`` / ``attention_mask``: taken from the tokenizer output,
        ``labels``: ``torch.tensor`` built from the sample labels,
        ``str_labels``: the same labels as a plain Python list (values are
        passed through unchanged, they are not converted to ``str``),
        ``sentences``: the ``[sentence1, sentence2]`` pairs as strings.
    """
    raw_labels = [sample["label"] for sample in samples]
    # str() guards against non-string cells (e.g. numbers) reaching the tokenizer.
    text = [[str(sample['sentence1']), str(sample['sentence2'])] for sample in samples]
    tokens = tokenizer(
        text,
        return_tensors="pt",
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )

    return {
        "input_ids": tokens.input_ids,
        "attention_mask": tokens.attention_mask,
        "labels": torch.tensor(raw_labels),
        "str_labels": raw_labels,
        "sentences": text,
    }

# Create dataloaders
# Every split uses the same collate function and loader settings; only the
# training loader shuffles its samples.

collate = functools.partial(tokenize_batch, tokenizer=tokenizer)
loader_kwargs = dict(batch_size=batch_size, pin_memory=True, collate_fn=collate)

train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(dev_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

## Fine-tuning

#### Model Customization

In [6]:
# Training hyper-parameters used by the model / trainer cells below.
epochs = 1  # passed to pl.Trainer(max_epochs=...)
lr = 3e-5  # AdamW learning rate
weight_decay = 0.  # AdamW weight decay

In [7]:
class LightningModel(pl.LightningModule):
    """Lightning wrapper around a Hugging Face sequence-classification model.

    Args:
        model_name: checkpoint name or path given to the ``Auto*`` loaders.
        num_labels: number of output classes of the classification head.
        lr: learning rate for AdamW.
        weight_decay: weight decay for AdamW.
        from_scratch: if True, only the configuration is loaded and the model
            is built from it with ``from_config`` instead of loading the
            checkpoint with ``from_pretrained``.

    Logged metrics: ``train/loss``, ``valid/acc`` and ``valid/f1``.
    ``valid/f1`` is the macro-F1 over all validation predictions of an epoch.
    """

    def __init__(self, model_name, num_labels, lr, weight_decay, from_scratch=False):
        super().__init__()
        self.save_hyperparameters()
        # The module is deliberately not moved to a device here: the Trainer
        # places it according to its `accelerator` setting, and a hard-coded
        # "cuda" fails on machines without a GPU. A config object is also not
        # a module and has no `.to()`, so calling it there raised an error.
        if from_scratch:
            config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
            self.model = AutoModelForSequenceClassification.from_config(config)
        else:
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=num_labels
            )
        self.lr = lr
        self.weight_decay = weight_decay
        self.num_labels = self.model.num_labels
        # Per-epoch buffers for validation labels / predictions (CPU tensors).
        self._val_labels = []
        self._val_preds = []

    def forward(self, batch):
        """Run the wrapped model on ``batch["input_ids"]`` / ``batch["attention_mask"]``."""
        return self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

    def training_step(self, batch):
        """Return the cross-entropy loss of the batch and log it as ``train/loss``."""
        logits = self.forward(batch).logits
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, self.num_labels), batch["labels"].view(-1)
        )
        self.log("train/loss", loss)

        return loss

    def on_validation_epoch_start(self):
        """Start every validation run with empty buffers."""
        self._val_labels.clear()
        self._val_preds.clear()

    def validation_step(self, batch, batch_index):
        """Log batch accuracy and buffer predictions for the epoch-level F1."""
        labels = batch["labels"]
        preds = self.forward(batch).logits.argmax(dim=-1)
        acc = (labels == preds).float().mean()
        # Explicit batch size so the epoch mean is weighted correctly even when
        # the last batch is smaller than the others.
        self.log("valid/acc", acc, batch_size=labels.size(0))
        self._val_labels.append(labels.detach().cpu())
        self._val_preds.append(preds.detach().cpu())

    def on_validation_epoch_end(self):
        """Compute macro-F1 once over all buffered validation predictions.

        Averaging a macro-F1 computed per batch does not give the F1 of the
        validation set (a batch may even contain a single class), so the score
        is computed once per epoch.
        """
        if not self._val_labels:
            return
        labels = torch.cat(self._val_labels).numpy()
        preds = torch.cat(self._val_preds).numpy()
        self.log("valid/f1", f1_score(labels, preds, average='macro'))
        self._val_labels.clear()
        self._val_preds.clear()

    def predict_step(self, batch, batch_idx):
        """Return the predicted class index of every sample in the batch."""
        return self.forward(batch).logits.argmax(dim=-1)

    def configure_optimizers(self):
        """AdamW over the wrapped model's parameters."""
        return torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

#### Training

In [9]:
# Two-label classifier built from the 'camembert-base' checkpoint.
lightning_model = LightningModel("camembert-base", 2, lr=lr, weight_decay=weight_decay)

# Both callbacks watch validation accuracy (higher is better).
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="valid/acc", mode="max")
early_stopping = pl.callbacks.EarlyStopping(monitor="valid/acc", patience=4, mode="max")

trainer_options = dict(
    max_epochs=epochs,
    # precision=16 is left disabled
    accelerator="gpu",
    devices="auto",
    callbacks=[early_stopping, model_checkpoint],
)
camembert_trainer = pl.Trainer(**trainer_options)
camembert_trainer.fit(
    lightning_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 

Epoch 0: 100%|██████████| 3071/3071 [33:26<00:00,  1.53it/s, v_num=0]      

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 3071/3071 [33:28<00:00,  1.53it/s, v_num=0]


FIT Profiler Report
Profile stats for: [LightningModule]LightningModel.configure_callbacks
         7 function calls in 0.000 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 contextlib.py:139(__exit__)
        1    0.000    0.000    0.000    0.000 {built-in method builtins.next}
        1    0.000    0.000    0.000    0.000 profiler.py:54(profile)
        1    0.000    0.000    0.000    0.000 advanced.py:66(stop)
        1    0.000    0.000    0.000    0.000 module.py:889(configure_callbacks)
        1    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}



Profile stats for: [LightningModule]LightningModel.prepare_data
         7 function calls in 0.000 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  

#### Evaluation

In [14]:
def eval(model,trainer,test_set):
    """Predict on a dataloader and print accuracy and macro-F1 for it.

    The reference labels are read from the dataset wrapped by ``test_set``, so
    any split can be evaluated. Nothing is returned; the metrics are printed.

    NOTE: the name shadows Python's built-in ``eval``; it is kept so existing
    calls keep working.

    Args:
        model: module handed to ``trainer.predict``.
        trainer: object whose ``predict(model, dataloaders=...)`` returns a
            list of per-batch prediction tensors. It only runs forward passes;
            the model weights are not updated.
        test_set: dataloader to evaluate. Its ``.dataset["label"]`` must give
            the labels in the order the loader yields samples — assumes the
            loader does not shuffle.

    Raises:
        ValueError: if the number of predictions and labels differ.
    """
    batch_preds = trainer.predict(model, dataloaders=test_set)
    preds = torch.cat(batch_preds, -1).detach().cpu().numpy().astype(int)
    labels = np.asarray(list(test_set.dataset["label"]), dtype=int)
    if len(preds) != len(labels):
        raise ValueError(
            f"Got {len(preds)} predictions for {len(labels)} labels"
        )

    # Backward compatibility: predictions used to be written to the global
    # `test` frame on every call. Keep doing so, but only when the evaluated
    # loader carries exactly that frame's labels, so other splits cannot
    # corrupt it or fail on a length mismatch.
    if np.array_equal(labels, test['label'].to_numpy()):
        test['preds'] = preds

    print(f"Accuracy: {(preds == labels).mean()}")
    # scikit-learn's argument order is (y_true, y_pred).
    print(f"F1 score: {f1_score(labels, preds, average='macro')}")


In [16]:
# Evaluate the model on the held-out data.
# NOTE: the dev-set evaluation call is currently commented out, so the
# "Dev set" header is printed with no metrics below it; only the test-set
# metrics are reported.

print("Dev set")
#eval(lightning_model, camembert_trainer, val_dataloader)

print("Test set")
eval(lightning_model, camembert_trainer, test_dataloader)



LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Dev set
Test set
Predicting DataLoader 0: 100%|██████████| 125/125 [00:28<00:00,  4.33it/s]


PREDICT Profiler Report
Profile stats for: [LightningModule]LightningModel.configure_callbacks
         7 function calls in 0.000 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 contextlib.py:139(__exit__)
        1    0.000    0.000    0.000    0.000 {built-in method builtins.next}
        1    0.000    0.000    0.000    0.000 profiler.py:54(profile)
        1    0.000    0.000    0.000    0.000 advanced.py:66(stop)
        1    0.000    0.000    0.000    0.000 module.py:889(configure_callbacks)
        1    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}



Profile stats for: [LightningModule]LightningModel.prepare_data
         7 function calls in 0.000 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function

Accuracy: 0.9095
F1 score: 0.9089811699138011
