In [1]:
from pprint import pprint
import functools

import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import AutoModelForSequenceClassification, CamembertForMaskedLM, AutoTokenizer, AutoConfig
from datasets import load_dataset
from sklearn.metrics import confusion_matrix, f1_score

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm.notebook import tqdm
import pandas as pd
import json
from copy import deepcopy

In [2]:
camembert = CamembertForMaskedLM.from_pretrained('camembert-base')
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
batch_sentences = [
    "Vous savez où est la <mask> la plus proche?",
    "La Seine est un <mask>.",
    "Je cherche urgemment un endroit où retirer de l'<mask>.",
]

In [4]:
tokenizer_output = tokenizer(
    batch_sentences,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)
pprint(tokenizer_output, width=150)

{'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]),
 'input_ids': tensor([[   5,  158, 2591,  ...,    1,    1,    1],
        [   5,   61, 4458,  ...,    1,    1,    1],
        [   5,  100, 1066,  ...,    1,    1,    1]])}


In [5]:
pd_dataset_train = pd.DataFrame()

# activite = 0
# disponibilite_chambre = 1
# ouverture_accueil = 2

a = 0
with open('test/test_data/queries_activite.json', 'r') as f:
    data = json.load(f)
for i in data:
    pd_dataset_train[a] = ["activite", data[i], 0]
    a +=1

with open('test/test_data/queries_disponibilite_chambres.json', 'r') as f:
    data = json.load(f)
for i in data:
    pd_dataset_train[a] = ["disponibilite_chambre", data[i], 1]
    a +=1

with open('test/test_data/queries_ouverture_accueil.json', 'r') as f:
    data = json.load(f)
for i in data:
    pd_dataset_train[a] = ["ouverture_accueil", data[i], 2]
    a +=1

pd_dataset_train = pd_dataset_train.T
pd_dataset_train.columns = ['label', 'query', 'label_id']

pd_dataset_train.head()

Unnamed: 0,label,query,label_id
0,activite,Quelles activités proposez-vous pendant les va...,0
1,activite,Avez-vous des cours de yoga à l'hôtel ?,0
2,activite,Est-ce que vous proposez des sorties en voilier ?,0
3,activite,Je voudrais savoir si vous avez des activités ...,0
4,activite,Y a-t-il des excursions organisées depuis l'hô...,0


In [6]:
pd_dataset_test = pd.DataFrame()

# activite = 0
# disponibilite_chambre = 1
# ouverture_accueil = 2

a = 0
with open('test/test_data/queries_activite_test.json', 'r') as f:
    data = json.load(f)
for i in data:
    pd_dataset_test[a] = ["activite", data[i], 0]
    a +=1

with open('test/test_data/queries_disponibilite_chambres_test.json', 'r') as f:
    data = json.load(f)
for i in data:
    pd_dataset_test[a] = ["disponibilite_chambre", data[i], 1]
    a +=1

with open('test/test_data/queries_ouverture_accueil_test.json', 'r') as f:
    data = json.load(f)
for i in data:
    pd_dataset_test[a] = ["ouverture_accueil", data[i], 2]
    a +=1

pd_dataset_test = pd_dataset_test.T
pd_dataset_test.columns = ['label', 'query', 'label_id']

pd_dataset_test.head()

Unnamed: 0,label,query,label_id
0,activite,Proposez-vous des sorties en bateau pour les c...,0
1,activite,Y a-t-il des cours de natation pour les enfants ?,0
2,activite,Quels sont les tarifs pour louer des jets-skis...,0
3,activite,L'hôtel propose-t-il des cours de plongée sous...,0
4,activite,Est-ce qu'il y a des sorties organisées pour f...,0


In [7]:
pd_dataset_test.index

Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59],
      dtype='int64')

In [8]:
def tokenize_batch(samples, tokenizer):
    text = [sample["query"] for sample in samples]
    labels = torch.tensor([sample["label_id"] for sample in samples])
    str_labels = [sample["label"] for sample in samples]
    # The tokenizer handles
    # - Tokenization (amazing right?)
    # - Padding (adding empty tokens so that each example has the same length)
    # - Truncation (cutting samples that are too long)
    # - Special tokens (in CamemBERT, each sentence ends with a special token </s>)
    # - Attention mask (a binary vector which tells the model which tokens to look at. For instance it will not compute anything if the token is a padding token)
    tokens = tokenizer(text, padding="longest", return_tensors="pt")

    return {"input_ids": tokens.input_ids, "attention_mask": tokens.attention_mask, "labels": labels, "str_labels": str_labels, "sentences": text}

In [9]:
test_dataloader = DataLoader(pd_dataset_test.to_dict(orient="records"), batch_size=8, shuffle=True, collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer))
val_dataloader = deepcopy(test_dataloader)
train_dataloader = DataLoader(pd_dataset_train.to_dict(orient="records"), batch_size=8, shuffle=True, collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer))
next(iter(train_dataloader))

{'input_ids': tensor([[    5,  9827,    26,   315,   143,    20,  1310,    24,   116,  6092,
             15,   350,     8,  1273,   106,     6,     1],
         [    5, 19214,    26,   315,   103,   248,    86,    39,   441,   138,
             20,   307,     8,  9475,   106,     6,     1],
         [    5,  9827,    26,   315,    20,  1310,  1339,    24,    44,  1658,
             26,   904,   106,     6,     1,     1,     1],
         [    5,   468,  1262,  1507,  3402,    17,    11,  1585,  2385,   823,
            106,     6,     1,     1,     1,     1,     1],
         [    5,   121,    11,  2642,   737,     7,    15,  1262,  1507,  2033,
             13,  2229,   106,     6,     1,     1,     1],
         [    5,  1196,    26,   291,    27,    13,  2229,    30,  1995,   641,
            133,  9571,   106,     6,     1,     1,     1],
         [    5, 19214,    26,   315,   103,   248,   257,    11,   169,  1262,
           1507,    13,  2229,    30,  1995,   106,     6],
       

In [10]:
sentences = []
labels = []
str_labels = []
all_representations = torch.Tensor()

def average_embeddings(embeddings, attention_mask):
    return (attention_mask[..., None] * embeddings).mean(1)

with torch.no_grad():
    for tokenized_batch in tqdm(val_dataloader):
        model_output = camembert(
            input_ids = tokenized_batch["input_ids"],
            attention_mask = tokenized_batch["attention_mask"],
            output_hidden_states=True
        )
        batch_representations = average_embeddings(model_output["hidden_states"][-1], tokenized_batch["attention_mask"])
        sentences.extend(tokenized_batch["sentences"])
        labels.extend(tokenized_batch["labels"])
        str_labels.extend(tokenized_batch["str_labels"])
        all_representations = torch.cat((all_representations, batch_representations), 0)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/8 [00:00<?, ?it/s]

In [11]:
from sklearn.manifold import TSNE

tsne = TSNE()
all_representations_2d = tsne.fit_transform(all_representations.cpu())
scatter_plot = px.scatter(x=all_representations_2d[:, 0], y=all_representations_2d[:, 1], color=str_labels)
scatter_plot.show(config={'staticPlot': True})

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
class LightningModel(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr, weight_decay, from_scratch=False):
        super().__init__()
        self.save_hyperparameters()
        if from_scratch:
            # Si `from_scratch` est vrai, on charge uniquement la config (nombre de couches, hidden size, etc.) et pas les poids du modèle
            config = AutoConfig.from_pretrained(
                model_name, num_labels=num_labels
            )
            self.model = AutoModelForSequenceClassification.from_config(config)
        else:
            # Cette méthode permet de télécharger le bon modèle pré-entraîné directement depuis le Hub de HuggingFace sur lequel sont stockés de nombreux modèles
            self.model = AutoModelForSequenceClassification.from_pretrained(
                model_name, num_labels=num_labels
            )
        self.lr = lr
        self.weight_decay = weight_decay
        self.num_labels = self.model.num_labels

    def forward(self, batch):
        return self.model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

    def training_step(self, batch):
        out = self.forward(batch)

        logits = out.logits
        # -------- MASKED --------
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(logits.view(-1, self.num_labels), batch["labels"].view(-1))

        # ------ END MASKED ------

        self.log("train/loss", loss)

        return loss

    def validation_step(self, batch, batch_index):
        labels = batch["labels"]
        out = self.forward(batch)

        preds = torch.max(out.logits, -1).indices
        # -------- MASKED --------
        acc = (batch["labels"] == preds).float().mean()
        # ------ END MASKED ------
        self.log("valid/acc", acc)

        f1 = f1_score(batch["labels"].cpu().tolist(), preds.cpu().tolist(), average="macro")
        self.log("valid/f1", f1)

    def predict_step(self, batch, batch_idx):
        """La fonction predict step facilite la prédiction de données. Elle est
        similaire à `validation_step`, sans le calcul des métriques.
        """
        out = self.forward(batch)

        return torch.max(out.logits, -1).indices

    def configure_optimizers(self):
        return torch.optim.AdamW(
            self.model.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

num_labels = pd_dataset_train["label"].nunique()
print(f"Nombre de labels : {num_labels}")

Nombre de labels : 3


In [13]:
lightning_model = LightningModel("camembert-base", num_labels, lr=3e-5, weight_decay=0.)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
%load_ext tensorboard
%reload_ext tensorboard
%tensorboard --logdir lightning_logs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="valid/acc", mode="max")

camembert_trainer = pl.Trainer(
    max_epochs=20,
    # gpus=1,
    callbacks=[
        pl.callbacks.EarlyStopping(monitor="valid/acc", patience=4, mode="max"),
        model_checkpoint,
    ]
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [16]:
camembert_trainer.fit(lightning_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)


  | Name  | Type                               | Params | Mode
--------------------------------------------------------------------
0 | model | CamembertForSequenceClassification | 110 M  | eval
--------------------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.497   Total estimated model params size (MB)
0         Modules in train mode
230       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]


Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.


Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 8. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.


The number of training batches (12) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.



Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 4. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.



Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [17]:
#lightning_model = LightningModel.load_from_checkpoint(checkpoint_path=model_checkpoint.best_model_path)

In [18]:
ID_TO_LABEL = pd_dataset_train["label"].unique()
ID_TO_LABEL = {i: label for i, label in enumerate(ID_TO_LABEL)}

LABEL_TO_ID = {label: i for i, label in ID_TO_LABEL.items()}

print(ID_TO_LABEL, LABEL_TO_ID)

{0: 'activite', 1: 'disponibilite_chambre', 2: 'ouverture_accueil'} {'activite': 0, 'disponibilite_chambre': 1, 'ouverture_accueil': 2}


In [19]:
def get_preds(model, tokenizer, sentence):
    tokenized_sentence = tokenizer(sentence, return_tensors="pt")
    input_ids, attention_mask = tokenized_sentence.input_ids, tokenized_sentence.attention_mask

    out = model(
        input_ids=tokenized_sentence.input_ids,
        attention_mask=tokenized_sentence.attention_mask
    )

    logits = out.logits

    probas = torch.softmax(logits, -1).squeeze()

    pred = torch.argmax(probas)

    return ID_TO_LABEL[int(pred)], probas[pred].item()

In [20]:
test_sentence = "tu veux quoi toi"

label_predicted, proba = get_preds(lightning_model.model, tokenizer, test_sentence)

print(f"Label: {label_predicted}, confidence: {proba:.2f}")

Label: activite, confidence: 0.61


In [21]:
# save the model

torch.save(lightning_model.model, 'model.pth')

# load the model

model = torch.load('model.pth')

label_predicted, proba = get_preds(model, tokenizer, test_sentence)

print(f"Label: {label_predicted}, confidence: {proba:.2f}")

Label: activite, confidence: 0.61



You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.

