# CamemBERT model

> CamemBERT is a state-of-the-art language model for French based on the RoBERTa architecture pretrained on the French subcorpus of the newly available multilingual corpus OSCAR.
> https://camembert-model.fr

In [None]:
import functools

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import seaborn as sns
import torch  # GPU optim. + gradient opt.
from datasets import load_dataset
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from LightningModel import LightningModel

In [None]:
# The corpus is hosted on the Hugging Face Hub and fetched by its repository id.
dataset = load_dataset('Makxxx/french_CEFR')

# CamemBERT requires its own tokenization as preprocessing, so load the
# tokenizer that ships with the 'camembert-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

# Last expression of the cell: display the loaded splits.
dataset


In [None]:
# One pandas DataFrame per split, keyed by the split name.
pd_dataset = {
    name: split.to_pandas()
    for name, split in dataset.items()
}
# Preview the validation split.
pd_dataset["validation"]

### Visualize data

In [None]:
sns.set_theme()

# Number of distinct label ids in the training split; also used as the number
# of histogram bins so that each label gets its own bar.
train_labels = pd_dataset["train"]["label"]
nb_labels = train_labels.nunique(dropna=False)
print(f"Le dataset comprend {nb_labels} labels.")

# Normalised histogram of the label ids in the training split.
ax = train_labels.hist(bins=nb_labels, density=True)
ax.set(
    xlabel="Label ID",
    ylabel="Fréquence",
    title="Répartition des labels dans le dataset (train split)",
)
ax.figure.show()

In [None]:
# Add a column holding the character count of every training sentence, then
# plot its normalised distribution. `train_df` is the same object as
# pd_dataset["train"], so the new column is stored in pd_dataset as well.
train_df = pd_dataset["train"]
train_df["len_sen"] = train_df["sentence"].apply(len)

ax = train_df["len_sen"].hist(bins=50, density=True)
ax.set(
    xlabel="Longueur",
    ylabel="Fréquence",
    title="Nombre de caractères par phrase",
)
ax.figure.show()

In [None]:
# Longest training sentence, measured in characters.
pd_dataset["train"].len_sen.max()

### Defining the batch tokenization function

In [None]:
def tokenize_batch(samples, tokenizer):
    """Collate a list of dataset samples into one tokenized batch.

    Args:
        samples: Sequence of mappings; each one is read through the keys
            "sentence", "label" and "difficulty".
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., return_tensors=...)``
            whose result exposes ``input_ids`` and ``attention_mask``.

    Returns:
        A dict with the keys "input_ids", "attention_mask", "labels"
        (a tensor built from the "label" values), "str_labels" (the
        "difficulty" values) and "sentences" (the raw texts).
    """
    text = [sample["sentence"] for sample in samples]
    labels = torch.tensor([sample["label"] for sample in samples])
    str_labels = [sample["difficulty"] for sample in samples]
    # The tokenizer handles
    # - Tokenization
    # - Padding (adding empty tokens so that each example has the same length)
    # - Truncation (cutting samples that are too long)
    # - Special tokens (in CamemBERT, each sentence ends with a special token </s>)
    # - Attention mask (a binary vector which tells the model which tokens to
    #   look at, e.g. padding tokens are ignored)
    # Truncation has to be requested explicitly; without it, a sentence longer
    # than the model's maximum input length would be passed through uncut.
    tokens = tokenizer(
        text,
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )

    return {
        "input_ids": tokens.input_ids,
        "attention_mask": tokens.attention_mask,
        "labels": labels,
        "str_labels": str_labels,
        "sentences": text,
    }

### Defining the datasets

In [None]:
# Select every split by name rather than by position: unpacking
# dataset.values() depends on the order in which the splits are stored and
# can silently bind the validation split to `test_dataset` (and vice versa).
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Number of distinct label ids in the training split.
num_labels = len(pd_dataset["train"]["label"].unique())

In [None]:

# Bridge between the datasets and the model: the loaders set the batch size
# and whether samples are shuffled. Each batch is tokenized on the fly by
# tokenize_batch with the tokenizer bound in advance.
collate_batch = functools.partial(tokenize_batch, tokenizer=tokenizer)

# Training batches are shuffled.
train_dataloader = DataLoader(
    dataset["train"], batch_size=16, shuffle=True, collate_fn=collate_batch
)
# Validation batches keep the dataset order.
val_dataloader = DataLoader(
    dataset["validation"], batch_size=16, shuffle=False, collate_fn=collate_batch
)

### Defining the Lightning model instance

In [None]:
# Build the model to fine-tune.
# NOTE(review): weight_decay=2 looks unusually large for fine-tuning; confirm
# how LightningModel uses this value.
lightning_model = LightningModel("camembert-base", num_labels, lr=3e-5, weight_decay=2)

# Both callbacks watch the "valid/acc" metric (presumably logged by
# LightningModel during validation; verify) and treat higher values as better.
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="valid/acc", mode="max")
early_stopping = pl.callbacks.EarlyStopping(monitor="valid/acc", patience=4, mode="max")

camembert_trainer = pl.Trainer(
    max_epochs=25,  # upper bound on the number of passes over the training set
    gpus=1,
    callbacks=[early_stopping, model_checkpoint],
)

In [None]:
# Run training, validating against the validation loader.
camembert_trainer.fit(
    model=lightning_model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

In [None]:
# Recover the best model found during training: reload it from the path
# recorded by the checkpoint callback.
best_checkpoint_path = model_checkpoint.best_model_path
lightning_model = LightningModel.load_from_checkpoint(
    checkpoint_path=best_checkpoint_path
)

In [None]:
# Mapping from integer label id to level name, in ascending id order.
ID_TO_LABEL = {
    0: "A1",
    1: "A2",
    2: "B1",
    3: "B2",
    4: "C1",
    5: "C2",
}
# Level names ordered by label id.
label_names = [*ID_TO_LABEL.values()]

In [None]:
def plot_confusion_matrix(labels, preds, label_names):
    """Draw the confusion matrix of `preds` against `labels` as a heatmap.

    Cell colours follow the matrix normalised with ``normalize="true"``,
    while the number written in each cell is the raw count.

    Args:
        labels: True label ids.
        preds: Predicted label ids; must provide ``.tolist()``.
        label_names: Display names; ids ``0 .. len(label_names) - 1`` are the
            classes included in the matrix, and the names label both axes.
    """
    class_ids = list(range(len(label_names)))
    predicted = preds.tolist()

    counts = confusion_matrix(labels, predicted, labels=class_ids)
    normalised = confusion_matrix(
        labels, predicted, labels=class_ids, normalize="true"
    )

    plt.figure(figsize=(16, 14))
    sns.heatmap(
        normalised,
        annot=counts,
        fmt="d",
        cbar=False,
        cmap="viridis",
        xticklabels=label_names,
        yticklabels=label_names,
    )

In [None]:
# predict() yields one output per validation batch; join them along the last
# dimension into a single tensor.
# NOTE(review): presumably each batch output is a 1-D tensor of predicted
# label ids; confirm against LightningModel's predict step.
val_pred_batches = camembert_trainer.predict(lightning_model, dataloaders=val_dataloader)
camembert_preds = torch.cat(val_pred_batches, dim=-1)



In [None]:
# Confusion matrix of the validation predictions.
plot_confusion_matrix(
    labels=dataset["validation"]["label"],
    preds=camembert_preds,
    label_names=label_names,
)



In [None]:
# Per-class precision / recall / F1 on the validation split.
# `labels` is passed explicitly so that `target_names` always lines up with the
# label ids, even if one class is absent from both the targets and the
# predictions (otherwise scikit-learn raises on the length mismatch).
# Predictions are converted with .tolist(), as in plot_confusion_matrix.
print(
    classification_report(
        dataset["validation"]["label"],
        camembert_preds.tolist(),
        labels=list(range(len(label_names))),
        target_names=label_names,
    )
)


In [None]:
# Table of misclassified validation sentences with true and predicted levels.
val_df = dataset["validation"].to_pandas()
predicted_ids = camembert_preds.numpy()

# Boolean mask: True where the predicted id differs from the true label id.
wrong_preds = predicted_ids != np.array(dataset["validation"]["label"])

# Take an explicit copy of the selected rows so the column added below is
# written to an independent frame (no chained-indexing assignment on a slice).
wrong = val_df.loc[wrong_preds, ["sentence", "difficulty"]].copy()

# Predicted level names for the misclassified rows; the Series shares
# val_df's index so the values align with `wrong` on assignment.
preds = pd.Series(predicted_ids, index=val_df.index)[wrong_preds].apply(lambda x: ID_TO_LABEL[x])
wrong["preds"] = preds
wrong.columns = ["sentence", "true", "predicted"]
wrong

In [None]:
# Loader over the test split, in dataset order, tokenized like the other splits.
test_dataloader = DataLoader(
    dataset["test"],
    collate_fn=functools.partial(tokenize_batch, tokenizer=tokenizer),
    batch_size=16,
    shuffle=False,
)

# One output per batch, joined along the last dimension into a single tensor.
test_pred_batches = camembert_trainer.predict(lightning_model, dataloaders=test_dataloader)
preds = torch.cat(test_pred_batches, dim=-1)

# Build the output table: overwrite the label ids with the predictions,
# derive the level name from each id, then keep only the derived column
# under an index named "id".
test_df = dataset["test"].to_pandas()
test_df["label"] = preds.numpy()
test_df["difficulty"] = test_df["label"].apply(lambda x: label_names[x])
test_df.index.name = 'id'
test_df.drop(columns=["sentence", "label"], inplace=True)

In [None]:
# Write the test predictions to disk (the index is written as well).
test_df.to_csv(path_or_buf='preds.csv')