In [None]:
!pip install datasets
pip install wandb



We use Weights & Biases (wandb) to track the different training runs.

In [None]:
import wandb
# Authenticate this session with Weights & Biases.
# The recorded output of this cell shows an interactive prompt asking for an API key.
wandb.login()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

In [None]:
import pandas as pd
from datasets import Dataset
import pathlib
import os


# Relative path to the data folder. This assumes the notebook is run (e.g. on
# Google Colab) from a working directory that follows the same layout as the authors'.
folder_path = "data/processedData/GPClassification"

# Name of the JSON file holding the dataset used in this notebook
file_name = "balanced_dataset-removed-2-classes-THEME-CONTEXT-INTERVENTION-5InterventionsForContext-GPmentionedInContext.json"

# Full path to the dataset file
file_path = os.path.join(folder_path, file_name)

# Load the JSON file into a pandas DataFrame
df = pd.read_json(file_path)

# Quick sanity check on the number of rows and columns
print("Size of the DataFrame:", df.shape)

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Display the first example as a visual check
dataset[0]


Size of the DataFrame: (4450, 2)


{'text': '[THEME]\nDéclaration du Gouvernement et débat\n[CONTEXT]\nintervenant LIOT :Très bien !\nintervenant LIOT :La première d’entre elles est d’engager des réformes visant à améliorer le pouvoir d’achat de nos concitoyens. Une conférence sociale nationale sur le partage de la valeur, associant les corps intermédiaires, doit être organisée. Il est indispensable de réfléchir aux pistes qui permettront aux entreprises françaises de toutes tailles de générer plus de résultats afin de permettre ce partage.\nintervenant GDR :C’est la bombe atomique ! Elle a coûté cher !\nintervenant LIOT :Les Français sont majoritairement insatisfaits. À vouloir s’occuper de tout, l’État central finit par faire tout mal. Au groupe LIOT, nous sommes convaincus que la centralisation exacerbée du pays est un facteur de tensions sociales et de dégradation démocratique qui contribue lourdement aux déficits budgétaires.\n[INTERVENTION]\nLe soir de la défaite cuisante de son parti aux élections européennes, le

This part of the code creates a simple mapping from each unique label in the dataset to a number (like "left" → 0, "right" → 1, etc.). Machine learning models usually work with numbers, not text, so we need to turn the labels into integers. First, we build the label map, then we go through the dataset and replace each label with its corresponding number. If a label isn’t in the map, we just set it to -1.

In [None]:
def generate_label_map(dataset):
    """Build a deterministic mapping from each distinct label to an integer id.

    Args:
        dataset: Any object for which ``dataset['label']`` yields an iterable
            of hashable, mutually comparable labels (e.g. strings).

    Returns:
        dict: ``{label: idx}`` where ids run from 0 to n-1 and follow the
        sorted order of the distinct labels.
    """
    # Sort before enumerating: the iteration order of a bare set of strings can
    # change from one interpreter run to the next, which would assign different
    # ids to the same label across runs and make results hard to compare.
    unique_labels = sorted(set(dataset['label']))

    # Create the mapping label -> idx
    return {label: idx for idx, label in enumerate(unique_labels)}

# Build the label -> id mapping from the loaded dataset and print it for inspection
label_map = generate_label_map(dataset)
print("label map:"+str(label_map))

# Replace the textual label of a single example with its integer id
def map_labels(example):
    """Convert ``example['label']`` to its integer id using the global ``label_map``.

    The example is modified in place and also returned. Labels that are not
    present in ``label_map`` are replaced by -1.
    """
    original_label = example['label']
    # Fall back to -1 for unknown labels
    example['label'] = label_map.get(original_label, -1)
    return example

label map:{'GDR': 0, 'EPR': 1, 'RN': 2, 'DEM': 3, 'SOC': 4, 'HOR': 5, 'LFI-NFP': 6, 'DR': 7, 'UDR': 8, 'ECOS': 9}


In this part, we’re setting up everything needed for text classification. First, we extract the labels from the dataset and figure out how many unique classes we have. Then, we load a pre-trained CamemBERT model, which is great for French text. We also load the tokenizer that matches the model. The tokenizer is responsible for turning the raw text into a format that the model can understand. Finally, we apply the tokenizer to the dataset, making sure each text is padded or truncated to the right length.

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Extract the labels by reading the 'label' column directly. This avoids
# building every full example (including its long text) just to read one field.
labels = list(dataset['label'])

# Find the number of unique classes
unique_labels = set(labels)
number_of_classes = len(unique_labels)

# Load a pre-trained CamemBERT model for sequence classification
# and specify the number of output labels (classes)
model = AutoModelForSequenceClassification.from_pretrained('camembert-base', num_labels=number_of_classes)

# Load the CamemBERT tokenizer that matches the model checkpoint
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

# Tokenization helper applied to batches of examples
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the global ``tokenizer``.

    Texts are padded with ``padding="max_length"`` and truncated with
    ``truncation=True``; the tokenizer output is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenize the whole dataset. batched=True passes batches of examples to
# tokenize_function instead of one example at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4450 [00:00<?, ? examples/s]

This section does a few things:

First, it tokenizes the dataset (like we did before, turning text into numbers the model can understand).

Then, it splits the dataset into training and evaluation sets (80% for training and 20% for testing).

After that, it prints the sizes of the training and evaluation datasets so we can see how much data we’re working with.

Finally, it applies the label mapping (the one we created earlier) to both the training and evaluation datasets, making sure the labels are correctly converted into numbers for the model.

In [None]:
# Tokenize the balanced dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the tokenized dataset into a training set and a test set (80%/20%).
# A fixed seed makes the shuffle, and therefore the split, identical on every
# run, so evaluation scores from different runs are comparable.
dataset_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

# Separate the training and test datasets
train_dataset = dataset_split['train']
eval_dataset = dataset_split['test']

# Print the size of the training and evaluation datasets to check the split
print(f"Training size: {len(train_dataset)}")
print(f"Evaluation size: {len(eval_dataset)}")

# Apply the label mapping function to both the training and evaluation datasets
# This converts labels from text to integers
train_dataset = train_dataset.map(map_labels)
eval_dataset = eval_dataset.map(map_labels)

Map:   0%|          | 0/4450 [00:00<?, ? examples/s]

Training size: 3560
Evaluation size: 890


Map:   0%|          | 0/3560 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

In this part, we're preparing to train our model using the CamemBERT pre-trained model for sequence classification. First, we make sure the labels are in the right format (integers). Then, we define the training settings, like the number of epochs, batch size, and learning rate. We also enable mixed precision training for better performance and use WandB for tracking the training progress. Finally, we set up the Trainer from the HuggingFace library, which will handle the actual training of the model. After that, we start the training process with the trainer.train() method.

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import os

# The datasets were tokenized before the split, so no tokenization happens in
# this cell and `tokenize_function` is not redefined here.

# Make sure the label column holds plain integers in both splits
train_dataset = train_dataset.map(lambda x: {'label': int(x['label'])})
eval_dataset = eval_dataset.map(lambda x: {'label': int(x['label'])})

# Inverse of label_map (id -> label name). Passing both directions to the model
# stores the label names in its config, so they are kept with the saved model.
id2label = {idx: label for label, idx in label_map.items()}

# Load a fresh pre-trained CamemBERT model for sequence classification.
# The number of output labels is derived from label_map instead of being hard-coded.
model = AutoModelForSequenceClassification.from_pretrained(
    'camembert-base',
    num_labels=len(label_map),
    id2label=id2label,
    label2id=dict(label_map),
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',              # Directory to save results
    eval_strategy="epoch",               # Evaluate the model after each epoch
    learning_rate=2e-5,                  # Set the learning rate
    per_device_train_batch_size=8,       # Batch size for training
    per_device_eval_batch_size=8,        # Batch size for evaluation
    num_train_epochs=6,                  # Number of epochs to train the model
    weight_decay=0.01,                   # Apply weight decay for regularization
    gradient_accumulation_steps=4,       # Accumulate gradients over multiple steps
    report_to="wandb",                   # Enable W&B (Weights & Biases) tracking
    fp16=True,                           # Enable mixed precision training
)

# Define the Trainer object, which will handle the training and evaluation loop
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=` in
# favor of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,                         # The model to train
    args=training_args,                  # The training arguments
    train_dataset=train_dataset,         # The training dataset
    eval_dataset=eval_dataset,           # The evaluation dataset
    tokenizer=tokenizer,                 # The tokenizer to use for processing the text
)

# Start training the model using the trainer
trainer.train()


Map:   0%|          | 0/3560 [00:00<?, ? examples/s]

Map:   0%|          | 0/890 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,2.079569
2,No log,1.870205
3,No log,1.781773
4,No log,1.727432
5,1.918700,1.679544


TrainOutput(global_step=666, training_loss=1.8520897301109704, metrics={'train_runtime': 597.1708, 'train_samples_per_second': 35.769, 'train_steps_per_second': 1.115, 'total_flos': 5576249991610368.0, 'train_loss': 1.8520897301109704, 'epoch': 5.952808988764045})

## Now it's time to look at the results!

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a batch of predictions.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over the last axis) and ``label_ids``
            (the reference class ids).

    Returns:
        dict: ``{'accuracy': ..., 'f1': ...}``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    true_ids = pred.label_ids

    # average='weighted' weights each class by its support;
    # use 'macro' instead to give every class equal weight.
    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'f1': f1_score(true_ids, predicted_ids, average='weighted'),
    }


In [None]:
# Rebuild the Trainer around the same model, arguments and datasets,
# this time with compute_metrics so that evaluation reports accuracy and F1.
trainer = Trainer(
    compute_metrics=compute_metrics,     # metrics computed at evaluation time
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


  trainer = Trainer(


In [None]:
# Model evaluation
# Once training has finished, run a quick evaluation of the model on
# the validation set.
results = trainer.evaluate()

# Display the results
print(f"Accuracy: {results['eval_accuracy']}")
print(f"F1 score: {results['eval_f1']}")  # Available because compute_metrics returns an 'f1' entry


Accuracy: 0.5191011235955056
F1 score: 0.5165710285311319


Now we save the model; we will analyse the results in more depth in a later notebook.

In [None]:
# After training finishes, explicitly save the model held by the trainer
# to the given directory.
trainer.save_model('./models/GPClassification/final_model_camembert-base')