In [1]:
# Install the Hugging Face `datasets` package into the kernel's own environment.
# Pinned to the version this notebook was last executed with so that a fresh
# run resolves the same release instead of whatever is newest on PyPI.
%pip install -q datasets==3.5.0

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [12]:
import transformers
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import pandas as pd
import torch
from google.colab import drive

# Mount Google Drive; later cells read the CSV from and write artifacts to
# paths underneath this mount point.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Single seed shared by the dataset split and the model-head initialisation so
# that the evaluation set and the reported accuracy are reproducible.
SEED = 42

# 1. Load the annotated e-mails
DATA_PATH = "/content/drive/MyDrive/emails_annotated_100_clean.csv"
df = pd.read_csv(DATA_PATH)

# 2. Encode the string labels as integer class ids
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["label"])

# 3. Tokenization
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

def tokenize(example):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences longer than 256 tokens are truncated and shorter ones are padded
    to exactly 256, so every encoded example has the same length.

    Args:
        example: Mapping with a ``"text"`` entry (a list of strings here,
            because the function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the given texts.
    """
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)

# 4. Convert to a Hugging Face dataset
raw_dataset = Dataset.from_pandas(df[["text", "label_id"]])
raw_dataset = raw_dataset.map(tokenize, batched=True)
# The model's forward pass expects the target column to be called "labels".
raw_dataset = raw_dataset.rename_column("label_id", "labels")
raw_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 5. Train/eval split
# Seeded so the same rows land in the evaluation set on every run; the two
# parts are read by key instead of relying on the order of ``.values()``.
split_datasets = raw_dataset.train_test_split(test_size=0.2, seed=SEED)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

# 6. Load the model
# The classification head is newly (randomly) initialised, so seed the RNGs
# first to make that initialisation repeatable.
transformers.set_seed(SEED)
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=len(label_encoder.classes_),
)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
import os
# Switch off the Weights & Biases integration via its environment variable
# before the Trainer is created.
os.environ.update({"WANDB_DISABLED": "true"})


In [15]:
def compute_metrics(eval_pred):
    """Return the classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return {"accuracy": accuracy_score(references, predicted_ids)}

# Pick the device at runtime instead of hard-coding "cuda", which raises on a
# CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# 7. Training
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    logging_dir="./logs",
    logging_steps=10,
    # Disable external experiment trackers explicitly; the WANDB_DISABLED
    # environment variable is deprecated in favour of this argument.
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,  # adds accuracy to every evaluation
)

print(f"Training on device: {device}")

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training on device: cuda


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2556,0.987017,0.78
2,0.5779,0.636565,0.84
3,0.3373,0.628669,0.83
4,0.3564,0.652526,0.845
5,0.2758,0.744565,0.83
6,0.1673,0.792673,0.82
7,0.0469,0.777644,0.845
8,0.071,0.74378,0.855
9,0.0817,0.725231,0.855
10,0.034,0.743364,0.855


TrainOutput(global_step=1000, training_loss=0.39819901041686534, metrics={'train_runtime': 478.6619, 'train_samples_per_second': 16.713, 'train_steps_per_second': 2.089, 'total_flos': 1052510367744000.0, 'train_loss': 0.39819901041686534, 'epoch': 10.0})

In [16]:
# 8. Save the model and the tokenizer side by side in one Drive directory
MODEL_DIR = "/content/drive/MyDrive/cyia_camembert_model"
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

('/content/drive/MyDrive/cyia_camembert_model/tokenizer_config.json',
 '/content/drive/MyDrive/cyia_camembert_model/special_tokens_map.json',
 '/content/drive/MyDrive/cyia_camembert_model/sentencepiece.bpe.model',
 '/content/drive/MyDrive/cyia_camembert_model/added_tokens.json')

In [18]:
# 9. Export the fitted label encoder
import pickle

LABEL_ENCODER_PATH = "/content/drive/MyDrive/label_encoder.pkl"

# Serialise the encoder so predicted class ids can be mapped back to the
# original label strings later on.
with open(LABEL_ENCODER_PATH, "wb") as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("\n✅ Modèle entraîné et sauvegardé avec succès !")


✅ Modèle entraîné et sauvegardé avec succès !
