In [None]:
# ==================== 1. IMPORTATION DES BIBLIOTHEQUES ====================
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# ==================== 2. CHARGEMENT ET EXPLORATION DES DONNEES ====================
# Load the multi-label resume dataset from disk.
data = pd.read_csv('datasets/categorization-dataset/enhanced_multilabel_resumes.csv')

# The 'Categories' column is stored as strings; parse each cell back into a
# Python list with ast.literal_eval.
data['Categories'] = data['Categories'].map(ast.literal_eval)

# Flatten the per-resume label lists to count how often each category occurs.
all_categories = []
for resume_categories in data['Categories']:
    all_categories.extend(resume_categories)
category_counts = pd.Series(all_categories).value_counts()

print("Distribution des catégories:")
print(category_counts)

# Bar chart of the label frequencies.
category_counts.plot.bar(figsize=(12, 6))
plt.title('Distribution des Catégories')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Model inputs (resume text) and their label lists, as plain Python lists.
texts = data['cleaned_resume'].tolist()
categories = data['Categories'].tolist()


In [None]:
# ==================== 3. PREPROCESSING ET PREPARATION DES DONNEES ====================
# Turn each resume's list of categories into a binary indicator row.
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(categories)

# Two-stage split: hold out 10% of the data first, then cut that hold-out in
# half, which yields train 90% / validation 5% / test 5%.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    texts, binary_labels, test_size=0.1, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts, holdout_labels, test_size=0.5, random_state=42
)

# Pretrained 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts, max_length=512):
    """Tokenize a collection of texts with the module-level ``tokenizer``.

    Args:
        texts: Iterable of raw texts. Entries that are ``None`` or a float NaN
            are replaced by an empty string; everything else goes through
            ``str()``.
        max_length: Maximum sequence length passed to the tokenizer (longer
            inputs are truncated). Defaults to 512, the value that used to be
            hard-coded.

    Returns:
        The tokenizer output, requested as PyTorch tensors
        (``return_tensors="pt"``) with padding enabled.
    """
    # str(None) / str(nan) would otherwise be tokenized as the literal words
    # "None" / "nan", so missing entries are blanked out before conversion.
    texts = [
        "" if text is None or (isinstance(text, float) and pd.isna(text)) else str(text)
        for text in texts
    ]
    return tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Replace missing resumes (None / NaN) with an empty string in every split.
def _blank_if_missing(text):
    """Return ``text`` unchanged, or "" when it is None or NaN."""
    return text if text is not None and not pd.isna(text) else ""

train_texts = list(map(_blank_if_missing, train_texts))
val_texts = list(map(_blank_if_missing, val_texts))
test_texts = list(map(_blank_if_missing, test_texts))

# Tokenize each split (train, then validation, then test).
train_encodings, val_encodings, test_encodings = (
    tokenize_texts(split_texts) for split_texts in (train_texts, val_texts, test_texts)
)


In [None]:
# ==================== 4. CREATION DES DATASETS ET DATALOADERS ====================
class CVDataset(Dataset):
    """Dataset pairing tokenizer encodings with optional per-sample labels."""

    def __init__(self, encodings, labels=None):
        """Store the encodings and, optionally, the label rows.

        Args:
            encodings: Mapping from field name to an indexable of per-sample
                values; its 'input_ids' entry is used to determine the length.
            labels: Optional indexable of per-sample label vectors.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the 'input_ids' field."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx``, plus 'labels' if set."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        if self.labels is None:
            return sample
        # Labels are emitted as float tensors.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap each split's encodings and labels in a CVDataset.
train_dataset, val_dataset, test_dataset = (
    CVDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

# Batches of 8 everywhere; only the training loader shuffles its samples.
loader_options = {'batch_size': 8}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)


In [None]:
# ==================== 5. CONFIGURATION DU MODELE ====================
# Pick the compute device first (GPU when available, otherwise CPU).
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Utilisation du device : {device}")

# Load pretrained BERT with one output per label known to the binarizer;
# problem_type selects the multi-label classification setup.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

# Move the model to the device BEFORE constructing the optimizer, as the
# torch.optim documentation recommends. Assigning the result (instead of
# leaving a bare `model.to(device)` as the last expression) also keeps the
# very long module repr out of the cell output.
model = model.to(device)

# Optimizer and loss function.
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.BCEWithLogitsLoss()


In [None]:
# ==================== 6. ENTRAINEMENT DU MODELE ====================
epochs = 10  # maximum number of passes over the training set
best_val_loss = float('inf')  # lowest validation loss seen so far
patience = 2  # non-improving epochs tolerated before training stops early
early_stopping_counter = 0  # consecutive epochs without a validation-loss improvement

def compute_metrics(preds, labels, threshold=0.5):
    """Compute micro-averaged F1, precision and recall from raw logits.

    Args:
        preds: Logits, given as an array, a tensor, or a list of per-sample
            arrays.
        labels: Ground-truth indicators laid out like ``preds``.
        threshold: Cut-off applied to the sigmoid of the logits; values strictly
            above it count as a positive prediction.

    Returns:
        Tuple ``(f1, precision, recall)``.
    """
    # Stack list inputs into one array first: calling torch.tensor() directly on
    # a list of ndarrays takes a slow path and emits a warning.
    if not torch.is_tensor(preds):
        preds = np.asarray(preds)
    if not torch.is_tensor(labels):
        labels = np.asarray(labels)

    # Convert logits to binary predictions.
    preds = (torch.sigmoid(torch.as_tensor(preds)) > threshold).int().numpy()

    # zero_division=0 yields the same 0.0 as the default setting but without
    # the warning when no positive label is predicted (e.g. in early epochs).
    f1 = f1_score(labels, preds, average='micro', zero_division=0)
    precision = precision_score(labels, preds, average='micro', zero_division=0)
    recall = recall_score(labels, preds, average='micro', zero_division=0)

    return f1, precision, recall

# Main training loop: one training pass and one validation pass per epoch,
# with early stopping driven by the validation loss.
for epoch in range(epochs):
    print(f"Époque {epoch + 1}/{epochs}")

    # Training phase
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Labels are not passed to the model; the loss is computed explicitly
        # below with loss_fn on the returned logits.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, labels)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Average the accumulated loss over the number of batches.
    train_loss /= len(train_loader)

    # Validation phase
    model.eval()
    val_loss = 0
    val_preds = []
    val_labels_list = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            val_loss += loss.item()

            # extend() adds one row per sample, so both lists end up holding
            # per-sample arrays of raw logits / labels.
            val_preds.extend(logits.cpu().numpy())
            val_labels_list.extend(labels.cpu().numpy())

    val_loss /= len(val_loader)

    # Compute and print the validation metrics
    f1, precision, recall = compute_metrics(val_preds, val_labels_list)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    print(f"F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # Early stopping: checkpoint on every improvement of the validation loss,
    # otherwise count the epoch and stop once `patience` is reached.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        # Save the best model so far
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print("Early stopping déclenché !")
            break


In [None]:
# ==================== 7. EVALUATION DU MODELE ====================
# Restore the best checkpoint. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('best_model.pt', map_location=device))

# Evaluate on the test set
model.eval()
test_preds = []
test_labels_list = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # One row per sample is appended to each list.
        test_preds.extend(logits.cpu().numpy())
        test_labels_list.extend(labels.cpu().numpy())

# Compute and print the aggregate metrics
f1, precision, recall = compute_metrics(test_preds, test_labels_list)
print("\nRésultats sur l'ensemble de test:")
print(f"F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

# Detailed per-class report. The per-sample rows are stacked into one array
# before going to torch: building a tensor straight from a list of ndarrays
# takes a slow path and emits a warning.
test_preds_binary = (torch.sigmoid(torch.as_tensor(np.asarray(test_preds))) > 0.5).int().numpy()
print("\nRapport de classification:")
print(classification_report(test_labels_list, test_preds_binary, target_names=mlb.classes_, zero_division=0))



In [None]:
# ==================== 8. SAUVEGARDE DES RESULTATS ====================
import os

import joblib

# Save the fine-tuned model and the tokenizer side by side.
# NOTE(review): this absolute path looks Kaggle-specific; make it configurable
# before running the notebook elsewhere.
model_save_path = '/kaggle/working/mon_modele_bert_multilabel'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\nModèle et tokenizer sauvegardés à : {model_save_path}")

# Persist the fitted MultiLabelBinarizer as well. joblib.dump does not create
# missing parent directories, so make sure 'models/' exists first.
binarizer_path = 'models/multilabel_binarizer.pkl'
os.makedirs(os.path.dirname(binarizer_path), exist_ok=True)
joblib.dump(mlb, binarizer_path)
print("MultiLabelBinarizer sauvegardé")

In [None]:

# ==================== 9.Évaluation sur l'ensemble de test ====================
model.eval()
logit_batches = []
label_batches = []

# Inference only: gradient tracking is switched off for the whole pass.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Keep one array per batch; they are joined after the loop.
        logit_batches.append(logits.cpu().numpy())
        label_batches.append(labels.cpu().numpy())

# Join the per-batch arrays along the sample axis.
test_preds = np.concatenate(logit_batches, axis=0)
test_labels_list = np.concatenate(label_batches, axis=0)

# Calcul des métriques sur le test
def compute_metrics_with_preds(preds, labels, threshold=0.5):
    """Compute micro-averaged metrics and also return the binary predictions.

    This is deliberately NOT named ``compute_metrics``: redefining that name
    here with a fourth return value would break every caller that unpacks
    three values once this cell has been executed.

    Args:
        preds: Logits, given as an array, a tensor, or a list of per-sample
            arrays.
        labels: Ground-truth indicators laid out like ``preds``.
        threshold: Cut-off applied to the sigmoid of the logits.

    Returns:
        Tuple ``(f1, precision, recall, binary_preds)`` where ``binary_preds``
        is the thresholded 0/1 integer array.
    """
    if not torch.is_tensor(preds):
        preds = np.asarray(preds)

    # Convert logits to binary predictions.
    binary_preds = (torch.sigmoid(torch.as_tensor(preds)) > threshold).int().numpy()

    # zero_division=0 matches the classification_report call below and avoids
    # warnings when no positive label is predicted.
    f1 = f1_score(labels, binary_preds, average='micro', zero_division=0)
    precision = precision_score(labels, binary_preds, average='micro', zero_division=0)
    recall = recall_score(labels, binary_preds, average='micro', zero_division=0)

    return f1, precision, recall, binary_preds

f1, precision, recall, test_preds_binary = compute_metrics_with_preds(test_preds, test_labels_list)

print("\nRésultats sur l'ensemble de test:")
print(f"F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

# Detailed per-class report
print("\nRapport de classification:")
print(classification_report(test_labels_list, test_preds_binary, target_names=mlb.classes_, zero_division=0))

# Print a few sample predictions next to their ground truth.
print("\nExemples de prédictions:")
# Cap the count at the size of the test split so the loop can never index
# past its end when fewer than 50 test samples exist.
num_examples = min(50, len(test_texts))
for i in range(num_examples):
    # Use a separate index name inside the comprehensions so the sample index
    # `i` is not visually shadowed.
    true_names = [mlb.classes_[j] for j, flag in enumerate(test_labels_list[i]) if flag == 1]
    predicted_names = [mlb.classes_[j] for j, flag in enumerate(test_preds_binary[i]) if flag == 1]

    print(f"\nExemple {i+1}:")
    print(f"Texte: {test_texts[i][:100]}...")  # first 100 characters only
    print(f"Labels réels: {true_names}")
    print(f"Labels prédits: {predicted_names}")