In [None]:
!pip install torch tensorflow transformers pandas tf-keras hf_xet

In [8]:
import pandas as pd

# Read the article corpus from the CSV file sitting next to the notebook.
# Later cells read the 'category' and 'text' columns of this frame.
articles = pd.read_csv(filepath_or_buffer='LeMonde2003_9classes(1).csv')

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles as the test split; the fixed seed keeps the
# partition identical from one run to the next.
train, test = train_test_split(
    articles,
    test_size=0.2,
    random_state=42,
)

In [12]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

import torch  # local import: only needed to probe for a CUDA device

# Number of texts handed to the pipeline per call (also forwarded as the
# pipeline's own inference batch size).
batch_size = 32

# Use the first GPU when one is visible, otherwise run on CPU (-1) so the
# notebook does not require CUDA.
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
# NOTE(review): the candidate labels below are French while the pipeline's
# default hypothesis template is English — confirm this is intended.

# Evaluation subset of the test split: rows labelled "UNE" are dropped and
# "SOC" is folded into "FRA". The index is reset to 0..n-1.
new_data = test[test['category'] != "UNE"].copy()
new_data.loc[new_data['category'] == 'SOC', 'category'] = 'FRA'
new_data = new_data.reset_index(drop=True)

# Dataset category code -> natural-language label given to the zero-shot model.
category_to_label = {
    "SPO": "sport",
    "ART": "art",
    "FRA": "actualité france",
    "INT": "international",
    "ENT": "économie"
}

# Category codes missing from the mapping become NaN in `category_explicit`.
new_data["category_explicit"] = new_data["category"].map(category_to_label)
labels = list(category_to_label.values())

# Missing texts become "" and every text is truncated to its first 300 characters.
texts = new_data["text"].fillna("").astype(str).str.slice(0, 300).tolist()

# One slot per row; a slot stays None when no prediction could be produced.
predicted_labels = [None] * len(texts)

for start in tqdm(range(0, len(texts), batch_size)):
    stop = min(start + batch_size, len(texts))
    # Empty texts are skipped explicitly: their prediction stays None so that
    # downstream code can filter them out.
    positions = [i for i in range(start, stop) if texts[i]]
    if not positions:
        continue

    try:
        # Send the whole batch in one call so the model really runs batched
        # inference instead of one forward pass per text.
        results = classifier(
            [texts[i] for i in positions],
            candidate_labels=labels,
            multi_label=False,
            batch_size=batch_size,
        )
        if isinstance(results, dict):
            # Defensive: a single-element input may come back as a bare dict.
            results = [results]
        for i, result in zip(positions, results):
            # The pipeline orders labels by decreasing score; keep the best one.
            predicted_labels[i] = result["labels"][0]
    except Exception as batch_error:
        # Best-effort fallback: retry text by text so one bad input does not
        # discard the predictions for the rest of the batch.
        print(f"Batch starting at row {start} failed ({batch_error}); retrying one text at a time.")
        for i in positions:
            try:
                result = classifier(texts[i], candidate_labels=labels, multi_label=False)
                predicted_labels[i] = result["labels"][0]
            except Exception as e:
                print(f"Error on text: {texts[i][:50]}... → {e}")
                predicted_labels[i] = None


new_data["predicted_label"] = predicted_labels


Device set to use cuda:0
100%|██████████| 177/177 [15:52<00:00,  5.38s/it]


In [13]:
from sklearn.metrics import accuracy_score, classification_report


# Score only the rows that have both a mapped gold label and a prediction.
mask = new_data[["category_explicit", "predicted_label"]].notna().all(axis=1)
y_true, y_pred = (
    new_data.loc[mask, column]
    for column in ("category_explicit", "predicted_label")
)

# Overall accuracy on the scored rows.
acc = accuracy_score(y_true, y_pred)
print(f"\n Accuracy : {acc:.2f}\n")

# Per-label precision / recall / F1, reported in the mapping's label order.
print("Classification report :")
print(
    classification_report(
        y_true,
        y_pred,
        labels=[*category_to_label.values()],
    )
)



 Accuracy : 0.48

Classification report :
                  precision    recall  f1-score   support

           sport       0.55      0.67      0.60       548
             art       0.38      0.21      0.27       919
actualité france       0.61      0.44      0.51      1478
   international       0.43      0.81      0.56      1491
        économie       0.46      0.22      0.30      1201

        accuracy                           0.48      5637
       macro avg       0.49      0.47      0.45      5637
    weighted avg       0.49      0.48      0.45      5637

