In [2]:
import torch
import tqdm
from tqdm import tqdm
import pandas as pd
import pickle
import numpy as np
from transformers import CamembertTokenizer, CamembertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
%run downoald_and_cleaning_data.ipynb

Downloading...
From (original): https://drive.google.com/uc?id=1UR_MC8Q5K4JSRnSUX5MQ-sIGPjLLnm9f
From (redirected): https://drive.google.com/uc?id=1UR_MC8Q5K4JSRnSUX5MQ-sIGPjLLnm9f&confirm=t&uuid=c7859b70-1121-41a3-a36d-05548203bb16
To: /home/onyxia/work/PSSD/Data/data.csv
100%|██████████| 363M/363M [00:04<00:00, 87.7MB/s] 


In [5]:
df = pd.read_csv("data_clean.csv")

In [6]:
df.dropna(subset=['texte'], inplace=True)
df['texte'] = df['texte'].str.lower()

In [7]:
import os
os.chdir('..')
df_themes = pd.read_csv('Annotations/theme.csv')

In [8]:
df_merged = df.merge(df_themes, on="identifiant", how="left", suffixes=("", "_manual"))
df_merged["texte_total"] = df_merged["titre"].fillna("") + " " + df_merged["texte"].fillna("")
df_merged = df_merged[~df_merged["theme"].isna()]
df_merged["theme"] = df_merged["theme"].replace({
    "tribune": "analyse",
    "société": "politique"
})

In [9]:
df_merged

Unnamed: 0.1,Unnamed: 0,identifiant,journal_clean,titre,annee,mois,jour,texte,keywords,theme,texte_total
94,432,efee09cde26fba5704002a3180ab0bf7f2f711dbff957d...,Le Figaro,Rennes : nouveau meurtre,1998,1,19,"« ici, samedi 6 heures, une femme de 38 ans as...","violence, meurtre, rennes, femme, samedi",actualité,"Rennes : nouveau meurtre « ici, samedi 6 heure..."
96,433,d183794139b099c8c366eb2482b740f413f22d62bb7d6d...,Le Figaro,Une femme médecin assassinée,1998,1,21,"- une femme de 60 ans, médecin allergologue, a...",,actualité,Une femme médecin assassinée - une femme de 60...
98,434,00627f5991ec8312f034b90a05650e755e28a6a8109170...,Le Figaro,Le fils du médecin interpellé,1998,1,22,"- le fils d'une femme médecin, assassinée chez...",fils,actualité,Le fils du médecin interpellé - le fils d'une ...
100,435,965556384a9f3ab74807d96fda6d5802b3f4b9adb93e31...,Libération,Les chantiers de la justice (4): le juge uniqu...,1998,1,23,"le 15 janvier, la garde des sceaux présentait ...","réforme, chantiers, guigou, justice, juge, eli...",politique,Les chantiers de la justice (4): le juge uniqu...
102,441,f958023203a90c46a0d24df94ad3ac3bf4490491e3d612...,Le Figaro,Caucase : l'« industrie » florissante du rapt,1998,1,31,"l'enlèvement de vincent cochetel, trente-sept ...","industrie, trente-sept, florissante, caucase, ...",politique,Caucase : l'« industrie » florissante du rapt ...
...,...,...,...,...,...,...,...,...,...,...,...
11969,80574,b2a4e0f965f89ec54af1a183a71581de3913f47faf9299...,Le Nouvel Obs,Depardieu accusé d'agressions sexuelles : son ...,2024,10,28,"l'acteur gérard depardieu , visé par de nombre...","accusé, agressions, procès, renvoi, acteur, se...",actualité,Depardieu accusé d'agressions sexuelles : son ...
11981,80659,d3e66a6526618cbf5f22945428a74f0130b19bfde3cf0f...,Libération,Affaire abbé Pierre : «Il est nécessaire que l...,2024,10,31,sociologue à l’ecole des hautes études en scie...,"pierre, mouvement, nécessaire, fasse, emmaüs",analyse,Affaire abbé Pierre : «Il est nécessaire que l...
12027,81039,9a5e3997aa8d05bb922b99a51cba1f3f5316f8b403c118...,Le Point,« Monstres » sur Netflix : va-t-on vers une ov...,2024,11,11,"qu'ont en commun une religieuse frustrée, un f...","série, ryan, netflix, figures, monstres, murphy",culture,« Monstres » sur Netflix : va-t-on vers une ov...
12111,81761,01d9b67bc79701339320e4ea9d7ab8b8242d35cff49138...,Le Figaro,Nantes : frappée par son conjoint devant leur ...,2024,11,25,un homme de 30 ans a été placé en garde à vue ...,"percé, tympan, tympan_percé, nantes, femme",actualité,Nantes : frappée par son conjoint devant leur ...


In [10]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base").to(device)
model.eval()  

embeddings_dict = {}


for _, row in tqdm(df_merged.iterrows()):
    sentence = row['texte_total']
    theme = row['theme']
    

    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=False)
    inputs = {key: val.to(device) for key, val in inputs.items()}  
    
    with torch.no_grad():
        outputs = model(**inputs)
        
        token_embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()
    
    embeddings_dict[sentence] = {
        "embeddings": token_embeddings, 
        "theme": theme
    }


with open("themeed_token_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings_dict, f)

433it [00:13, 31.16it/s]


In [11]:
df_merged.drop('Unnamed: 0', axis=1, inplace=True)

In [12]:
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base").to(device)

In [13]:
# 1. Supprimer les classes rares (moins de 2 exemples)
counts = df_merged["theme"].value_counts()
valid_themes = counts[counts >= 2].index
df_filtered = df_merged[df_merged["theme"].isin(valid_themes)].copy()

# 2. Refaire le LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_filtered["theme_encoded"] = label_encoder.fit_transform(df_filtered["theme"])

# 3. Vérification
print("Classes conservées :", list(label_encoder.classes_))

# 4. Train/test split avec stratification
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df_filtered[["texte_total", "theme_encoded"]],
    test_size=0.2,
    stratify=df_filtered["theme_encoded"],
    random_state=42
)

# 5. Redéfinir le nombre de classes pour le modèle
n_classes = len(label_encoder.classes_)


Classes conservées : ['actualité', 'analyse', 'culture', 'politique']


In [14]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, "texte_total"]
        label = self.df.loc[idx, "theme_encoded"]

        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)  # correction ici
        }

In [15]:
from torch.utils.data import DataLoader

batch_size = 64

train_dataset = TextDataset(train_df, tokenizer)
test_dataset = TextDataset(test_df, tokenizer)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,       
    pin_memory=True      
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)


In [16]:
import torch.nn as nn
from transformers import CamembertModel

class CamembertCNNLSTMClassifier(nn.Module):
    def __init__(self, conv_out_dim=256, hidden_dim=128, num_classes=4):
        super().__init__()
        self.backbone = CamembertModel.from_pretrained("camembert-base")
        
        # On freeze CamemBERT
        for param in self.backbone.parameters():
            param.requires_grad = False

        self.conv1d = nn.Conv1d(in_channels=768, out_channels=conv_out_dim, kernel_size=3, padding=1)
        self.relu_conv = nn.ReLU()

        self.lstm1 = nn.LSTM(input_size=conv_out_dim, hidden_size=hidden_dim,
                             batch_first=True, bidirectional=True)
        self.relu = nn.ReLU()
        self.lstm2 = nn.LSTM(input_size=hidden_dim * 2, hidden_size=hidden_dim,
                             batch_first=True, bidirectional=True)

        # Classification multi-classe (logits de taille [batch, num_classes])
        self.classifier = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        
        sequence_output = outputs.last_hidden_state              # [batch, seq_len, 768]
        x = sequence_output.permute(0, 2, 1)                     # [batch, 768, seq_len]
        x = self.conv1d(x)                                       # [batch, conv_out_dim, seq_len]
        x = self.relu_conv(x)
        x = x.permute(0, 2, 1)                                   # [batch, seq_len, conv_out_dim]
        
        lstm_out1, _ = self.lstm1(x)
        relu_out = self.relu(lstm_out1)
        lstm_out2, _ = self.lstm2(relu_out)

        cls_token_out = lstm_out2[:, 0, :]                       # On prend le 1er token
        logits = self.classifier(cls_token_out)                 # [batch, num_classes]
        
        return logits


In [17]:

# On compte le nombre d'occurrences de chaque classe
class_counts = train_df['theme_encoded'].value_counts().sort_index().values
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float)
class_weights = class_weights / class_weights.sum() * len(class_counts)  # normalisation

class_weights = class_weights.to(device)

# Fonction de perte avec pondération par classe
criterion = nn.CrossEntropyLoss(weight=class_weights)


In [18]:
n_classes = len(label_encoder.classes_)
print(f"Nombre de classes : {n_classes}")
print("Valeurs des labels dans le jeu de test :", set(test_df["theme_encoded"]))


Nombre de classes : 4
Valeurs des labels dans le jeu de test : {0, 1, 2, 3}


In [19]:
model = CamembertCNNLSTMClassifier().to(device)
for param in model.backbone.parameters():
    param.requires_grad = False

# we only unfreeze the last layer of camembert
for param in model.backbone.encoder.layer[-1].parameters():
    param.requires_grad = True
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-4
)

In [20]:
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F

num_epochs = 25

for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)  # ← attention, le champ correct ici

        logits = model(input_ids, attention_mask)  # [batch, 6]
        loss = criterion(logits, labels)           # labels : [batch] (entiers 0–5)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Évaluation
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch+1}/{num_epochs} — Loss: {total_loss/len(train_loader):.4f} | "
          f"Accuracy: {accuracy:.4f} | F1-macro: {f1:.4f}")


  4%|▍         | 1/25 [00:07<02:54,  7.27s/it]

Epoch 1/25 — Loss: 1.3868 | Accuracy: 0.1264 | F1-macro: 0.0561


  8%|▊         | 2/25 [00:14<02:44,  7.17s/it]

Epoch 2/25 — Loss: 1.3853 | Accuracy: 0.1264 | F1-macro: 0.0561


 12%|█▏        | 3/25 [00:21<02:37,  7.16s/it]

Epoch 3/25 — Loss: 1.3825 | Accuracy: 0.1264 | F1-macro: 0.0561


 16%|█▌        | 4/25 [00:28<02:31,  7.23s/it]

Epoch 4/25 — Loss: 1.3790 | Accuracy: 0.3908 | F1-macro: 0.3103


 20%|██        | 5/25 [00:35<02:23,  7.19s/it]

Epoch 5/25 — Loss: 1.3743 | Accuracy: 0.5747 | F1-macro: 0.5216


 24%|██▍       | 6/25 [00:43<02:17,  7.26s/it]

Epoch 6/25 — Loss: 1.3660 | Accuracy: 0.5862 | F1-macro: 0.5221


 28%|██▊       | 7/25 [00:50<02:09,  7.21s/it]

Epoch 7/25 — Loss: 1.3537 | Accuracy: 0.5977 | F1-macro: 0.5128


 32%|███▏      | 8/25 [00:57<02:02,  7.21s/it]

Epoch 8/25 — Loss: 1.3338 | Accuracy: 0.5747 | F1-macro: 0.4898


 36%|███▌      | 9/25 [01:04<01:54,  7.15s/it]

Epoch 9/25 — Loss: 1.2913 | Accuracy: 0.4713 | F1-macro: 0.3024


 40%|████      | 10/25 [01:11<01:47,  7.17s/it]

Epoch 10/25 — Loss: 1.2336 | Accuracy: 0.4828 | F1-macro: 0.3264


 44%|████▍     | 11/25 [01:19<01:40,  7.16s/it]

Epoch 11/25 — Loss: 1.1525 | Accuracy: 0.5172 | F1-macro: 0.3719


 48%|████▊     | 12/25 [01:26<01:33,  7.16s/it]

Epoch 12/25 — Loss: 1.0662 | Accuracy: 0.5977 | F1-macro: 0.5386


 52%|█████▏    | 13/25 [01:33<01:26,  7.20s/it]

Epoch 13/25 — Loss: 1.0061 | Accuracy: 0.5977 | F1-macro: 0.5422


 56%|█████▌    | 14/25 [01:40<01:19,  7.23s/it]

Epoch 14/25 — Loss: 0.8852 | Accuracy: 0.6092 | F1-macro: 0.5735


 60%|██████    | 15/25 [01:48<01:12,  7.24s/it]

Epoch 15/25 — Loss: 0.8447 | Accuracy: 0.6552 | F1-macro: 0.6231


 64%|██████▍   | 16/25 [01:55<01:05,  7.24s/it]

Epoch 16/25 — Loss: 0.7813 | Accuracy: 0.6437 | F1-macro: 0.6193


 68%|██████▊   | 17/25 [02:02<00:57,  7.22s/it]

Epoch 17/25 — Loss: 0.7528 | Accuracy: 0.6322 | F1-macro: 0.5821


 72%|███████▏  | 18/25 [02:09<00:50,  7.24s/it]

Epoch 18/25 — Loss: 0.7389 | Accuracy: 0.6782 | F1-macro: 0.6694


 76%|███████▌  | 19/25 [02:16<00:43,  7.24s/it]

Epoch 19/25 — Loss: 0.6976 | Accuracy: 0.5977 | F1-macro: 0.5587


 80%|████████  | 20/25 [02:24<00:36,  7.25s/it]

Epoch 20/25 — Loss: 0.6045 | Accuracy: 0.7011 | F1-macro: 0.6932


 84%|████████▍ | 21/25 [02:31<00:28,  7.24s/it]

Epoch 21/25 — Loss: 0.5773 | Accuracy: 0.6552 | F1-macro: 0.6054


 88%|████████▊ | 22/25 [02:38<00:21,  7.29s/it]

Epoch 22/25 — Loss: 0.5869 | Accuracy: 0.6782 | F1-macro: 0.6597


 92%|█████████▏| 23/25 [02:46<00:14,  7.36s/it]

Epoch 23/25 — Loss: 0.5306 | Accuracy: 0.6897 | F1-macro: 0.6389


 96%|█████████▌| 24/25 [02:53<00:07,  7.43s/it]

Epoch 24/25 — Loss: 0.4961 | Accuracy: 0.7126 | F1-macro: 0.6936


100%|██████████| 25/25 [03:01<00:00,  7.26s/it]

Epoch 25/25 — Loss: 0.5448 | Accuracy: 0.7241 | F1-macro: 0.6657





In [21]:
torch.save(model.state_dict(), "camembert_cnn_lstm_weights.pth")

In [22]:
def predict_theme(text, model, tokenizer, device, label_encoder=None):
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=256).to(device)

    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])  # shape: [1, 6]

    prediction_idx = torch.argmax(logits, dim=1).item()

    if label_encoder is not None:
        prediction_label = label_encoder.inverse_transform([prediction_idx])[0]
        return prediction_label, prediction_idx
    else:
        return prediction_idx


In [23]:
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

   actualité       0.71      0.87      0.78        23
     analyse       0.60      0.27      0.38        11
     culture       0.86      0.73      0.79        26
   politique       0.66      0.78      0.71        27

    accuracy                           0.72        87
   macro avg       0.71      0.66      0.67        87
weighted avg       0.73      0.72      0.71        87



In [24]:
class InferenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Forcer la conversion en string et remplacer les NaN
        text = str(self.df.loc[idx, "texte"]) if pd.notna(self.df.loc[idx, "texte"]) else ""
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0)
        }


In [37]:
df = pd.read_csv('Data/data_clean.csv')
df_all = df.merge(df_themes, on = 'identifiant', how = 'left')
df_to_predict = df_all[df_all['theme'].isna() & df_all['texte'].notna()].copy()
df_to_predict = df_to_predict.reset_index()  # garde l'index d'origine dans df_to_predict["index"]


In [39]:
inference_dataset = InferenceDataset(df_to_predict, tokenizer)
inference_loader = DataLoader(inference_dataset, batch_size=32, shuffle=False)

In [42]:
df_all.loc[df_to_predict["index"], "theme_pred_encoded"] = all_preds
df_all.loc[df_to_predict["index"], "theme_pred"] = label_encoder.inverse_transform(all_preds)
df_all.loc[df_to_predict["index"], "confidence"] = all_probs

df_all["theme_final"] = df_all["theme"].fillna(df_all["theme_pred"])

In [40]:
from torch.nn.functional import softmax

model.eval()
all_preds = []
all_probs = []

with torch.no_grad():
    for batch in tqdm(inference_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids, attention_mask)            # [batch, num_classes]
        probs = softmax(logits, dim=1)                       # [batch, num_classes]

        pred_classes = torch.argmax(probs, dim=1)            # [batch]
        max_probs = torch.max(probs, dim=1).values           # [batch]

        all_preds.extend(pred_classes.cpu().numpy())
        all_probs.extend(max_probs.cpu().numpy())



100%|██████████| 367/367 [07:46<00:00,  1.27s/it]


In [47]:
df_all

Unnamed: 0.1,Unnamed: 0,identifiant,journal_clean,titre,annee,mois,jour,texte,keywords,theme,theme_pred_encoded,theme_pred,confidence,theme_final
0,162,3a4723d9d754ca30c68ba8e420cd6683548f2af7ff7af3...,Libération,Viol collectif dans la nuit de la Saint-Sylves...,1995,1,6,Quatre jeunes de 19 à 23 ans ont été écroués m...,"nuit, jeunes, collectif",,0.0,actualité,0.891264,actualité
1,164,2b6ce23c0fbfd6e213b7cd196a4b12eda4323e322f5f35...,Libération,"En Moselle, une jeune femme tuée par son mari",1995,1,16,"Une femme de 23 ans, mère d'une fillette de 2 ...","mari, ans",,0.0,actualité,0.924156,actualité
2,166,1ff60f3c67772523234516266aea695c0cac0751676577...,Libération,"""Péché originel"": P. D. James plagie",1995,1,19,"La baronne James, reine du déjà vu? Cette rume...","baronne, déjà, édition, james, plagie, James_p...",,2.0,culture,0.357727,culture
3,170,5b65c6d7e4f5ad08803aeb5e7921b6cb6321f1fef50b7f...,Libération,Sarajevo sous le feu des Serbes,1995,2,28,La reprise massive des tirs serbes sur Sarajev...,"recrudescence, demirel, serbes, capitale, turc...",,3.0,politique,0.717922,politique
4,171,d82fc0248b80e33ca1fc2a0da41f749d51f53f2db569b4...,Libération,Violence familiale : aujourd'hui la plainte pa...,1995,3,2,"Jusqu'à la réforme du code pénal il y a un an,...","violence, victimes, pénal, loi, avis, médical,...",,3.0,politique,0.717879,politique
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12165,82156,e36f02ac7daaf39e2d3b66037ec45f1884593b1e6b73dd...,Libération,Devenir acteur,2024,12,4,"Et si, en chemin, l'éducation populaire avait ...","acteur, populaire, éducation",,1.0,analyse,0.584799,analyse
12166,82157,e021e6c61999c0fb494e2a49ed9614758d1e8318230a25...,Le Point,Quand l'air de Bohème souffle sur le Rocher,2024,12,4,"À 88 ans, Charles Dutoit a ralenti son activit...","air, dutoit, charles",,2.0,culture,0.916663,culture
12167,82172,ebd597bcedd9a52c354cd8337257ec2511ff3ae3b56296...,Libération,"A Marseille, devenir acteur de l’éducation pop...",2024,12,4,"Et si, en chemin, l’éducation populaire avait ...","acteur, populaire, l’éducation",,1.0,analyse,0.679225,analyse
12168,82173,20d7c133c259fe5e149d1cddfa8be63aa616233362596f...,Le Figaro,«Adèle Haenel et ses 12 ans étaient d’une sens...,2024,12,4,Au dernier étage d’un immeuble de la rue Jean-...,"violences, christophe, adèle, réalisateur, d’u...",,0.0,actualité,0.874708,actualité


In [48]:
df_all = df_all.drop(columns=['Unnamed: 0', 'theme', 'theme_pred_encoded', 'theme_pred', 'confidence'])


In [49]:
df_all

Unnamed: 0,identifiant,journal_clean,titre,annee,mois,jour,texte,keywords,theme_final
0,3a4723d9d754ca30c68ba8e420cd6683548f2af7ff7af3...,Libération,Viol collectif dans la nuit de la Saint-Sylves...,1995,1,6,Quatre jeunes de 19 à 23 ans ont été écroués m...,"nuit, jeunes, collectif",actualité
1,2b6ce23c0fbfd6e213b7cd196a4b12eda4323e322f5f35...,Libération,"En Moselle, une jeune femme tuée par son mari",1995,1,16,"Une femme de 23 ans, mère d'une fillette de 2 ...","mari, ans",actualité
2,1ff60f3c67772523234516266aea695c0cac0751676577...,Libération,"""Péché originel"": P. D. James plagie",1995,1,19,"La baronne James, reine du déjà vu? Cette rume...","baronne, déjà, édition, james, plagie, James_p...",culture
3,5b65c6d7e4f5ad08803aeb5e7921b6cb6321f1fef50b7f...,Libération,Sarajevo sous le feu des Serbes,1995,2,28,La reprise massive des tirs serbes sur Sarajev...,"recrudescence, demirel, serbes, capitale, turc...",politique
4,d82fc0248b80e33ca1fc2a0da41f749d51f53f2db569b4...,Libération,Violence familiale : aujourd'hui la plainte pa...,1995,3,2,"Jusqu'à la réforme du code pénal il y a un an,...","violence, victimes, pénal, loi, avis, médical,...",politique
...,...,...,...,...,...,...,...,...,...
12165,e36f02ac7daaf39e2d3b66037ec45f1884593b1e6b73dd...,Libération,Devenir acteur,2024,12,4,"Et si, en chemin, l'éducation populaire avait ...","acteur, populaire, éducation",analyse
12166,e021e6c61999c0fb494e2a49ed9614758d1e8318230a25...,Le Point,Quand l'air de Bohème souffle sur le Rocher,2024,12,4,"À 88 ans, Charles Dutoit a ralenti son activit...","air, dutoit, charles",culture
12167,ebd597bcedd9a52c354cd8337257ec2511ff3ae3b56296...,Libération,"A Marseille, devenir acteur de l’éducation pop...",2024,12,4,"Et si, en chemin, l’éducation populaire avait ...","acteur, populaire, l’éducation",analyse
12168,20d7c133c259fe5e149d1cddfa8be63aa616233362596f...,Le Figaro,«Adèle Haenel et ses 12 ans étaient d’une sens...,2024,12,4,Au dernier étage d’un immeuble de la rue Jean-...,"violences, christophe, adèle, réalisateur, d’u...",actualité


In [50]:
actualité = df_all[df_all['theme_final']=='actualité']

In [52]:
actualité.to_csv('Data/articles_actualite.csv')