In [3]:
os.chdir('..')
%run Data/downoald_and_cleaning_data.ipynb

Downloading...
From (original): https://drive.google.com/uc?id=1UR_MC8Q5K4JSRnSUX5MQ-sIGPjLLnm9f
From (redirected): https://drive.google.com/uc?id=1UR_MC8Q5K4JSRnSUX5MQ-sIGPjLLnm9f&confirm=t&uuid=c7859b70-1121-41a3-a36d-05548203bb16
To: /home/onyxia/work/PSSD/Data/data.csv
100%|██████████| 363M/363M [00:04<00:00, 87.7MB/s] 


In [2]:
import torch
import tqdm
import os
import pickle
import numpy as np
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
df = pd.read_csv("data_clean.csv")

In [6]:
df.dropna(subset=['texte'], inplace=True)
df['texte'] = df['texte'].str.lower()

In [7]:
os.chdir('..')
df_themes = pd.read_csv('Annotations/theme.csv')

In [8]:
df_merged = df.merge(df_themes, on="identifiant", how="left", suffixes=("", "_manual"))
df_merged["texte_total"] = df_merged["titre"].fillna("") + " " + df_merged["texte"].fillna("")
df_merged = df_merged[~df_merged["theme"].isna()]
df_merged["theme"] = df_merged["theme"].replace({
    "tribune": "analyse",
    "société": "politique"
})

In [10]:
from transformers import CamembertTokenizer, CamembertModel

tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base").to(device)
model.eval()  

embeddings_dict = {}

for _, row in tqdm(df_merged.iterrows()):
    sentence = row['texte_total']
    theme = row['theme']
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=False)
    inputs = {key: val.to(device) for key, val in inputs.items()}  
    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()
    
    embeddings_dict[sentence] = {
        "embeddings": token_embeddings, 
        "theme": theme
    }

433it [00:13, 31.16it/s]


In [None]:
with open("Models/theme_token_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings_dict, f)

In [11]:
df_merged.drop('Unnamed: 0', axis=1, inplace=True)

In [13]:
counts = df_merged["theme"].value_counts()
valid_themes = counts[counts >= 2].index
df_filtered = df_merged[df_merged["theme"].isin(valid_themes)].copy()

Classes conservées : ['actualité', 'analyse', 'culture', 'politique']


In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_filtered["theme_encoded"] = label_encoder.fit_transform(df_filtered["theme"])
print("Classes conservées :", list(label_encoder.classes_))

In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df_filtered[["texte_total", "theme_encoded"]],
    test_size=0.2,
    stratify=df_filtered["theme_encoded"],
    random_state=42
)

In [None]:
n_classes = len(label_encoder.classes_)

In [14]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, "texte_total"]
        label = self.df.loc[idx, "theme_encoded"]

        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)  # correction ici
        }

In [15]:
from torch.utils.data import DataLoader

batch_size = 64

train_dataset = TextDataset(train_df, tokenizer)
test_dataset = TextDataset(test_df, tokenizer)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,       
    pin_memory=True      
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)


In [16]:
class CamembertCNNLSTMClassifier(nn.Module):
    def __init__(self, conv_out_dim=256, hidden_dim=128, num_classes=4):
        
        super().__init__()
        self.backbone = CamembertModel.from_pretrained("camembert-base")
        for param in self.backbone.parameters():
            param.requires_grad = False
        self.conv1d = nn.Conv1d(in_channels=768, out_channels=conv_out_dim, kernel_size=3, padding=1)
        self.relu_conv = nn.ReLU()
        self.lstm1 = nn.LSTM(input_size=conv_out_dim, hidden_size=hidden_dim,
                             batch_first=True, bidirectional=True)
        self.relu = nn.ReLU()
        self.lstm2 = nn.LSTM(input_size=hidden_dim * 2, hidden_size=hidden_dim,
                             batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        
        with torch.no_grad():
            outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state              
        x = sequence_output.permute(0, 2, 1)                     
        x = self.conv1d(x)                                       
        x = self.relu_conv(x)
        x = x.permute(0, 2, 1)                                   
        lstm_out1, _ = self.lstm1(x)
        relu_out = self.relu(lstm_out1)
        lstm_out2, _ = self.lstm2(relu_out)
        cls_token_out = lstm_out2[:, 0, :]                       
        logits = self.classifier(cls_token_out)                
        
        return logits


In [17]:
class_counts = train_df['theme_encoded'].value_counts().sort_index().values
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float)
class_weights = class_weights / class_weights.sum() * len(class_counts) 
class_weights = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [18]:
n_classes = len(label_encoder.classes_)
print(f"Nombre de classes : {n_classes}")
print("Valeurs des labels dans le jeu de test :", set(test_df["theme_encoded"]))

Nombre de classes : 4
Valeurs des labels dans le jeu de test : {0, 1, 2, 3}


In [19]:
model = CamembertCNNLSTMClassifier().to(device)
for param in model.backbone.parameters():
    param.requires_grad = False
for param in model.backbone.encoder.layer[-1].parameters():
    param.requires_grad = True
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-4
)

In [20]:
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F

num_epochs = 25

for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)  # ← attention, le champ correct ici

        logits = model(input_ids, attention_mask)  # [batch, 6]
        loss = criterion(logits, labels)           # labels : [batch] (entiers 0–5)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Évaluation
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch+1}/{num_epochs} — Loss: {total_loss/len(train_loader):.4f} | "
          f"Accuracy: {accuracy:.4f} | F1-macro: {f1:.4f}")


  4%|▍         | 1/25 [00:07<02:54,  7.27s/it]

Epoch 1/25 — Loss: 1.3868 | Accuracy: 0.1264 | F1-macro: 0.0561


  8%|▊         | 2/25 [00:14<02:44,  7.17s/it]

Epoch 2/25 — Loss: 1.3853 | Accuracy: 0.1264 | F1-macro: 0.0561


 12%|█▏        | 3/25 [00:21<02:37,  7.16s/it]

Epoch 3/25 — Loss: 1.3825 | Accuracy: 0.1264 | F1-macro: 0.0561


 16%|█▌        | 4/25 [00:28<02:31,  7.23s/it]

Epoch 4/25 — Loss: 1.3790 | Accuracy: 0.3908 | F1-macro: 0.3103


 20%|██        | 5/25 [00:35<02:23,  7.19s/it]

Epoch 5/25 — Loss: 1.3743 | Accuracy: 0.5747 | F1-macro: 0.5216


 24%|██▍       | 6/25 [00:43<02:17,  7.26s/it]

Epoch 6/25 — Loss: 1.3660 | Accuracy: 0.5862 | F1-macro: 0.5221


 28%|██▊       | 7/25 [00:50<02:09,  7.21s/it]

Epoch 7/25 — Loss: 1.3537 | Accuracy: 0.5977 | F1-macro: 0.5128


 32%|███▏      | 8/25 [00:57<02:02,  7.21s/it]

Epoch 8/25 — Loss: 1.3338 | Accuracy: 0.5747 | F1-macro: 0.4898


 36%|███▌      | 9/25 [01:04<01:54,  7.15s/it]

Epoch 9/25 — Loss: 1.2913 | Accuracy: 0.4713 | F1-macro: 0.3024


 40%|████      | 10/25 [01:11<01:47,  7.17s/it]

Epoch 10/25 — Loss: 1.2336 | Accuracy: 0.4828 | F1-macro: 0.3264


 44%|████▍     | 11/25 [01:19<01:40,  7.16s/it]

Epoch 11/25 — Loss: 1.1525 | Accuracy: 0.5172 | F1-macro: 0.3719


 48%|████▊     | 12/25 [01:26<01:33,  7.16s/it]

Epoch 12/25 — Loss: 1.0662 | Accuracy: 0.5977 | F1-macro: 0.5386


 52%|█████▏    | 13/25 [01:33<01:26,  7.20s/it]

Epoch 13/25 — Loss: 1.0061 | Accuracy: 0.5977 | F1-macro: 0.5422


 56%|█████▌    | 14/25 [01:40<01:19,  7.23s/it]

Epoch 14/25 — Loss: 0.8852 | Accuracy: 0.6092 | F1-macro: 0.5735


 60%|██████    | 15/25 [01:48<01:12,  7.24s/it]

Epoch 15/25 — Loss: 0.8447 | Accuracy: 0.6552 | F1-macro: 0.6231


 64%|██████▍   | 16/25 [01:55<01:05,  7.24s/it]

Epoch 16/25 — Loss: 0.7813 | Accuracy: 0.6437 | F1-macro: 0.6193


 68%|██████▊   | 17/25 [02:02<00:57,  7.22s/it]

Epoch 17/25 — Loss: 0.7528 | Accuracy: 0.6322 | F1-macro: 0.5821


 72%|███████▏  | 18/25 [02:09<00:50,  7.24s/it]

Epoch 18/25 — Loss: 0.7389 | Accuracy: 0.6782 | F1-macro: 0.6694


 76%|███████▌  | 19/25 [02:16<00:43,  7.24s/it]

Epoch 19/25 — Loss: 0.6976 | Accuracy: 0.5977 | F1-macro: 0.5587


 80%|████████  | 20/25 [02:24<00:36,  7.25s/it]

Epoch 20/25 — Loss: 0.6045 | Accuracy: 0.7011 | F1-macro: 0.6932


 84%|████████▍ | 21/25 [02:31<00:28,  7.24s/it]

Epoch 21/25 — Loss: 0.5773 | Accuracy: 0.6552 | F1-macro: 0.6054


 88%|████████▊ | 22/25 [02:38<00:21,  7.29s/it]

Epoch 22/25 — Loss: 0.5869 | Accuracy: 0.6782 | F1-macro: 0.6597


 92%|█████████▏| 23/25 [02:46<00:14,  7.36s/it]

Epoch 23/25 — Loss: 0.5306 | Accuracy: 0.6897 | F1-macro: 0.6389


 96%|█████████▌| 24/25 [02:53<00:07,  7.43s/it]

Epoch 24/25 — Loss: 0.4961 | Accuracy: 0.7126 | F1-macro: 0.6936


100%|██████████| 25/25 [03:01<00:00,  7.26s/it]

Epoch 25/25 — Loss: 0.5448 | Accuracy: 0.7241 | F1-macro: 0.6657





In [21]:
torch.save(model.state_dict(), "Models/camembert_cnn_lstm_weights.pth")

In [22]:
def predict_theme(text, model, tokenizer, device, label_encoder=None):
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=256).to(device)

    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])  # shape: [1, 6]

    prediction_idx = torch.argmax(logits, dim=1).item()

    if label_encoder is not None:
        prediction_label = label_encoder.inverse_transform([prediction_idx])[0]
        return prediction_label, prediction_idx
    else:
        return prediction_idx


In [23]:
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

   actualité       0.71      0.87      0.78        23
     analyse       0.60      0.27      0.38        11
     culture       0.86      0.73      0.79        26
   politique       0.66      0.78      0.71        27

    accuracy                           0.72        87
   macro avg       0.71      0.66      0.67        87
weighted avg       0.73      0.72      0.71        87



### Inference

In [24]:
class InferenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Forcer la conversion en string et remplacer les NaN
        text = str(self.df.loc[idx, "texte"]) if pd.notna(self.df.loc[idx, "texte"]) else ""
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0)
        }


In [37]:
df = pd.read_csv('Data/data_clean.csv')
df_all = df.merge(df_themes, on = 'identifiant', how = 'left')
df_to_predict = df_all[df_all['theme'].isna() & df_all['texte'].notna()].copy()
df_to_predict = df_to_predict.reset_index() 

In [39]:
inference_dataset = InferenceDataset(df_to_predict, tokenizer)
inference_loader = DataLoader(inference_dataset, batch_size=32, shuffle=False)

In [42]:
df_all.loc[df_to_predict["index"], "theme_pred_encoded"] = all_preds
df_all.loc[df_to_predict["index"], "theme_pred"] = label_encoder.inverse_transform(all_preds)
df_all.loc[df_to_predict["index"], "confidence"] = all_probs

df_all["theme_final"] = df_all["theme"].fillna(df_all["theme_pred"])

In [40]:
from torch.nn.functional import softmax

model.eval()
all_preds = []
all_probs = []

with torch.no_grad():
    for batch in tqdm(inference_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids, attention_mask)            
        probs = softmax(logits, dim=1)                       

        pred_classes = torch.argmax(probs, dim=1)            
        max_probs = torch.max(probs, dim=1).values           

        all_preds.extend(pred_classes.cpu().numpy())
        all_probs.extend(max_probs.cpu().numpy())



100%|██████████| 367/367 [07:46<00:00,  1.27s/it]


In [48]:
df_all = df_all.drop(columns=['Unnamed: 0', 'theme', 'theme_pred_encoded', 'theme_pred', 'confidence'])


In [50]:
actualité = df_all[df_all['theme_final']=='actualité']

In [52]:
actualité.to_csv('Data/articles_actualite.csv')