In [1]:
import torch
import tqdm
import os
import pickle
import numpy as np
import torch.nn as nn

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
df = pd.read_csv("data_clean.csv")

In [5]:
df.dropna(subset=['texte'], inplace=True)
df['texte'] = df['texte'].str.lower()

In [6]:
df_themes = pd.read_csv('Annotations/theme.csv')

In [7]:
df_merged = df.merge(df_themes, on="identifiant", how="left", suffixes=("", "_manual"))
df_merged["texte_total"] = df_merged["titre"].fillna("") + " " + df_merged["texte"].fillna("")
df_merged = df_merged[~df_merged["theme"].isna()]
df_merged["theme"] = df_merged["theme"].replace({
    "tribune": "analyse",
    "société": "politique"
})

In [9]:
from transformers import CamembertTokenizer, CamembertModel
from tqdm import tqdm
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base").to(device)
model.eval()  

embeddings_dict = {}

for _, row in tqdm(df_merged.iterrows()):
    sentence = row['texte_total']
    theme = row['theme']
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=False)
    inputs = {key: val.to(device) for key, val in inputs.items()}  
    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()
    
    embeddings_dict[sentence] = {
        "embeddings": token_embeddings, 
        "theme": theme
    }

433it [00:19, 22.43it/s]


In [10]:
with open("Models/theme_token_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings_dict, f)

In [11]:
df_merged.drop('Unnamed: 0', axis=1, inplace=True)

In [12]:
counts = df_merged["theme"].value_counts()
valid_themes = counts[counts >= 2].index
df_filtered = df_merged[df_merged["theme"].isin(valid_themes)].copy()

In [13]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_filtered["theme_encoded"] = label_encoder.fit_transform(df_filtered["theme"])
print("Classes conservées :", list(label_encoder.classes_))

Classes conservées : ['actualité', 'analyse', 'culture', 'politique']


In [14]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df_filtered[["texte_total", "theme_encoded"]],
    test_size=0.2,
    stratify=df_filtered["theme_encoded"],
    random_state=42
)

In [15]:
n_classes = len(label_encoder.classes_)

In [16]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.loc[idx, "texte_total"]
        label = self.df.loc[idx, "theme_encoded"]

        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long)  # correction ici
        }

In [17]:
from torch.utils.data import DataLoader

batch_size = 64

train_dataset = TextDataset(train_df, tokenizer)
test_dataset = TextDataset(test_df, tokenizer)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,       
    pin_memory=True      
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)


In [18]:
class CamembertCNNLSTMClassifier(nn.Module):
    def __init__(self, conv_out_dim=256, hidden_dim=128, num_classes=4):
        
        super().__init__()
        self.backbone = CamembertModel.from_pretrained("camembert-base")
        for param in self.backbone.parameters():
            param.requires_grad = False
        self.conv1d = nn.Conv1d(in_channels=768, out_channels=conv_out_dim, kernel_size=3, padding=1)
        self.relu_conv = nn.ReLU()
        self.lstm1 = nn.LSTM(input_size=conv_out_dim, hidden_size=hidden_dim,
                             batch_first=True, bidirectional=True)
        self.relu = nn.ReLU()
        self.lstm2 = nn.LSTM(input_size=hidden_dim * 2, hidden_size=hidden_dim,
                             batch_first=True, bidirectional=True)
        self.classifier = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        
        with torch.no_grad():
            outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state              
        x = sequence_output.permute(0, 2, 1)                     
        x = self.conv1d(x)                                       
        x = self.relu_conv(x)
        x = x.permute(0, 2, 1)                                   
        lstm_out1, _ = self.lstm1(x)
        relu_out = self.relu(lstm_out1)
        lstm_out2, _ = self.lstm2(relu_out)
        cls_token_out = lstm_out2[:, 0, :]                       
        logits = self.classifier(cls_token_out)                
        
        return logits


In [19]:
class_counts = train_df['theme_encoded'].value_counts().sort_index().values
class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float)
class_weights = class_weights / class_weights.sum() * len(class_counts) 
class_weights = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [20]:
n_classes = len(label_encoder.classes_)
print(f"Nombre de classes : {n_classes}")
print("Valeurs des labels dans le jeu de test :", set(test_df["theme_encoded"]))

Nombre de classes : 4
Valeurs des labels dans le jeu de test : {0, 1, 2, 3}


In [21]:
model = CamembertCNNLSTMClassifier().to(device)
for param in model.backbone.parameters():
    param.requires_grad = False
for param in model.backbone.encoder.layer[-1].parameters():
    param.requires_grad = True
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=1e-4
)

In [22]:
from sklearn.metrics import accuracy_score, f1_score
import torch.nn.functional as F

num_epochs = 25

for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)  # ← attention, le champ correct ici

        logits = model(input_ids, attention_mask)  # [batch, 6]
        loss = criterion(logits, labels)           # labels : [batch] (entiers 0–5)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Évaluation
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch+1}/{num_epochs} — Loss: {total_loss/len(train_loader):.4f} | "
          f"Accuracy: {accuracy:.4f} | F1-macro: {f1:.4f}")


  4%|▍         | 1/25 [00:10<04:14, 10.59s/it]

Epoch 1/25 — Loss: 1.3876 | Accuracy: 0.3103 | F1-macro: 0.1184


  8%|▊         | 2/25 [00:21<04:03, 10.57s/it]

Epoch 2/25 — Loss: 1.3849 | Accuracy: 0.3103 | F1-macro: 0.1184


 12%|█▏        | 3/25 [00:31<03:51, 10.50s/it]

Epoch 3/25 — Loss: 1.3832 | Accuracy: 0.3678 | F1-macro: 0.2045


 16%|█▌        | 4/25 [00:42<03:41, 10.55s/it]

Epoch 4/25 — Loss: 1.3803 | Accuracy: 0.5402 | F1-macro: 0.3812


 20%|██        | 5/25 [00:53<03:33, 10.66s/it]

Epoch 5/25 — Loss: 1.3753 | Accuracy: 0.5632 | F1-macro: 0.4437


 24%|██▍       | 6/25 [01:03<03:22, 10.68s/it]

Epoch 6/25 — Loss: 1.3673 | Accuracy: 0.6322 | F1-macro: 0.5822


 28%|██▊       | 7/25 [01:14<03:12, 10.67s/it]

Epoch 7/25 — Loss: 1.3539 | Accuracy: 0.6207 | F1-macro: 0.5718


 32%|███▏      | 8/25 [01:25<03:02, 10.72s/it]

Epoch 8/25 — Loss: 1.3350 | Accuracy: 0.6552 | F1-macro: 0.6337


 36%|███▌      | 9/25 [01:36<02:51, 10.74s/it]

Epoch 9/25 — Loss: 1.2967 | Accuracy: 0.5977 | F1-macro: 0.5612


 40%|████      | 10/25 [01:46<02:40, 10.73s/it]

Epoch 10/25 — Loss: 1.2344 | Accuracy: 0.5517 | F1-macro: 0.4784


 44%|████▍     | 11/25 [01:57<02:29, 10.68s/it]

Epoch 11/25 — Loss: 1.1457 | Accuracy: 0.5632 | F1-macro: 0.5011


 48%|████▊     | 12/25 [02:07<02:18, 10.64s/it]

Epoch 12/25 — Loss: 1.0727 | Accuracy: 0.5977 | F1-macro: 0.5504


 52%|█████▏    | 13/25 [02:18<02:08, 10.67s/it]

Epoch 13/25 — Loss: 1.0352 | Accuracy: 0.6782 | F1-macro: 0.6516


 56%|█████▌    | 14/25 [02:29<01:58, 10.75s/it]

Epoch 14/25 — Loss: 1.0142 | Accuracy: 0.6552 | F1-macro: 0.6113


 60%|██████    | 15/25 [02:40<01:47, 10.70s/it]

Epoch 15/25 — Loss: 0.9155 | Accuracy: 0.6437 | F1-macro: 0.6103


 64%|██████▍   | 16/25 [02:50<01:36, 10.70s/it]

Epoch 16/25 — Loss: 0.8375 | Accuracy: 0.6782 | F1-macro: 0.6659


 68%|██████▊   | 17/25 [03:01<01:25, 10.73s/it]

Epoch 17/25 — Loss: 0.7615 | Accuracy: 0.6322 | F1-macro: 0.5974


 72%|███████▏  | 18/25 [03:12<01:15, 10.76s/it]

Epoch 18/25 — Loss: 0.7589 | Accuracy: 0.7241 | F1-macro: 0.7040


 76%|███████▌  | 19/25 [03:23<01:04, 10.79s/it]

Epoch 19/25 — Loss: 0.7151 | Accuracy: 0.6667 | F1-macro: 0.6606


 80%|████████  | 20/25 [03:33<00:53, 10.75s/it]

Epoch 20/25 — Loss: 0.6684 | Accuracy: 0.6897 | F1-macro: 0.6723


 84%|████████▍ | 21/25 [03:44<00:42, 10.70s/it]

Epoch 21/25 — Loss: 0.6157 | Accuracy: 0.6552 | F1-macro: 0.6289


 88%|████████▊ | 22/25 [03:55<00:32, 10.73s/it]

Epoch 22/25 — Loss: 0.6112 | Accuracy: 0.5862 | F1-macro: 0.5597


 92%|█████████▏| 23/25 [04:06<00:21, 10.72s/it]

Epoch 23/25 — Loss: 0.5493 | Accuracy: 0.6322 | F1-macro: 0.5911


 96%|█████████▌| 24/25 [04:16<00:10, 10.67s/it]

Epoch 24/25 — Loss: 0.5085 | Accuracy: 0.7471 | F1-macro: 0.7260


100%|██████████| 25/25 [04:27<00:00, 10.69s/it]

Epoch 25/25 — Loss: 0.4982 | Accuracy: 0.7011 | F1-macro: 0.6420





In [23]:
torch.save(model.state_dict(), "Models/camembert_cnn_lstm_weights.pth")

In [24]:
def predict_theme(text, model, tokenizer, device, label_encoder=None):
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=256).to(device)

    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])  # shape: [1, 6]

    prediction_idx = torch.argmax(logits, dim=1).item()

    if label_encoder is not None:
        prediction_label = label_encoder.inverse_transform([prediction_idx])[0]
        return prediction_label, prediction_idx
    else:
        return prediction_idx


In [25]:
from sklearn.metrics import classification_report
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

   actualité       0.68      0.91      0.78        23
     analyse       0.43      0.27      0.33        11
     culture       0.86      0.69      0.77        26
   politique       0.68      0.70      0.69        27

    accuracy                           0.70        87
   macro avg       0.66      0.65      0.64        87
weighted avg       0.70      0.70      0.69        87



### Inference

In [47]:
import torch
from torch.utils.data import Dataset, DataLoader


class InferenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df  
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = str(self.df.iloc[idx]["texte"]) if pd.notna(self.df.iloc[idx]["texte"]) else ""
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0)
        }


In [48]:
df_all = df.merge(df_themes, on = 'identifiant', how = 'left')
df_to_predict = df_all[df_all["theme"].isna() & df_all["texte"].notna()].copy()
original_indices = df_to_predict.index
inference_dataset = InferenceDataset(df_to_predict, tokenizer)
inference_loader = DataLoader(inference_dataset, batch_size=32, shuffle=False)


In [50]:
from torch.nn.functional import softmax

model.eval()
all_preds = []
all_probs = []

with torch.no_grad():
    for batch in tqdm(inference_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids, attention_mask)            
        probs = softmax(logits, dim=1)                       

        pred_classes = torch.argmax(probs, dim=1)            
        max_probs = torch.max(probs, dim=1).values           

        all_preds.extend(pred_classes.cpu().numpy())
        all_probs.extend(max_probs.cpu().numpy())


100%|██████████| 367/367 [05:22<00:00,  1.14it/s]


In [51]:

assert len(original_indices) == len(all_preds), "Must have equal len keys and value when setting with an iterable"

# 6. Mise à jour de df_all
df_all.loc[original_indices, "theme_pred_encoded"] = all_preds
df_all.loc[original_indices, "theme_pred"] = label_encoder.inverse_transform(all_preds)
df_all.loc[original_indices, "confidence"] = all_probs

# 7. Remplissage de la colonne finale
df_all["theme_final"] = df_all["theme"].fillna(df_all["theme_pred"])


In [52]:
df_all = df_all.drop(columns=['Unnamed: 0', 'theme', 'theme_pred_encoded', 'theme_pred', 'confidence'])


In [53]:
actualité = df_all[df_all['theme_final']=='actualité']

In [54]:
actualité.to_csv('Data/articles_actualite.csv')

In [2]:
import os 
os.chdir('..')

'/home/onyxia/work/media_sexism_violence_treatment/Models'