In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import string

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertConfig
from transformers.models.bert.modeling_bert import BertEncoder
from sklearn.metrics import roc_auc_score

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
import os
from google.colab import files

# Nettoyer tous les fichiers du dossier /content (sauf les dossiers système)
for f in os.listdir("/content"):
    path = f"/content/{f}"
    if os.path.isfile(path):
        os.remove(path)

# Lancer l’upload interactif
print(" Veuillez téléverser vos fichiers maintenant.")
uploaded = files.upload()

# Afficher uniquement les noms des fichiers
print("\n Fichiers téléversés :")
for fname in uploaded.keys():
    print(fname)


 Veuillez téléverser vos fichiers maintenant.


Saving sample_submission.csv to sample_submission.csv
Saving test_essays.csv to test_essays.csv
Saving train_essays.csv to train_essays.csv
Saving train_prompts.csv to train_prompts.csv

 Fichiers téléversés :
sample_submission.csv
test_essays.csv
train_essays.csv
train_prompts.csv


In [3]:
TRAIN_PATH = "/content/train_essays.csv"
TEST_PATH = "/content/test_essays.csv"
PROMPT_PATH = "/content/train_prompts.csv"
SUB_PATH = "/content/sample_submission.csv"


src_train = pd.read_csv(TRAIN_PATH)
src_test = pd.read_csv(TEST_PATH)
src_prompt = pd.read_csv(PROMPT_PATH)
src_sub = pd.read_csv(SUB_PATH)

In [4]:
# Aperçu rapide
print(src_train.shape, src_test.shape, src_prompt.shape, src_sub.shape)
print("Extrait du train :")
src_train.sample(5)

(1378, 4) (3, 3) (2, 4) (3, 2)
Extrait du train :


Unnamed: 0,id,prompt_id,text,generated
705,82196b38,1,Most people were raised on fairness what every...,0
32,07433107,1,Voting. Voting is a pretty big deal that only ...,0
568,6aeed61c,1,The Electoral College is not a fair system for...,0
986,b08b6efc,0,Thinking of the past century ideas of driving ...,0
286,3953c14e,0,"Many countries are pushing towards less cars, ...",0


In [10]:
src_train["prompt_id"].unique()

array([0, 1])

In [12]:
src_train["generated"].unique()

array([0, 1])

In [14]:
src_train["text"][0][:1000]

'Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built the first ModelT. Cars have played a major role in our every day lives since then. But now, people are starting to question if limiting car usage would be a good thing. To me, limiting the use of cars might be a good thing to do.\n\nIn like matter of this, article, "In German Suburb, Life Goes On Without Cars," by Elizabeth Rosenthal states, how automobiles are the linchpin of suburbs, where middle class families from either Shanghai or Chicago tend to make their homes. Experts say how this is a huge impediment to current efforts to reduce greenhouse gas emissions from tailpipe. Passenger cars are responsible for 12 percent of greenhouse gas emissions in Europe...and up to 50 percent in some carintensive areas in the United States. Cars are the main reason for the greenhouse gas emissions because of a lot of people driving them around all the time getting where they need to go. Article

In [15]:
src_test.columns

Index(['id', 'prompt_id', 'text'], dtype='object')

In [16]:
src_prompt.columns

Index(['prompt_id', 'prompt_name', 'instructions', 'source_text'], dtype='object')

In [17]:
src_prompt.head()

Unnamed: 0,prompt_id,prompt_name,instructions,source_text
0,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,1,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...


In [5]:
# --- Étude statistique rapide ---

print("\n Distribution des classes (colonne 'generated') :")
print(src_train['generated'].value_counts())

print("\n Longueur moyenne des textes :")
src_train['text_length'] = src_train['text'].apply(len)
print(src_train['text_length'].describe())

print("\n Nombre de prompts uniques :", src_prompt['prompt_id'].nunique())
print(" Nombre de correspondances entre prompts et essais :", src_train['prompt_id'].nunique())



 Distribution des classes (colonne 'generated') :
generated
0    1375
1       3
Name: count, dtype: int64

 Longueur moyenne des textes :
count    1378.000000
mean     3169.050798
std       920.588198
min      1356.000000
25%      2554.250000
50%      2985.500000
75%      3623.750000
max      8436.000000
Name: text_length, dtype: float64

 Nombre de prompts uniques : 2
 Nombre de correspondances entre prompts et essais : 2


In [6]:
# Model preparation
# === Étape 3 : Préparer le modèle BERT ===
tokenizer_save_path = "bert_tokenizer"
model_save_path = "bert_model"

# Chargement du tokenizer BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Chargement du modèle BERT pour classification (utile pour le discriminateur plus tard)
pretrained_model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Extraire l’encodeur BERT uniquement (partie embeddings sans la tête de classification)
embedding_model = pretrained_model.bert

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Exemple de batch de texte (tu pourras adapter avec tout ton dataset plus tard)
sample_texts = src_train['text'].tolist()[:4]  # prend quelques exemples

# Tokenisation (traduction texte → input_ids + attention mask)
inputs = tokenizer(sample_texts, padding=True, truncation=True, return_tensors="pt")

# Extraire les embeddings depuis BERT
with torch.no_grad():
    outputs = embedding_model(**inputs)

# outputs.last_hidden_state → (batch_size, seq_len, hidden_dim)
# On peut prendre par exemple le [CLS] token comme représentation globale :
embeddings = outputs.last_hidden_state[:, 0, :]  # shape = (batch_size, hidden_dim)

print("Forme des embeddings extraits :", embeddings.shape)


Forme des embeddings extraits : torch.Size([4, 768])


In [9]:
# Tokeniser tous les textes de src_train
inputs = tokenizer(
    src_train["text"].tolist(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

# Passer dans le modèle BERT (sans tête de classification)
embedding_model.eval()
with torch.no_grad():
    outputs = embedding_model(**inputs)
    text_embeddings = outputs.last_hidden_state[:, 0, :]  # vecteur [CLS]

print(" Embeddings extraits :", text_embeddings.shape)


 Embeddings extraits : torch.Size([1378, 768])


In [10]:
"""# Parameter definition"""

train_batch_size = 32         # Taille de batch raisonnable pour BERT
test_batch_size = 64          # Plus grand pour les prédictions (pas de backprop)
lr = 2e-5                     # Taux d’apprentissage adapté à BERT et GAN
beta1 = 0.5                   # Classique pour Adam dans les GAN
nz = 100                      # Taille du vecteur latent (déjà défini)
num_epochs = 5                # À ajuster selon ton temps et overfitting...
num_hidden_layers = 2        # Nombre de couches dans les réseaux D et G
train_ratio = 1              # 1 étape de D pour 1 étape de G (GAN classique)

In [11]:
"""# Data Preparation"""

class GANDAIGDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

#Nombre total d'exemples
all_num = len(src_train)
train_ratio_split = 0.8
train_num = int(all_num * train_ratio_split)

test_num = all_num - train_num

# Division des embeddings et des labels
train_embeddings = text_embeddings[:train_num]
test_embeddings = text_embeddings[train_num:]

train_labels = src_train['generated'].values[:train_num]
test_labels = src_train['generated'].values[train_num:]

# On garde un test_set Pandas uniquement si nécessaire
train_set = src_train.iloc[:train_num]
test_set = pd.concat([
    src_train.iloc[train_num:],
]).reset_index(drop=True)

# Création des datasets PyTorch
train_dataset = GANDAIGDataset(train_embeddings, train_labels)
test_dataset = GANDAIGDataset(test_embeddings, test_labels)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False)

In [54]:
"""# Generator definition"""

config = BertConfig(num_hidden_layers=num_hidden_layers)

class Generator(nn.Module):
    def __init__(self, input_dim):
        super().__init__()

        # Étendre le vecteur latent z à une forme compatible pour convolution
        self.fc = nn.Linear(input_dim, 256 * 4)  # 256 canaux, longueur 4

        # Réseau de convolution transposée pour simuler une séquence (ex: 768 dim finale)
        self.conv_net = nn.Sequential(
            nn.ConvTranspose1d(256, 128, kernel_size=4, stride=2),  # out: (128, 10)
            nn.ReLU(),
            nn.ConvTranspose1d(128, 64, kernel_size=4, stride=2),   # out: (64, 22)
            nn.ReLU(),
            nn.ConvTranspose1d(64, 32, kernel_size=4, stride=2),    # out: (32, ~46)
            nn.ReLU(),
            nn.ConvTranspose1d(32, 1, kernel_size=4, stride=2),     # out: (1, ~96)
            nn.AdaptiveAvgPool1d(96),  # 🔧 pour garantir la taille
            nn.Flatten(),  # on aplatit en vecteur
            nn.Linear(96, 768)  # projection vers un vecteur BERT-like
        )

        # BertEncoder simulé (peut aider à affiner la structure de l'embedding généré)
        self.bert_encoder = BertEncoder(config)

    def forward(self, x):
        x = self.fc(x)                          # (batch, 256*4)
        x = x.view(-1, 256, 4)                  # (batch, channels, seq_len)
        x = self.conv_net(x)                    # (batch, 768)
        extended = x.unsqueeze(1)               # (batch, 1, 768)

        attention_mask = torch.ones((x.size(0), 1), dtype=torch.long, device=x.device)

        encoder_outputs = self.bert_encoder(
            hidden_states=extended,
            attention_mask=attention_mask
        )

        x = encoder_outputs.last_hidden_state[:, 0, :]  # ✅ juste le vecteur CLS
        return x


In [55]:
"""# Discriminator definition"""

from transformers import BertModel

class SumBertPooler(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        sum_hidden = hidden_states.sum(dim=1)                    # (batch, hidden_size)
        sum_mask = sum_hidden.sum(1).unsqueeze(1)                # (batch, 1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)               # éviter division par zéro
        mean_embeddings = sum_hidden / sum_mask
        return mean_embeddings



In [57]:
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, x):  # x est un vecteur de forme (batch_size, 768)
        return self.classifier(x)


In [59]:
def eval_auc(model):
    model.eval()

    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in test_loader:
            embeddings = batch[0].to(device)          # embeddings générés par BERT
            labels = batch[1].float().to(device)      # labels (0 ou 1)

            outputs = model(embeddings).squeeze()     # prédictions (probas)
            predictions.extend(outputs.cpu().numpy())
            actuals.extend(labels.cpu().numpy())

    auc = roc_auc_score(actuals, predictions)
    print("AUC:", round(auc, 4))
    return auc


In [60]:
def get_model_info_dict(model, epoch, auc_score):
    current_device = next(model.parameters()).device
    model.to('cpu')

    model_info = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'auc_score': auc_score,
    }

    model.to(current_device)
    return model_info


In [61]:
def preparation_embedding(texts):
    encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    input_ids = encodings['input_ids']
    token_type_ids = encodings['token_type_ids']
    embeded = embedding_model(input_ids=input_ids, token_type_ids=token_type_ids)
    return embeded

In [62]:
def GAN_step(optimizerG, optimizerD, netG, netD, real_data, label, epoch, i):
    netD.zero_grad()
    batch_size = real_data.size(0)

    # Assure-toi que les labels ont bien la forme (batch_size, 1)
    label = label.view(-1, 1)

    # --- Discriminateur sur données réelles ---
    output = netD(real_data)
    errD_real = criterion(output, label)
    errD_real.backward()
    D_x = output.mean().item()

    # --- Génération de données artificielles ---
    noise = torch.randn(batch_size, nz, device=device)
    fake_data = netG(noise)  # (batch_size, 768)
    label_fake = torch.zeros(batch_size, 1, device=device)  # faux = 0

    # --- Discriminateur sur données générées ---
    output = netD(fake_data.detach())
    errD_fake = criterion(output, label_fake)
    errD_fake.backward()
    D_G_z1 = output.mean().item()
    errD = errD_real + errD_fake
    optimizerD.step()

    # --- Générateur : essaye de tromper le discriminateur ---
    netG.zero_grad()
    label_real_for_G = torch.ones(batch_size, 1, device=device)  # objectif : faire croire que c’est réel
    output = netD(fake_data)
    errG = criterion(output, label_real_for_G)
    errG.backward()
    D_G_z2 = output.mean().item()
    optimizerG.step()

    if i % 50 == 0:
        print('[%d/%d][%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
              % (epoch, num_epochs, i, errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

    return optimizerG, optimizerD, netG, netD


In [47]:
def GAN_step(optimizerG, optimizerD, netG, netD, real_data, label, epoch, i):
    netD.zero_grad()
    batch_size = real_data.size(0)

    # Assure que les labels ont bien la forme (batch_size, 1)
    label = label.view(-1, 1)

    # --- Discriminateur sur données réelles ---
    output = netD(real_data)
    errD_real = criterion(output, label)
    errD_real.backward()
    D_x = output.mean().item()

    # --- Génération de données artificielles ---
    noise = torch.randn(batch_size, nz, device=device)
    fake_data = netG(noise)
    label_fake = torch.zeros(batch_size, 1, device=device)  # faux = 0

    # --- Discriminateur sur données générées ---
    output = netD(fake_data.detach())
    errD_fake = criterion(output, label_fake)
    errD_fake.backward()
    D_G_z1 = output.mean().item()
    errD = errD_real + errD_fake
    optimizerD.step()

    # --- Générateur : essaye de tromper le discriminateur ---
    netG.zero_grad()
    label_real_for_G = torch.ones(batch_size, 1, device=device)  # objectif : le faire croire que c’est vrai
    output = netD(fake_data)
    errG = criterion(output, label_real_for_G)
    errG.backward()
    D_G_z2 = output.mean().item()
    optimizerG.step()

    if i % 50 == 0:
        print('[%d/%d][%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
              % (epoch, num_epochs, i, errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

    return optimizerG, optimizerD, netG, netD


In [45]:
def GAN_step(optimizerG, optimizerD, netG, netD, real_data, label, epoch, i):
    netD.zero_grad()
    batch_size = real_data.size(0)

    output = netD(real_data)
    errD_real = criterion(output, label)
    errD_real.backward()
    D_x = output.mean().item()

    noise = torch.randn(batch_size, nz, device=device)
    fake_data = netG(noise).last_hidden_state
    label.fill_(1)
    output = netD(fake_data.detach())
    errD_fake = criterion(output, label)
    errD_fake.backward()
    D_G_z1 = output.mean().item()
    errD = errD_real + errD_fake
    optimizerD.step()

    netG.zero_grad()
    label.fill_(0)
    output = netD(fake_data)
    errG = criterion(output, label)
    errG.backward()
    D_G_z2 = output.mean().item()
    optimizerG.step()
    if i % 50 == 0:
        print('[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f')
#               % (epoch, num_epochs, i, len(train_loader), errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

    return optimizerG, optimizerD, netG, netD

In [63]:
# --- Initialisation des modèles ---
netG = Generator(nz).to(device)
netD = Discriminator().to(device)

criterion = nn.BCELoss()

optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))

In [64]:
# --- Entraînement ---
model_infos = []
for epoch in range(num_epochs):
    for i, data in enumerate(train_loader, 0):
        with torch.no_grad():
            embeded = data[0].to(device)

        optimizerG, optimizerD, netG, netD = GAN_step(
            optimizerG=optimizerG,
            optimizerD=optimizerD,
            netG=netG,
            netD=netD,
            real_data=embeded,
            label=data[1].float().to(device),
            epoch=epoch, i=i)

    auc_score = eval_auc(netD)
    model_infos.append(get_model_info_dict(netD, epoch, auc_score))

print('Train complete！')


[0/5][0] Loss_D: 1.3854 Loss_G: 0.7352 D(x): 0.5077 D(G(z)): 0.4913 / 0.4798
AUC: 0.2145
[1/5][0] Loss_D: 3.4390 Loss_G: 0.0586 D(x): 0.4043 D(G(z)): 0.9460 / 0.9431
AUC: 0.4109
[2/5][0] Loss_D: 2.9934 Loss_G: 0.0807 D(x): 0.3186 D(G(z)): 0.9264 / 0.9225
AUC: 0.4764
[3/5][0] Loss_D: 2.3399 Loss_G: 0.1482 D(x): 0.2624 D(G(z)): 0.8692 / 0.8622
AUC: 0.4727
[4/5][0] Loss_D: 2.0390 Loss_G: 0.1930 D(x): 0.2207 D(G(z)): 0.8329 / 0.8245
AUC: 0.5273
Train complete！


In [65]:
# Récupérer le modèle avec le meilleur AUC
max_auc_model_info = max(model_infos, key=lambda x: x['auc_score'])

In [66]:
model = Discriminator()
model.load_state_dict(max_auc_model_info['model_state_dict'])
model.to(device)
model.eval()

Discriminator(
  (classifier): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

In [67]:
class InferenceDataset(torch.utils.data.Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __getitem__(self, idx):
        return self.texts[idx]

    def __len__(self):
        return len(self.texts)


In [68]:
# Préparer le dataset d'inférence à partir du texte de test
sub_dataset = InferenceDataset(src_test["text"].tolist())

# Loader d'inférence (batch_size = 1 ou +)
inference_loader = DataLoader(sub_dataset, batch_size=16, shuffle=False)


In [72]:
sub_predictions = []

# Assure que le modèle d'embedding est sur le bon device
embedding_model.to(device)
model.to(device)
model.eval()

with torch.no_grad():
    for batch_texts in inference_loader:
        # Tokenisation
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        encoded = {k: v.to(device) for k, v in encoded.items()}  # ✅ déplace tout sur GPU

        # Embedding CLS
        outputs = embedding_model(**encoded)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # (batch, 768)

        # Prédiction
        probs = model(cls_embeddings)
        sub_predictions.extend(probs.cpu().numpy().flatten())


In [75]:
sub_ans_df = pd.DataFrame({
    "id": src_test["id"],
    "generated": sub_predictions
})
print(sub_ans_df.head())


         id  generated
0  0000aaaa   0.337146
1  1111bbbb   0.338393
2  2222cccc   0.345841
