In [None]:
from pathlib import Path
import os
import re
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Very simple tokenizer
def tokenize(text):
    """Split raw review text into lowercase word tokens.

    HTML line breaks are replaced by a space first so that the tag name does
    not leak into the output as a spurious ``br`` token. The common spellings
    of the tag are recognised (``<br>``, ``<br/>``, ``<br />``, in any letter
    case), not only the exact string ``<br />``.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance. Each token is a
        maximal run of word characters (letters, digits, underscore), so
        punctuation is dropped and ``"it's"`` yields ``["it", "s"]``.
    """
    # Optional whitespace and an optional slash cover <br>, <br/> and <br />.
    text = re.sub(r"<br\s*/?>", " ", text, flags=re.IGNORECASE)
    return re.findall(r"\b\w+\b", text.lower())

# Vocabulary construction + encoding
def build_vocab(reviews, vocab_size=5000, min_freq=5):
    """Create a word -> integer id mapping from a collection of reviews.

    Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``. The remaining ids
    are handed out to words in order of decreasing corpus frequency (words
    with equal counts keep their first-seen order), skipping words seen fewer
    than ``min_freq`` times, until the mapping holds ``vocab_size`` entries in
    total (the two special tokens included).

    Args:
        reviews: Iterable of raw review strings.
        vocab_size: Maximum number of entries in the returned mapping.
        min_freq: Minimum number of occurrences for a word to be kept.

    Returns:
        A dict mapping each kept token to its integer id.
    """
    frequencies = {}
    for review in reviews:
        for token in tokenize(review):
            if token in frequencies:
                frequencies[token] += 1
            else:
                frequencies[token] = 1

    # The sort is stable, so tied words stay in the order they were first seen.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)

    vocab = {"<PAD>": 0, "<UNK>": 1}
    for token, freq in ranked:
        if freq < min_freq or len(vocab) >= vocab_size:
            continue
        vocab[token] = len(vocab)
    return vocab

def encode(text, vocab, max_len=400):
    """Convert a text into a list of vocabulary ids of length ``max_len``.

    The text is tokenized and cut to its first ``max_len`` tokens. Tokens that
    are missing from ``vocab`` are mapped to the ``<UNK>`` id, and texts
    shorter than ``max_len`` are right-padded with the ``<PAD>`` id.

    Args:
        text: The raw text to encode.
        vocab: Mapping from token to id; must contain ``<PAD>`` and ``<UNK>``.
        max_len: Length of the returned id list.

    Returns:
        A list of integer ids.
    """
    clipped = tokenize(text)[:max_len]

    ids = []
    for token in clipped:
        ids.append(vocab.get(token, vocab["<UNK>"]))

    shortfall = max_len - len(ids)
    if shortfall > 0:
        # Right-pad so every encoded text has the same length.
        ids.extend([vocab["<PAD>"]] * shortfall)
    return ids


In [None]:
class IMDBDataset(Dataset):
    """Map-style dataset that encodes review texts on the fly.

    Each item is a pair ``(token_ids, label)``: ``token_ids`` is the text
    encoded with ``encode`` to exactly ``max_len`` ids, and ``label`` is the
    matching entry of ``labels`` as a float tensor.

    Args:
        texts: Indexable collection of raw review strings.
        labels: Indexable collection of labels aligned with ``texts``.
        vocab: Token -> id mapping passed to ``encode``.
        max_len: Length every text is truncated or padded to. Defaults to 400,
            which is also the default of ``encode``.
    """

    def __init__(self, texts, labels, vocab, max_len=400):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` tensors for the example at ``idx``."""
        token_ids = encode(self.texts[idx], self.vocab, max_len=self.max_len)
        label = self.labels[idx]
        # The dtype is explicit because an empty id list would otherwise be
        # inferred as float32, which an embedding lookup rejects.
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(label, dtype=torch.float),
        )


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SentimentAwareEmbeddingModel(nn.Module):
    """Word-embedding model with a logistic sentiment head.

    A document is represented by the mean of the embeddings of its real
    (non-padding) tokens; a linear layer followed by a sigmoid turns that
    vector into the probability of positive sentiment.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Dimensionality of each word vector.
        padding_idx: Token id used for padding. Positions holding this id are
            excluded from the document average. The default 0 matches the
            ``<PAD>`` id assigned by ``build_vocab``. Pass ``None`` to average
            over every position.
    """

    def __init__(self, vocab_size, embedding_dim, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.biases = nn.Parameter(torch.zeros(vocab_size))  # b_w in the paper
        self.sentiment_regressor = nn.Linear(embedding_dim, 1)  # ψ and b_c

    def semantic_energy(self, word_vecs, theta):
        """Return the dot products ``word_vecs @ theta.T`` (θᵀ · φ_w).

        NOTE(review): with word_vecs of shape [B, L, D] and a 2-D theta of
        shape [B, D], the result has shape [B, L, B], i.e. every document's
        words against every theta. Confirm this is intended. This method is
        not called by ``forward``.
        """
        return torch.matmul(word_vecs, theta.T)

    def forward(self, docs, thetas):
        """Predict the probability of positive sentiment for each document.

        Args:
            docs: Integer token ids, shape [batch, max_len].
            thetas: Accepted for interface compatibility; not used here.

        Returns:
            Tensor of shape [batch] with probabilities in (0, 1).
        """
        word_vecs = self.embeddings(docs)  # [B, L, D]

        if self.padding_idx is None:
            avg_vec = word_vecs.mean(dim=1)  # φ_doc ≈ mean(φ_w)
        else:
            # Average over real tokens only, so the prediction does not depend
            # on how much padding was appended to the document.
            mask = (docs != self.padding_idx).unsqueeze(-1).to(word_vecs.dtype)  # [B, L, 1]
            # clamp avoids a division by zero for documents that are all padding.
            token_counts = mask.sum(dim=1).clamp(min=1.0)  # [B, 1]
            avg_vec = (word_vecs * mask).sum(dim=1) / token_counts

        sentiment_logits = self.sentiment_regressor(avg_vec).squeeze(1)
        return torch.sigmoid(sentiment_logits)


In [None]:
def train(model, train_loader, optimizer, epochs=5):
    """Train ``model`` with binary cross-entropy and print per-epoch metrics.

    After each epoch one line is printed with the mean loss per example and
    the accuracy over all examples seen in that epoch. Both are weighted by
    batch size, so a smaller final batch does not skew them and the loss does
    not scale with the number of batches.

    Args:
        model: Module called as ``model(docs, theta)`` that returns
            probabilities; must expose ``model.embeddings.embedding_dim``.
        train_loader: Iterable of ``(docs, labels)`` batches; ``labels`` must
            be float tensors with the same shape as the model output.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over ``train_loader``.
    """
    # Batches are moved to the device the model lives on (CPU when the model
    # has no parameters).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(epochs):
        model.train()
        loss_sum, n_correct, n_seen = 0.0, 0, 0
        for docs, labels in train_loader:
            docs, labels = docs.to(device), labels.to(device)
            batch_size = docs.shape[0]
            # NOTE(review): theta is re-sampled for every batch and is not
            # registered with the optimizer, so it is never learned.
            theta = torch.randn(
                batch_size,
                model.embeddings.embedding_dim,
                device=device,
                requires_grad=True,
            )
            optimizer.zero_grad()

            # Forward pass
            preds = model(docs, theta)
            loss = F.binary_cross_entropy(preds, labels)
            loss.backward()
            optimizer.step()

            # loss is the batch mean; scale it back to a sum so that the epoch
            # figure is a true per-example average.
            loss_sum += loss.item() * batch_size
            n_correct += ((preds > 0.5) == labels).sum().item()
            n_seen += batch_size

        # An empty loader reports zeros instead of dividing by zero.
        denom = max(n_seen, 1)
        print(f"Epoch {epoch+1} - Loss: {loss_sum / denom:.4f} - Acc: {n_correct / denom:.4f}")
