## Imports

In [1]:
!pip install -r requirements.txt



In [9]:
import os
import re
import pandas as pd
import numpy as np

from tqdm import tqdm

from scipy.special import logsumexp
from scipy.special import expit

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Pré-processing

In [5]:
base_dir = "aclImdb"
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")

def load_reviews_from_dir(directory, label):
    data = []
    for fname in os.listdir(directory):
        if fname.endswith(".txt"):
            with open(os.path.join(directory, fname), encoding="utf-8") as f:
                text = f.read()
                data.append((text, label))
    return data

def load_all_data():
    train_pos = load_reviews_from_dir(os.path.join(train_dir, "pos"), 1)
    train_neg = load_reviews_from_dir(os.path.join(train_dir, "neg"), 0)
    test_pos  = load_reviews_from_dir(os.path.join(test_dir,  "pos"), 1)
    test_neg  = load_reviews_from_dir(os.path.join(test_dir,  "neg"), 0)
    return train_pos + train_neg + test_pos + test_neg

raw_data = load_all_data()
df = pd.DataFrame(raw_data, columns=["text", "label"])

def clean_text(text):
    text = re.sub(r"<br\s*/?>", " ", text)  # remplace <br> et <br /> par un espace
    text = re.sub(r"\s+", " ", text)        # normalise les espaces
    return text.strip()

df["text"] = df["text"].apply(clean_text)

vectorizer = CountVectorizer(max_features=5050, token_pattern=r"(?u)\b\w+\b")
vectorizer.fit(df["text"])
full_vocab = vectorizer.get_feature_names_out()

# --- Exclusion des 50 mots les plus fréquents comme dans l'article ---
excluded_words = full_vocab[:50]
vocab = full_vocab[50:]  # Top 5000 mots après exclusion des 50 premiers

def filter_tokens(text, vocab_set):
    tokens = text.split()
    return " ".join([tok for tok in tokens if tok in vocab_set])

vocab_set = set(vocab)
df["filtered_text"] = df["text"].apply(lambda x: filter_tokens(x, vocab_set))


In [6]:
print(df.head(10))

                                                text  label  \
0  Bromwell High is a cartoon comedy. It ran at t...      1   
1  Homelessness (or Houselessness as George Carli...      1   
2  Brilliant over-acting by Lesley Ann Warren. Be...      1   
3  This is easily the most underrated film inn th...      1   
4  This is not the typical Mel Brooks film. It wa...      1   
5  This isn't the comedic Robin Williams, nor is ...      1   
6  Yes its an art... to successfully make a slow ...      1   
7  In this "critically acclaimed psychological th...      1   
8  THE NIGHT LISTENER (2006) **1/2 Robin Williams...      1   
9  You know, Robin Williams, God bless him, is co...      1   

                                       filtered_text  
0  is a cartoon ran at the same time as some othe...  
1  as has been an issue for years but never a pla...  
2  by dramatic lady have ever and love scenes in ...  
3  is easily the most underrated film the its doe...  
4  is not the typical was much 

## Entraînement des vecteurs de mots (non-supervisé)

In [None]:
beta = 50  # dimension des vecteurs de mots
lambda_reg = 0.01
nu_reg = 0.001
epochs = 3
learning_rate = 0.01

vectorizer = CountVectorizer(vocabulary=list(vocab))
X = vectorizer.transform(df["filtered_text"])
X = X.toarray()  # shape: (n_docs, vocab_size)

n_docs, vocab_size = X.shape

R = np.random.normal(0, 0.01, size=(beta, vocab_size))
b = np.zeros(vocab_size)

theta = np.random.normal(0, 0.01, size=(n_docs, beta))

def softmax_probs(theta_k, R, b):
    logits = np.dot(theta_k, R) + b
    logits = logits - np.max(logits)  # stabilité numérique
    exps = np.exp(logits)
    return exps / np.sum(exps)

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    total_log_likelihood = 0.0
    for k in tqdm(range(n_docs)):
        x_k = X[k]  # comptage des mots
        if x_k.sum() == 0:
            continue
        # E-step : optimiser θ_k
        for _ in range(3):
            probs = softmax_probs(theta[k], R, b)
            grad_theta = R @ (x_k - probs * x_k.sum()) - lambda_reg * theta[k]
            theta[k] += learning_rate * grad_theta

        # Calcul du log-likelihood pour ce document
        log_probs = np.dot(x_k, np.log(probs + 1e-9))  # pour éviter log(0)
        total_log_likelihood += log_probs

    for k in range(n_docs):
        x_k = X[k]
        if x_k.sum() == 0:
            continue
        probs = softmax_probs(theta[k], R, b)
        err = x_k - probs * x_k.sum()
        grad_R = np.outer(theta[k], err)
        grad_b = err

        # mise à jour immédiate
        R += learning_rate * (grad_R - nu_reg * R)
        b += learning_rate * grad_b


    avg_ll = total_log_likelihood / n_docs
    print(f"[Epoch {epoch+1}] Avg semantic log-likelihood: {avg_ll:.4f}")

Epoch 1/3


100%|██████████| 50000/50000 [00:28<00:00, 1745.98it/s]


[Epoch 1] Avg semantic log-likelihood: -1310.0647
Epoch 2/3


100%|██████████| 50000/50000 [00:28<00:00, 1746.61it/s]


## Ajustement avec supervision sentimentale

In [26]:
# Initialisation des paramètres de la régression logistique
psi = np.random.normal(0, 0.01, size=(beta,))
bc = 0.0
sentiment_lr = 0.1
sentiment_epochs = 5

# Normaliser les labels entre [0, 1]
df["score"] = df["label"]

# Entraînement simple
for epoch in range(sentiment_epochs):
    total_loss = 0
    for k in range(n_docs):
        x_k = X[k]
        if x_k.sum() == 0:
            continue
        # Représentation du doc = moyenne pondérée des vecteurs de mots
        doc_vec = (R @ x_k) / x_k.sum()
        pred = expit(psi @ doc_vec + bc)
        label = df["score"].iloc[k]

        # Gradient + update
        error = label - pred
        psi += sentiment_lr * error * doc_vec
        bc += sentiment_lr * error

        total_loss += - (label * np.log(pred + 1e-9) + (1 - label) * np.log(1 - pred + 1e-9))
    print(f"[Sentiment Epoch {epoch+1}] Loss: {total_loss/n_docs:.4f}")

[Sentiment Epoch 1] Loss: nan
[Sentiment Epoch 2] Loss: nan
[Sentiment Epoch 3] Loss: nan


KeyboardInterrupt: 

## Comparaison des performances

In [11]:
X_bow = X.copy()
y = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

#### Bag of words

In [12]:
clf = LinearSVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"[BoW] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[BoW] Accuracy: 0.8471


#### LSA

In [13]:
svd = TruncatedSVD(n_components=100)
X_lsa_train = svd.fit_transform(X_train)
X_lsa_test = svd.transform(X_test)

clf = LinearSVC()
clf.fit(X_lsa_train, y_train)
y_pred = clf.predict(X_lsa_test)
print(f"[LSA] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[LSA] Accuracy: 0.7704


#### LDA

In [14]:
lda = LatentDirichletAllocation(n_components=50, max_iter=10, random_state=42)
X_lda_train = lda.fit_transform(X_train)
X_lda_test = lda.transform(X_test)

clf = LinearSVC()
clf.fit(X_lda_train, y_train)
y_pred = clf.predict(X_lda_test)
print(f"[LDA] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[LDA] Accuracy: 0.8106


Modèle sémantique seul

In [17]:
def doc_features_from_R(X_data, R):
    sums = X_data.sum(axis=1, keepdims=True)
    sums[sums == 0] = 1  # évite division par zéro
    return (X_data @ R.T) / sums

X_r_train = doc_features_from_R(X_train, R)
X_r_test = doc_features_from_R(X_test, R)

clf = LinearSVC()
clf.fit(X_r_train, y_train)
y_pred = clf.predict(X_r_test)
print(f"[Semantic Only] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[Semantic Only] Accuracy: 0.5035


#### Modèle complet

In [18]:
def doc_sentiment_features(X_data, R, psi, bc):
    feats = doc_features_from_R(X_data, R)
    sentiment_score = expit(feats @ psi + bc).reshape(-1, 1)
    return np.hstack([feats, sentiment_score])  # concat ψ info

X_full_train = doc_sentiment_features(X_train, R, psi, bc)
X_full_test = doc_sentiment_features(X_test, R, psi, bc)

clf = LinearSVC()
clf.fit(X_full_train, y_train)
y_pred = clf.predict(X_full_test)
print(f"[Semantic + Sentiment] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[Semantic + Sentiment] Accuracy: 0.5035


#### Concat BoW et modèle sémantique seul

In [20]:
X_comb_train = np.hstack([X_r_train, X_train])
X_comb_test = np.hstack([X_r_test, X_test])

clf = LinearSVC()
clf.fit(X_comb_train, y_train)
y_pred = clf.predict(X_comb_test)
print(f"[Semantic + BoW] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[Semantic + BoW] Accuracy: 0.5035
