## Imports

In [15]:
import os
import re
import pandas as pd
import numpy as np

from tqdm import tqdm

from scipy.special import logsumexp
from scipy.special import expit

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Pré-processing

In [16]:
base_dir = "aclImdb"
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")

def load_reviews_from_dir(directory, label):
    data = []
    for fname in os.listdir(directory):
        if fname.endswith(".txt"):
            with open(os.path.join(directory, fname), encoding="utf-8") as f:
                text = f.read()
                data.append((text, label))
    return data

def load_all_data():
    train_pos = load_reviews_from_dir(os.path.join(train_dir, "pos"), 1)
    train_neg = load_reviews_from_dir(os.path.join(train_dir, "neg"), 0)
    test_pos  = load_reviews_from_dir(os.path.join(test_dir,  "pos"), 1)
    test_neg  = load_reviews_from_dir(os.path.join(test_dir,  "neg"), 0)
    return train_pos + train_neg + test_pos + test_neg, len(train_pos) + len(train_neg), len(test_pos) + len(test_neg)

raw_data, len_train, len_test = load_all_data()
df = pd.DataFrame(raw_data, columns=["text", "label"])

def clean_text(text):
    text = re.sub(r"<br\s*/?>", " ", text)  # remplace <br> et <br /> par un espace
    text = re.sub(r"\s+", " ", text)        # normalise les espaces
    return text.strip()

df["text"] = df["text"].apply(clean_text)

vectorizer = CountVectorizer(max_features=5050, token_pattern=r"(?u)\b\w+\b")
vectorizer.fit(df["text"])
full_vocab = vectorizer.get_feature_names_out()

# --- Exclusion des 50 mots les plus fréquents comme dans l'article ---
excluded_words = full_vocab[:50]
vocab = full_vocab[50:]  # Top 5000 mots après exclusion des 50 premiers

def filter_tokens(text, vocab_set):
    tokens = text.split()
    return " ".join([tok for tok in tokens if tok in vocab_set])

vocab_set = set(vocab)
df["filtered_text"] = df["text"].apply(lambda x: filter_tokens(x, vocab_set))

In [17]:
df

Unnamed: 0,text,label,filtered_text
0,Bromwell High is a cartoon comedy. It ran at t...,1,is a cartoon ran at the same time as some othe...
1,Homelessness (or Houselessness as George Carli...,1,as has been an issue for years but never a pla...
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,by dramatic lady have ever and love scenes in ...
3,This is easily the most underrated film inn th...,1,is easily the most underrated film the its doe...
4,This is not the typical Mel Brooks film. It wa...,1,is not the typical was much less slapstick tha...
...,...,...,...
49995,I occasionally let my kids watch this garbage ...,0,occasionally let my kids watch this garbage so...
49996,When all we have anymore is pretty much realit...,0,all we have anymore is pretty much reality sho...
49997,The basic genre is a thriller intercut with an...,0,basic genre is a thriller with an uncomfortabl...
49998,Four things intrigued me as to this film - fir...,0,things intrigued me as to this film it stars w...


In [18]:
print(df.head(10))

                                                text  label  \
0  Bromwell High is a cartoon comedy. It ran at t...      1   
1  Homelessness (or Houselessness as George Carli...      1   
2  Brilliant over-acting by Lesley Ann Warren. Be...      1   
3  This is easily the most underrated film inn th...      1   
4  This is not the typical Mel Brooks film. It wa...      1   
5  This isn't the comedic Robin Williams, nor is ...      1   
6  Yes its an art... to successfully make a slow ...      1   
7  In this "critically acclaimed psychological th...      1   
8  THE NIGHT LISTENER (2006) **1/2 Robin Williams...      1   
9  You know, Robin Williams, God bless him, is co...      1   

                                       filtered_text  
0  is a cartoon ran at the same time as some othe...  
1  as has been an issue for years but never a pla...  
2  by dramatic lady have ever and love scenes in ...  
3  is easily the most underrated film the its doe...  
4  is not the typical was much 

## Entraînement des vecteurs de mots (non-supervisé)

In [None]:
beta = 50  # dimension des vecteurs de mots
lambda_reg = 0.01
nu_reg = 0.001
epochs = 3
learning_rate = 0.01

vectorizer = CountVectorizer(vocabulary=list(vocab))
X = vectorizer.transform(df["filtered_text"])
X = X.toarray()  # shape: (n_docs, vocab_size)

n_docs, vocab_size = X.shape

R = np.random.normal(0, 0.01, size=(beta, vocab_size))
b = np.zeros(vocab_size)

theta = np.random.normal(0, 0.01, size=(n_docs, beta))

def softmax_probs(theta_k, R, b):
    logits = np.dot(theta_k, R) + b
    logits = logits - np.max(logits)  # stabilité numérique
    exps = np.exp(logits)
    return exps / np.sum(exps)

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    total_log_likelihood = 0.0
    for k in tqdm(range(n_docs)):
        x_k = X[k]  # comptage des mots
        if x_k.sum() == 0:
            continue
        # E-step : optimiser θ_k
        for _ in range(3):
            probs = softmax_probs(theta[k], R, b)
            grad_theta = R @ (x_k - probs * x_k.sum()) - lambda_reg * theta[k]
            theta[k] += learning_rate * grad_theta

        # Calcul du log-likelihood pour ce document
        log_probs = np.dot(x_k, np.log(probs + 1e-9))  # pour éviter log(0)
        total_log_likelihood += log_probs

    for k in range(n_docs):
        x_k = X[k]
        if x_k.sum() == 0:
            continue
        probs = softmax_probs(theta[k], R, b)
        err = x_k - probs * x_k.sum()
        grad_R = np.outer(theta[k], err)
        grad_b = err

        # mise à jour immédiate
        R += learning_rate * (grad_R - nu_reg * R)
        b += learning_rate * grad_b


    avg_ll = total_log_likelihood / n_docs
    print(f"[Epoch {epoch+1}] Avg semantic log-likelihood: {avg_ll:.4f}")

Epoch 1/3


100%|██████████| 50000/50000 [00:22<00:00, 2207.17it/s]


# Ajustement par algorithme génétique

In [None]:
def softmax_probs(theta_k, R, b):
    logits = np.dot(theta_k, R) + b
    logits = logits - np.max(logits)  # stabilité numérique
    exps = np.exp(logits)
    return exps / np.sum(exps)

class Model:
    def __init__(self, lambda_reg=0.1, nu_reg=0.1, learning_rate=0.01, beta=10):
        self.lambda_reg = lambda_reg
        self.nu_reg = nu_reg
        self.learning_rate = learning_rate
        self.beta = beta

    def train(self, vocab, filtered_texts, epochs=3):
        vectorizer = CountVectorizer(vocabulary=list(vocab))
        X = vectorizer.fit_transform(filtered_texts)
        X = X.toarray()  # shape: (n_docs, vocab_size)

        n_docs, vocab_size = X.shape

        self.R = np.random.normal(0, 0.01, size=(self.beta, vocab_size))
        self.b = np.zeros(vocab_size)

        theta = np.random.normal(0, 0.01, size=(n_docs, self.beta))

        for epoch in range(epochs):
            print(f"Epoch {epoch+1}/{epochs}")
            total_log_likelihood = 0.0
            for k in tqdm(range(n_docs)):
                x_k = X[k]
                if x_k.sum() == 0:
                    continue
                # E-step : optimiser θ_k
                for _ in range(3):
                    probs = softmax_probs(theta[k], self.R, self.b)
                    grad_theta = self.R @ (x_k - probs * x_k.sum()) - self.lambda_reg * theta[k]
                    theta[k] += self.learning_rate * grad_theta

                # Log-likelihood
                log_probs = np.dot(x_k, np.log(probs + 1e-9))
                total_log_likelihood += log_probs

            for k in range(n_docs):
                x_k = X[k]
                if x_k.sum() == 0:
                    continue
                probs = softmax_probs(theta[k], self.R, self.b)
                err = x_k - probs * x_k.sum()
                grad_R = np.outer(theta[k], err)
                grad_b = err

                self.R += self.learning_rate * (grad_R - self.nu_reg * self.R)
                self.b += self.learning_rate * grad_b

            avg_ll = total_log_likelihood / n_docs
            print(f"[Epoch {epoch+1}] Avg semantic log-likelihood: {avg_ll:.4f}")
        
        self.avg_log_likelihood = avg_ll

    def compute_objective(self):
        return self.avg_log_likelihood - self.lambda_reg * np.sum(self.R**2) - self.nu_reg * np.sum(self.b**2)

In [None]:
import random

# Espace de recherche des hyperparamètres
param_space = {
    'beta': [25, 50, 75, 100],
    'lambda_reg': [1e-4, 1e-3, 1e-2, 1e-1],
    'nu_reg': [1e-5, 1e-4, 1e-3, 1e-2],
    'learning_rate': [0.001, 0.005, 0.01, 0.05],
}

# Génère un individu
def generate_individual():
    return {
        'beta': random.choice(param_space['beta']),
        'lambda_reg': random.choice(param_space['lambda_reg']),
        'nu_reg': random.choice(param_space['nu_reg']),
        'learning_rate': random.choice(param_space['learning_rate']),
    }

# Évalue un individu (doit appeler ton modèle ici)
def evaluate(individual, vocab, text):
    model = Model(
        beta=individual['beta'],
        lambda_reg=individual['lambda_reg'],
        nu_reg=individual['nu_reg'],
        learning_rate=individual['learning_rate']
    )
    model.train(vocab, text, epochs=3)
    return model.compute_objective()  # À maximiser

# Crossover
def crossover(parent1, parent2):
    child = {}
    for key in parent1:
        child[key] = random.choice([parent1[key], parent2[key]])
    return child

# Mutation
def mutate(individual, mutation_rate=0.1):
    for key in individual:
        if random.random() < mutation_rate:
            individual[key] = random.choice(param_space[key])
    return individual

# Algorithme principal
def genetic_search(vocab, text, generations=10, population_size=10):
    population = [generate_individual() for _ in range(population_size)]
    
    for generation in range(generations):
        scored = [(ind, evaluate(ind, vocab, text)) for ind in population]
        scored.sort(key=lambda x: x[1], reverse=True)  # Maximize objective

        print(f"\nGeneration {generation + 1}, Best score: {scored[0][1]:.4f}")
        print(f"Best individual: {scored[0][0]}\n")

        survivors = [ind for ind, _ in scored[:population_size // 2]]

        # Reproduction
        next_gen = survivors.copy()
        while len(next_gen) < population_size:
            parents = random.sample(survivors, 2)
            child = mutate(crossover(parents[0], parents[1]))
            next_gen.append(child)

        population = next_gen

    return scored[0]

best = genetic_search(vocab, df.iloc[:len_train]["filtered_text"], generations=10, population_size=10)
print(f"Best individual: {best}")

Epoch 1/3


100%|██████████| 25000/25000 [00:09<00:00, 2605.41it/s]


[Epoch 1] Avg semantic log-likelihood: -1324.0861
Epoch 2/3


100%|██████████| 25000/25000 [00:09<00:00, 2552.99it/s]


[Epoch 2] Avg semantic log-likelihood: -3178.9025
Epoch 3/3


100%|██████████| 25000/25000 [00:12<00:00, 2080.16it/s]


[Epoch 3] Avg semantic log-likelihood: -3136.9355
Epoch 1/3


100%|██████████| 25000/25000 [00:09<00:00, 2660.79it/s]


[Epoch 1] Avg semantic log-likelihood: -1324.3828
Epoch 2/3


100%|██████████| 25000/25000 [00:09<00:00, 2655.35it/s]


[Epoch 2] Avg semantic log-likelihood: -943.2644
Epoch 3/3


100%|██████████| 25000/25000 [00:09<00:00, 2581.17it/s]


[Epoch 3] Avg semantic log-likelihood: -951.5969
Epoch 1/3


100%|██████████| 25000/25000 [00:09<00:00, 2670.33it/s]


[Epoch 1] Avg semantic log-likelihood: -1324.0928
Epoch 2/3


100%|██████████| 25000/25000 [00:10<00:00, 2324.45it/s]


[Epoch 2] Avg semantic log-likelihood: -3190.9309
Epoch 3/3


100%|██████████| 25000/25000 [00:12<00:00, 2029.11it/s]


[Epoch 3] Avg semantic log-likelihood: -3144.3328
Epoch 1/3


100%|██████████| 25000/25000 [00:09<00:00, 2725.48it/s]


[Epoch 1] Avg semantic log-likelihood: -1324.4154
Epoch 2/3


100%|██████████| 25000/25000 [00:09<00:00, 2700.92it/s]


[Epoch 2] Avg semantic log-likelihood: -957.4974
Epoch 3/3


100%|██████████| 25000/25000 [00:09<00:00, 2617.73it/s]


[Epoch 3] Avg semantic log-likelihood: -949.2766
Epoch 1/3


100%|██████████| 25000/25000 [00:06<00:00, 4014.43it/s]


[Epoch 1] Avg semantic log-likelihood: -1324.3693
Epoch 2/3


100%|██████████| 25000/25000 [00:06<00:00, 4149.70it/s]


[Epoch 2] Avg semantic log-likelihood: -1065.2278
Epoch 3/3


100%|██████████| 25000/25000 [00:08<00:00, 2997.16it/s]


[Epoch 3] Avg semantic log-likelihood: -3170.7203
Epoch 1/3


100%|██████████| 25000/25000 [00:06<00:00, 4056.54it/s]


[Epoch 1] Avg semantic log-likelihood: -1324.2569
Epoch 2/3


100%|██████████| 25000/25000 [00:06<00:00, 4001.23it/s]


[Epoch 2] Avg semantic log-likelihood: -3183.7785
Epoch 3/3


100%|██████████| 25000/25000 [00:08<00:00, 2957.32it/s]


[Epoch 3] Avg semantic log-likelihood: -3146.1465
Epoch 1/3


100%|██████████| 25000/25000 [00:06<00:00, 3991.88it/s]


[Epoch 1] Avg semantic log-likelihood: -1324.2943
Epoch 2/3


100%|██████████| 25000/25000 [00:06<00:00, 3861.87it/s]


[Epoch 2] Avg semantic log-likelihood: -3183.5561
Epoch 3/3


100%|██████████| 25000/25000 [00:08<00:00, 2792.62it/s]


[Epoch 3] Avg semantic log-likelihood: -3196.8821
Epoch 1/3


100%|██████████| 25000/25000 [00:11<00:00, 2123.65it/s]


[Epoch 1] Avg semantic log-likelihood: -1323.8899
Epoch 2/3


100%|██████████| 25000/25000 [00:12<00:00, 2016.39it/s]


[Epoch 2] Avg semantic log-likelihood: -3169.4263
Epoch 3/3


100%|██████████| 25000/25000 [00:14<00:00, 1696.59it/s]


[Epoch 3] Avg semantic log-likelihood: -3197.0487
Epoch 1/3


100%|██████████| 25000/25000 [00:35<00:00, 699.73it/s]


[Epoch 1] Avg semantic log-likelihood: -1324.3545
Epoch 2/3


100%|██████████| 25000/25000 [00:35<00:00, 706.35it/s]


[Epoch 2] Avg semantic log-likelihood: -943.5800
Epoch 3/3


100%|██████████| 25000/25000 [00:37<00:00, 666.10it/s]


[Epoch 3] Avg semantic log-likelihood: -937.2420
Epoch 1/3


100%|██████████| 25000/25000 [00:09<00:00, 2557.12it/s]


KeyboardInterrupt: 

In [None]:
best

({'beta': 75, 'lambda_reg': 0.0001, 'nu_reg': 0.0001, 'learning_rate': 0.01},
 np.float64(-917.4686275109037))

In [None]:
model = Model(
    beta= best[0]['beta'], 
    lambda_reg = best[0]['lambda_reg'], 
    nu_reg = best[0]['nu_reg'], 
    learning_rate = best[0]['learning_rate']
)
model.train(vocab, df.iloc[:len_train]["filtered_text"], epochs=3)
model.compute_objective()

Epoch 1/3


 30%|██▉       | 14758/50000 [00:07<00:17, 2030.57it/s]


KeyboardInterrupt: 

# Test avec EarlyStop

In [None]:
def softmax_probs(theta_k, R, b):
    logits = np.dot(theta_k, R) + b
    logits = logits - np.max(logits)  # stabilité numérique
    exps = np.exp(logits)
    return exps / np.sum(exps)

class ModelWithEarlyStopping:
    def __init__(self, lambda_reg=0.1, nu_reg=0.1, learning_rate=0.01, beta=10):
        self.lambda_reg = lambda_reg
        self.nu_reg = nu_reg
        self.learning_rate = learning_rate
        self.beta = beta

    def train(self, vocab, filtered_texts, earlyStop=3, max_epochs=100):
        vectorizer = CountVectorizer(vocabulary=list(vocab))
        X = vectorizer.fit_transform(filtered_texts)
        X = X.toarray()

        n_docs, vocab_size = X.shape
        self.R = np.random.normal(0, 0.01, size=(self.beta, vocab_size))
        self.b = np.zeros(vocab_size)
        theta = np.random.normal(0, 0.01, size=(n_docs, self.beta))

        best_avg_ll = float('-inf')
        best_params = {}
        no_improve_count = 0

        for epoch in range(max_epochs):
            print(f"Epoch {epoch + 1}/{max_epochs}")
            total_log_likelihood = 0.0

            for k in range(n_docs):
                x_k = X[k]
                if x_k.sum() == 0:
                    continue

                # E-step : optimiser θ_k
                for _ in range(3):
                    probs = softmax_probs(theta[k], self.R, self.b)
                    grad_theta = self.R @ (x_k - probs * x_k.sum()) - self.lambda_reg * theta[k]
                    theta[k] += self.learning_rate * grad_theta

                log_probs = np.dot(x_k, np.log(probs + 1e-9))
                total_log_likelihood += log_probs

            for k in range(n_docs):
                x_k = X[k]
                if x_k.sum() == 0:
                    continue
                probs = softmax_probs(theta[k], self.R, self.b)
                err = x_k - probs * x_k.sum()
                grad_R = np.outer(theta[k], err)
                grad_b = err

                self.R += self.learning_rate * (grad_R - self.nu_reg * self.R)
                self.b += self.learning_rate * grad_b

            avg_ll = total_log_likelihood / n_docs
            print(f"[Epoch {epoch+1}] Avg semantic log-likelihood: {avg_ll:.4f}")

            # Early stopping check
            if avg_ll > best_avg_ll + 1e-4:  # petite tolérance
                best_avg_ll = avg_ll
                best_params = {
                    'R': self.R.copy(),
                    'b': self.b.copy(),
                    'theta': theta.copy()
                }
                no_improve_count = 0
            else:
                no_improve_count += 1
                if no_improve_count >= earlyStop:
                    print(f"Early stopping at epoch {epoch+1}. Best avg log-likelihood: {best_avg_ll:.4f}")
                    break

        # Restore best params
        self.R = best_params['R']
        self.b = best_params['b']
        self.theta = best_params['theta']
        self.avg_log_likelihood = best_avg_ll

    def compute_objective(self):
        return self.avg_log_likelihood - self.lambda_reg * np.sum(self.R**2) - self.nu_reg * np.sum(self.b**2)

In [None]:
model = ModelWithEarlyStopping(
    beta= best[0]['beta'], 
    lambda_reg = best[0]['lambda_reg'], 
    nu_reg = best[0]['nu_reg'], 
    learning_rate = best[0]['learning_rate']
)
model.train(vocab, df.iloc[:len_train]["filtered_text"])
model.compute_objective()

Epoch 1/100
[Epoch 1] Avg semantic log-likelihood: -1310.0769
Epoch 2/100
[Epoch 2] Avg semantic log-likelihood: -939.1448
Epoch 3/100
[Epoch 3] Avg semantic log-likelihood: -934.4805
Epoch 4/100
[Epoch 4] Avg semantic log-likelihood: -931.9069
Epoch 5/100
[Epoch 5] Avg semantic log-likelihood: -929.1385
Epoch 6/100
[Epoch 6] Avg semantic log-likelihood: -926.2104
Epoch 7/100
[Epoch 7] Avg semantic log-likelihood: -923.5449
Epoch 8/100
[Epoch 8] Avg semantic log-likelihood: -920.8669
Epoch 9/100
[Epoch 9] Avg semantic log-likelihood: -918.3053
Epoch 10/100
[Epoch 10] Avg semantic log-likelihood: -916.1097
Epoch 11/100
[Epoch 11] Avg semantic log-likelihood: -914.2885
Epoch 12/100
[Epoch 12] Avg semantic log-likelihood: -913.8383
Epoch 13/100
[Epoch 13] Avg semantic log-likelihood: -909.8718
Epoch 14/100
[Epoch 14] Avg semantic log-likelihood: -908.5739
Epoch 15/100
[Epoch 15] Avg semantic log-likelihood: -905.3351
Epoch 16/100
[Epoch 16] Avg semantic log-likelihood: -911.6369
Epoch 17/

np.float64(-912.5670846563029)

# Modèle supervisé

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

def softmax_probs(theta_k, R, b):
    logits = np.dot(theta_k, R) + b
    logits = logits - np.max(logits)  # stabilité numérique
    exps = np.exp(logits)
    return exps / np.sum(exps)

class ModelWithEarlyStopping:
    def __init__(self, lambda_reg=0.1, nu_reg=0.1, learning_rate=0.01, beta=10, alpha_sup=1.0, vocab=None):
        self.lambda_reg = lambda_reg
        self.nu_reg = nu_reg
        self.learning_rate = learning_rate
        self.beta = beta
        self.alpha_sup = alpha_sup  # pondération du terme supervisé
        self.vocab = vocab

    def train(self, filtered_texts, vocab=None, earlyStop=3, max_epochs=100):
        if vocab:
            self.vocab = vocab  # pour usage futur
        vocab = self.vocab
        vectorizer = CountVectorizer(vocabulary=list(vocab))
        X = vectorizer.fit_transform(filtered_texts).toarray()
        
        self.vectorizer = vectorizer

        n_docs, vocab_size = X.shape
        self.R = np.random.normal(0, 0.01, size=(self.beta, vocab_size))
        self.b = np.zeros(vocab_size)
        theta = np.random.normal(0, 0.01, size=(n_docs, self.beta))

        best_avg_ll = float('-inf')
        best_params = {}
        no_improve_count = 0

        for epoch in range(max_epochs):
            print(f"Epoch {epoch + 1}/{max_epochs}")
            total_log_likelihood = 0.0

            for k in range(n_docs):
                x_k = X[k]
                if x_k.sum() == 0:
                    continue

                for _ in range(3):
                    probs = softmax_probs(theta[k], self.R, self.b)
                    grad_theta = self.R @ (x_k - probs * x_k.sum()) - self.lambda_reg * theta[k]
                    theta[k] += self.learning_rate * grad_theta

                log_probs = np.dot(x_k, np.log(probs + 1e-9))
                total_log_likelihood += log_probs

            for k in range(n_docs):
                x_k = X[k]
                if x_k.sum() == 0:
                    continue
                probs = softmax_probs(theta[k], self.R, self.b)
                err = x_k - probs * x_k.sum()
                grad_R = np.outer(theta[k], err)
                grad_b = err

                self.R += self.learning_rate * (grad_R - self.nu_reg * self.R)
                self.b += self.learning_rate * grad_b

            avg_ll = total_log_likelihood / n_docs
            print(f"[Epoch {epoch+1}] Avg semantic log-likelihood: {avg_ll:.4f}")

            if avg_ll > best_avg_ll + 1e-4:
                best_avg_ll = avg_ll
                best_params = {
                    'R': self.R.copy(),
                    'b': self.b.copy(),
                    'theta': theta.copy()
                }
                no_improve_count = 0
            else:
                no_improve_count += 1
                if no_improve_count >= earlyStop:
                    print(f"Early stopping at epoch {epoch+1}. Best avg log-likelihood: {best_avg_ll:.4f}")
                    break

        self.R = best_params['R']
        self.b = best_params['b']
        self.theta = best_params['theta']
        self.avg_log_likelihood = best_avg_ll

    def train_classifier(self, y):
        """
        Entraîne une régression logistique binaire sur les vecteurs theta_k appris.
        """
        assert hasattr(self, 'theta'), "Train the model first to get theta."
        self.classifier = LogisticRegression()
        self.classifier.fit(self.theta, y)
        print("Régression logistique entraînée.")

    def fit(self, filtered_texts, y):
        """
        Entraîne le modèle et la régression logistique sur les textes filtrés et les labels.
        """
        self.train(filtered_texts)
        self.train_classifier(y)

    def predict(self, filtered_texts):
        """
        Prédit les labels binaires pour de nouveaux textes.
        """
        assert hasattr(self, 'classifier'), "Train the classifier first."
        X = self.vectorizer.transform(filtered_texts).toarray()
        n_docs = X.shape[0]
        theta = np.zeros((n_docs, self.beta))

        for k in range(n_docs):
            x_k = X[k]
            if x_k.sum() == 0:
                continue
            theta_k = np.zeros(self.beta)
            for _ in range(3):
                probs = softmax_probs(theta_k, self.R, self.b)
                grad_theta = self.R @ (x_k - probs * x_k.sum()) - self.lambda_reg * theta_k
                theta_k += self.learning_rate * grad_theta
            theta[k] = theta_k

        preds = self.classifier.predict(theta)
        return preds

    def compute_objective(self):
        return self.avg_log_likelihood - self.lambda_reg * np.sum(self.R**2) - self.nu_reg * np.sum(self.b**2)


NameError: name 'self' is not defined

In [None]:
from sklearn.metrics import classification_report
import pickle

In [None]:
model = ModelWithEarlyStopping(
    beta= best[0]['beta'], 
    lambda_reg = best[0]['lambda_reg'], 
    nu_reg = best[0]['nu_reg'], 
    learning_rate = best[0]['learning_rate'],
    vocab=vocab
)
model.fit(df.iloc[:len_train]["filtered_text"], df.iloc[:len_train]["label"])

In [None]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
print("Log-likelihood:", model.compute_objective())
preds = model.predict(df["filtered_text"].iloc[len_train:])
print(classification_report(df["label"].iloc[len_train:], preds, target_names=["neg", "pos"]))

## Ajustement avec supervision sentimentale

In [None]:
# Initialisation des paramètres de la régression logistique
psi = np.random.normal(0, 0.01, size=(beta,))
bc = 0.0
sentiment_lr = 0.1
sentiment_epochs = 5

# Normaliser les labels entre [0, 1]
df["score"] = df["label"]

# Entraînement simple
for epoch in range(sentiment_epochs):
    total_loss = 0
    for k in range(n_docs):
        x_k = X[k]
        if x_k.sum() == 0:
            continue
        # Représentation du doc = moyenne pondérée des vecteurs de mots
        doc_vec = (R @ x_k) / x_k.sum()
        pred = expit(psi @ doc_vec + bc)
        label = df["score"].iloc[k]

        # Gradient + update
        error = label - pred
        psi += sentiment_lr * error * doc_vec
        bc += sentiment_lr * error

        total_loss += - (label * np.log(pred + 1e-9) + (1 - label) * np.log(1 - pred + 1e-9))
    print(f"[Sentiment Epoch {epoch+1}] Loss: {total_loss/n_docs:.4f}")

[Sentiment Epoch 1] Loss: nan
[Sentiment Epoch 2] Loss: nan
[Sentiment Epoch 3] Loss: nan


KeyboardInterrupt: 

## Comparaison des performances

In [None]:
X_bow = X.copy()
y = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

#### Bag of words

In [None]:
clf = LinearSVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"[BoW] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[BoW] Accuracy: 0.8471


#### LSA

In [None]:
svd = TruncatedSVD(n_components=100)
X_lsa_train = svd.fit_transform(X_train)
X_lsa_test = svd.transform(X_test)

clf = LinearSVC()
clf.fit(X_lsa_train, y_train)
y_pred = clf.predict(X_lsa_test)
print(f"[LSA] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[LSA] Accuracy: 0.7704


#### LDA

In [None]:
lda = LatentDirichletAllocation(n_components=50, max_iter=10, random_state=42)
X_lda_train = lda.fit_transform(X_train)
X_lda_test = lda.transform(X_test)

clf = LinearSVC()
clf.fit(X_lda_train, y_train)
y_pred = clf.predict(X_lda_test)
print(f"[LDA] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[LDA] Accuracy: 0.8106


Modèle sémantique seul

In [None]:
def doc_features_from_R(X_data, R):
    sums = X_data.sum(axis=1, keepdims=True)
    sums[sums == 0] = 1  # évite division par zéro
    return (X_data @ R.T) / sums

X_r_train = doc_features_from_R(X_train, R)
X_r_test = doc_features_from_R(X_test, R)

clf = LinearSVC()
clf.fit(X_r_train, y_train)
y_pred = clf.predict(X_r_test)
print(f"[Semantic Only] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[Semantic Only] Accuracy: 0.5035


#### Modèle complet

In [None]:
def doc_sentiment_features(X_data, R, psi, bc):
    feats = doc_features_from_R(X_data, R)
    sentiment_score = expit(feats @ psi + bc).reshape(-1, 1)
    return np.hstack([feats, sentiment_score])  # concat ψ info

X_full_train = doc_sentiment_features(X_train, R, psi, bc)
X_full_test = doc_sentiment_features(X_test, R, psi, bc)

clf = LinearSVC()
clf.fit(X_full_train, y_train)
y_pred = clf.predict(X_full_test)
print(f"[Semantic + Sentiment] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[Semantic + Sentiment] Accuracy: 0.5035


#### Concat BoW et modèle sémantique seul

In [None]:
X_comb_train = np.hstack([X_r_train, X_train])
X_comb_test = np.hstack([X_r_test, X_test])

clf = LinearSVC()
clf.fit(X_comb_train, y_train)
y_pred = clf.predict(X_comb_test)
print(f"[Semantic + BoW] Accuracy: {accuracy_score(y_test, y_pred):.4f}")

[Semantic + BoW] Accuracy: 0.5035
