In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Parameters
vocab_size = 500
embedding_dim = 50
n_docs = 1000
doc_length = 40

# Simulate a vocabulary: 0 to 499
# Create some words associated with sentiment
positive_words = set(random.sample(range(vocab_size), 50))  # 10% of vocab
negative_words = set(random.sample([w for w in range(vocab_size) if w not in positive_words], 50))

# Function to generate a document
def generate_doc(label):
    doc = []
    for _ in range(doc_length):
        if label == 1 and random.random() < 0.3:
            doc.append(random.choice(list(positive_words)))
        elif label == 0 and random.random() < 0.3:
            doc.append(random.choice(list(negative_words)))
        else:
            doc.append(random.randint(0, vocab_size - 1))
    return doc

# Generate corpus and labels
corpus = []
labels = []
for _ in range(n_docs):
    sentiment = random.randint(0, 1)
    doc = generate_doc(sentiment)
    corpus.append(doc)
    labels.append(sentiment)

# Split into train and test
corpus_train, corpus_test, labels_train, labels_test = train_test_split(corpus, labels, test_size=0.2, stratify=labels)

len(corpus_train), len(corpus_test), len(labels_train), len(labels_test)

(800, 200, 800, 200)

In [2]:
# Sentiment-Aware Word Vector Learning
# Based on "Learning Word Vectors for Sentiment Analysis" (Maas et al., ACL 2011)

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from scipy.special import softmax, expit as sigmoid

class SentimentWordVectorModel:
    def __init__(self, vocab_size, embedding_dim, alpha=1e-4):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.alpha = alpha

        # Initialize parameters
        self.phi = np.random.randn(vocab_size, embedding_dim) * 0.01  # word vectors
        self.b_w = np.zeros(vocab_size)  # word biases
        self.psi = np.random.randn(embedding_dim) * 0.01  # sentiment classifier weights
        self.b_c = 0.0  # sentiment classifier bias

    def document_log_likelihood(self, doc_word_indices, theta):
        logits = theta @ self.phi.T + self.b_w
        log_probs = logits - np.log(np.sum(np.exp(logits)))
        return np.sum(log_probs[doc_word_indices])

    def sentiment_log_likelihood(self, doc_word_indices, sentiment_label):
        word_vecs = self.phi[doc_word_indices]
        logits = word_vecs @ self.psi + self.b_c
        probs = sigmoid(logits)
        if sentiment_label == 1:
            return np.sum(np.log(probs + 1e-8))
        else:
            return np.sum(np.log(1 - probs + 1e-8))

    def total_loss(self, corpus, sentiment_labels, thetas):
        total = 0
        for i, doc in enumerate(corpus):
            total += self.document_log_likelihood(doc, thetas[i])
            total += self.sentiment_log_likelihood(doc, sentiment_labels[i])
        # Add L2 regularization
        total -= self.alpha * (np.sum(self.phi**2) + np.sum(self.psi**2))
        return -total  # negative log-likelihood

    def fit(self, corpus, sentiment_labels, n_iters=10):
        N = len(corpus)
        thetas = np.random.randn(N, self.embedding_dim) * 0.01

        for epoch in range(n_iters):
            for i, doc in enumerate(corpus):
                theta = thetas[i]
                grad_theta = self._grad_theta(doc, theta)
                thetas[i] += 0.01 * grad_theta

            grad_phi, grad_b_w, grad_psi, grad_b_c = self._grad_global(corpus, sentiment_labels, thetas)
            self.phi += 0.01 * grad_phi
            self.b_w += 0.01 * grad_b_w
            self.psi += 0.01 * grad_psi
            self.b_c += 0.01 * grad_b_c

            loss = self.total_loss(corpus, sentiment_labels, thetas)
            print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

        self.thetas = thetas  # Save document representations
        return self

    def _grad_theta(self, doc_indices, theta):
        logits = theta @ self.phi.T + self.b_w
        probs = softmax(logits)
        grad = np.zeros_like(theta)
        for i in doc_indices:
            grad += self.phi[i] - (probs @ self.phi)
        return grad - self.alpha * theta

    def _grad_global(self, corpus, sentiment_labels, thetas):
        grad_phi = np.zeros_like(self.phi)
        grad_b_w = np.zeros_like(self.b_w)
        grad_psi = np.zeros_like(self.psi)
        grad_b_c = 0.0

        for i, doc in enumerate(corpus):
            theta = thetas[i]
            logits = theta @ self.phi.T + self.b_w
            probs = softmax(logits)

            for w in range(self.vocab_size):
                count = doc.count(w)
                if count > 0:
                    grad_phi[w] += count * (theta - (probs @ theta))
                    grad_b_w[w] += count * (1 - probs[w])

            word_vecs = self.phi[doc]
            logits = word_vecs @ self.psi + self.b_c
            probs = sigmoid(logits)
            error = (sentiment_labels[i] - probs)
            grad_phi_doc = error[:, None] * self.psi[None, :]
            np.add.at(grad_phi, doc, grad_phi_doc)
            grad_psi += np.sum(error[:, None] * word_vecs, axis=0)
            grad_b_c += np.sum(error)

        grad_phi -= self.alpha * self.phi
        grad_psi -= self.alpha * self.psi
        return grad_phi, grad_b_w, grad_psi, grad_b_c

# === Entraînement ===
if __name__ == "__main__":
    from sklearn.metrics import accuracy_score

    # Import corpus and labels
    model = SentimentWordVectorModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
    model.fit(corpus_train, labels_train, n_iters=5)

    # Évaluer le modèle avec une régression logistique sur les représentations de documents
    X_train = model.thetas
    y_train = np.array(labels_train)
    X_test = np.array([np.mean(model.phi[doc], axis=0) for doc in corpus_test])
    y_test = np.array(labels_test)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {acc:.4f}")

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 50 is different from 500)