In [2]:
from gensim import corpora
from gensim.models import LdaModel

# Sample corpus (list of documents)
documents = ["cat eats food", "dog eats food", "cat and dog are friends"]

# Tokenizing and creating a dictionary
texts = [[word for word in doc.split()] for doc in documents]
dictionary = corpora.Dictionary(texts)

# Creating a corpus
corpus = [dictionary.doc2bow(text) for text in texts]

# Building the LDA model
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

# Print topics
topics = lda_model.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.145*"food" + 0.145*"eats" + 0.143*"cat"')
(1, '0.173*"dog" + 0.173*"cat" + 0.172*"eats"')


In [None]:
import numpy as np
import pandas as pd
import gensim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from datasets import load_dataset

nltk.download('punkt')

# Charger l'ensemble de données IMDB (exemple simplifié)
dataset = load_dataset("imdb")

# Récupérer les données
texts = dataset['train']['text'] + dataset['test']['text']
labels = dataset['train']['label'] + dataset['test']['label']

# Tokenisation des textes
tokenized_texts = [word_tokenize(text.lower()) for text in texts]

# Séparer en jeu d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(tokenized_texts, labels, test_size=0.2, random_state=42)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\courn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [2]:
# Entraînement du modèle Word2Vec
word2vec_model = gensim.models.Word2Vec(sentences=X_train, vector_size=50, window=5, min_count=5, workers=4)

# Fonction pour obtenir la moyenne des vecteurs de mots d'un texte
def get_avg_vector(text, model, vector_size=50):
    vectors = [model.wv[word] for word in text if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

# Transformation des textes en vecteurs
X_train_vectors = np.array([get_avg_vector(text, word2vec_model, 50) for text in X_train])
X_test_vectors = np.array([get_avg_vector(text, word2vec_model, 50) for text in X_test])

In [4]:
# Entraîner un classificateur de sentiment (régression logistique)
classifier = LogisticRegression()
classifier.fit(X_train_vectors, y_train)

# Prédictions
y_pred = classifier.predict(X_test_vectors)

# Évaluer les performances
accuracy = accuracy_score(y_test, y_pred)
print(f"Exactitude du modèle: {accuracy:.4f}")

Exactitude du modèle: 0.8098


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim

# Conversion des données en tenseurs PyTorch
X_train_tensor = torch.tensor(X_train_vectors, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_vectors, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Définition du modèle de classification
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim):
        super(SentimentClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 100)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(100, 2)  # 2 classes (positif/négatif)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.softmax(x)

# Initialisation du modèle
model = SentimentClassifier(input_dim=50)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Entraînement du modèle
num_epochs = 1000
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Perte: {loss.item():.4f}")

# Évaluation
with torch.no_grad():
    y_pred_test = model(X_test_tensor).argmax(dim=1)
    accuracy = (y_pred_test == y_test_tensor).float().mean()
    print(f"Exactitude du modèle PyTorch: {accuracy:.4f}")

Epoch [1/1000], Perte: 0.6960
Epoch [2/1000], Perte: 0.6931
Epoch [3/1000], Perte: 0.6906
Epoch [4/1000], Perte: 0.6884
Epoch [5/1000], Perte: 0.6863
Epoch [6/1000], Perte: 0.6841
Epoch [7/1000], Perte: 0.6819
Epoch [8/1000], Perte: 0.6797
Epoch [9/1000], Perte: 0.6773
Epoch [10/1000], Perte: 0.6749
Epoch [11/1000], Perte: 0.6725
Epoch [12/1000], Perte: 0.6701
Epoch [13/1000], Perte: 0.6676
Epoch [14/1000], Perte: 0.6651
Epoch [15/1000], Perte: 0.6625
Epoch [16/1000], Perte: 0.6598
Epoch [17/1000], Perte: 0.6571
Epoch [18/1000], Perte: 0.6543
Epoch [19/1000], Perte: 0.6514
Epoch [20/1000], Perte: 0.6485
Epoch [21/1000], Perte: 0.6455
Epoch [22/1000], Perte: 0.6425
Epoch [23/1000], Perte: 0.6394
Epoch [24/1000], Perte: 0.6362
Epoch [25/1000], Perte: 0.6329
Epoch [26/1000], Perte: 0.6295
Epoch [27/1000], Perte: 0.6261
Epoch [28/1000], Perte: 0.6227
Epoch [29/1000], Perte: 0.6191
Epoch [30/1000], Perte: 0.6155
Epoch [31/1000], Perte: 0.6119
Epoch [32/1000], Perte: 0.6082
Epoch [33/1000], 