# Document Embeddings (Word2Vec Average)

Objectif : créer une représentation vectorielle de taille fixe pour chaque document (review) en moyennant les vecteurs Word2Vec de ses mots.

Partie de la story **SAE-78**.

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from tqdm import tqdm

# Enable tqdm for pandas: registers the progress-bar variants of the pandas
# apply-style methods (e.g. `progress_apply`).
# NOTE(review): no `progress_*` call is visible in this notebook; the progress
# bar shown later comes from wrapping the iterable with `tqdm(...)` directly.
tqdm.pandas()

## Chargement des Données et du Modèle

In [2]:
# Inputs produced by earlier notebooks of the pipeline (paths are relative to
# this notebook's location).
preprocessed_path = '../../outputs/reviews_preprocessed.pkl'
model_path = '../../outputs/models/word2vec_yelp.model'

# 1. Load Data
if os.path.exists(preprocessed_path):
    print(f"Loading data from {preprocessed_path}...")
    # NOTE(security): unpickling can execute arbitrary code; only load pickle
    # files that were produced by this project.
    reviews = pd.read_pickle(preprocessed_path)
    print(f"Loaded {len(reviews)} reviews.")
else:
    print("Error: Preprocessed data not found. Please run SAE-74 notebook first.")
    # Deliberate best-effort fallback: a tiny placeholder frame so the
    # remaining cells can still be run as a structure check.
    reviews = pd.DataFrame({'tokens_final': [['good', 'food'], ['bad']]})

# Fail early with an explicit message: the embedding cell reads this column,
# and a missing column would otherwise surface later as an opaque KeyError.
if 'tokens_final' not in reviews:
    raise KeyError(
        f"Column 'tokens_final' not found in data loaded from {preprocessed_path}."
    )

# 2. Load Model
if os.path.exists(model_path):
    print(f"Loading model from {model_path}...")
    model = Word2Vec.load(model_path)
    print(f"Model loaded. Vector size: {model.vector_size}")
else:
    print("Error: Model not found. Please run SAE-77 notebook first.")
    # Dummy model for structure check. Its vocabulary only contains 'test',
    # so documents embedded with it will generally map to the zero vector.
    # (Word2Vec is already imported in the import cell; no re-import needed.)
    model = Word2Vec(sentences=[['test']], vector_size=100, min_count=1)

Loading data from ../../outputs/reviews_preprocessed.pkl...
Loaded 2000 reviews.
Loading model from ../../outputs/models/word2vec_yelp.model...
Model loaded. Vector size: 100


## Calcul des Document Vectors

In [3]:
def document_vector_mean(tokens, model):
    """
    Calculate the mean of word vectors for a document.

    Tokens that are not in the model's vocabulary are ignored. When no token
    of the document is known to the model, a zero vector is returned instead.

    Args:
        tokens (list): List of tokens in the document.
        model (Word2Vec): Trained Word2Vec model.

    Returns:
        numpy.array: Mean vector (size: model.vector_size). The dtype is the
        same on both paths (the dtype of the model's embedding matrix), so
        stacking many document vectors never silently upcasts the result.
    """
    keyed_vectors = model.wv

    # Keep only the vectors of tokens present in the model's vocabulary
    vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]

    if not vectors:
        # No known word: return a zero vector. Use the embedding dtype rather
        # than numpy's float64 default so that this fallback matches the
        # dtype returned by the averaging path below.
        return np.zeros(model.vector_size, dtype=keyed_vectors.vectors.dtype)

    # Element-wise average across the document's word vectors
    return np.mean(vectors, axis=0)

print("Calculating document embeddings...")
# Embed every review (one vector per entry of `tokens_final`), with a progress bar
embedded_docs = [
    document_vector_mean(review_tokens, model)
    for review_tokens in tqdm(reviews['tokens_final'], desc="Computing vectors")
]
# Stack the per-document vectors into a single array
doc_vectors = np.array(embedded_docs)

print(f"\nDocument Embeddings Shape: {doc_vectors.shape}")

Calculating document embeddings...


Computing vectors:   0%|          | 0/2000 [00:00<?, ?it/s]

Computing vectors:  46%|████▌     | 923/2000 [00:00<00:00, 9203.06it/s]

Computing vectors:  92%|█████████▏| 1844/2000 [00:00<00:00, 9108.44it/s]

Computing vectors: 100%|██████████| 2000/2000 [00:00<00:00, 9132.46it/s]


Document Embeddings Shape: (2000, 100)





## Vérification

In [4]:
# Peek at the leading components of the first document embedding
first_vector = doc_vectors[0]
print("First document vector sample:", first_vector[:10])

# Count rows whose components are all exactly zero
# (the value returned for documents with no known words)
is_zero_row = (doc_vectors == 0).all(axis=1)
zero_vectors_count = is_zero_row.sum()
print(f"Number of zero vectors: {zero_vectors_count} / {len(doc_vectors)}")

First document vector sample: [-0.02792758  0.10660151  0.00516811 -0.02206274 -0.00614953 -0.31578484
  0.08371179  0.2872118  -0.20687567 -0.08231399]
Number of zero vectors: 0 / 2000


## Sauvegarde

In [5]:
# Destination of the embedding matrix (NumPy .npy format)
output_dir = '../../outputs'
output_path = os.path.join(output_dir, 'doc_embeddings_w2v.npy')

# Create the output directory if it does not exist yet, then write the array
os.makedirs(output_dir, exist_ok=True)
np.save(output_path, doc_vectors)

print(f"Document embeddings saved to {output_path}")

Document embeddings saved to ../../outputs\doc_embeddings_w2v.npy
