# Component 1: Document Embedding with Word2Vec

This component transforms raw movie documents into dense vector representations using Word2Vec embeddings.

**Pipeline:**
1. Load movie datasets (1970s-2020s)
2. Tokenize documents (title + plot)
3. Train Word2Vec model on tokenized corpus
4. Compute document embeddings (mean of token vectors)
5. L2-normalize vectors for cosine similarity
6. Save embeddings and metadata for downstream components

In [1]:
"""
Component 1: Document embeddings with Word2Vec

- Load movie plots
- Tokenize (title + plot)
- Train Word2Vec
- Compute mean-pooled doc embeddings
- L2-normalize
- Save to Data/processed for Components 2–4
"""

# Setup: Imports, warnings, and reproducibility
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
from collections import Counter
from numpy.linalg import norm

# Directory paths (consistent across all components).
# Both are relative paths, so they resolve against the current working
# directory of the notebook kernel.
DATA_DIR = "Data"
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")

# Seed NumPy's global RNG for reproducibility. RANDOM_SEED is also the seed
# handed to Word2Vec training in Step 3.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Add the Components directory (relative to the working directory) to the
# import path — presumably where the local Tokenizer module lives; confirm.
sys.path.append('Components')

# Download NLTK resources (required for tokenization)
import nltk

# Make sure the NLTK resources needed for tokenization are available locally.
# Each entry pairs the path probed with nltk.data.find() with the package
# name passed to nltk.download() when that probe raises LookupError:
#   - punkt_tab : required for NLTK 3.8+
#   - punkt     : fallback for older NLTK versions
#   - stopwords : stopword lists
for resource_path, package_name in (
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

# Import Tokenizer and Word2Vec
from Tokenizer import tokenize
from gensim.models import Word2Vec

print("✓ Imports loaded | Seed set to", RANDOM_SEED)
print(f"✓ Data directories: {DATA_DIR}/, {PROCESSED_DIR}/")



✓ Imports loaded | Seed set to 42
✓ Data directories: Data/, Data/processed/


## Helper Functions

In [2]:
def load_movie_datasets(data_dir=DATA_DIR):
    """
    Read the per-decade movie CSV files and stack them into one DataFrame.

    One file named '<decade>-movies.csv' is read from `data_dir` for each
    decade from the 1970s through the 2020s, and every row is tagged with
    the decade of the file it came from.

    Args:
        data_dir: Directory containing the CSV files

    Returns:
        DataFrame with the rows of all files (fresh 0..n-1 index) and an
        added 'decade' column
    """
    decade_labels = ('1970s', '1980s', '1990s', '2000s', '2010s', '2020s')

    def read_decade(label):
        # Load one decade's CSV and remember which decade it belongs to.
        frame = pd.read_csv(os.path.join(data_dir, f'{label}-movies.csv'))
        frame['decade'] = label
        return frame

    per_decade = [read_decade(label) for label in decade_labels]
    return pd.concat(per_decade, ignore_index=True)


def tokenize_documents(df, text_columns=('title', 'plot')):
    """
    Tokenize documents by combining specified columns.

    For every row, the values of `text_columns` are joined with single
    spaces (in the given order) and passed to `tokenize` with stopword
    removal and stemming enabled. Missing cells (None / NaN / NA) are
    skipped, so they cannot leak into the text as the literal string "nan".

    Args:
        df: DataFrame with movie data (left untouched; a copy is returned)
        text_columns: Sequence of column names to combine for tokenization

    Returns:
        Copy of `df` with an added 'tokens' column holding the tokenizer
        output for each row
    """
    columns = list(text_columns)

    def is_missing(value):
        # Only scalar cells can be "missing"; list-like cells are kept as-is.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    def combine_and_tokenize(row):
        text = ' '.join(
            str(row[col]) for col in columns if not is_missing(row[col])
        )
        return tokenize(text, remove_stopwords=True, apply_stemming=True)

    df = df.copy()
    # Build the column explicitly (instead of df.apply(axis=1)) so an empty
    # frame simply yields an empty 'tokens' column, and so pandas never tries
    # to expand the per-row token lists into several columns.
    token_lists = [combine_and_tokenize(row) for _, row in df.iterrows()]
    df['tokens'] = pd.Series(token_lists, index=df.index, dtype=object)
    return df


def train_word2vec(sentences, vector_size=200, window=5, min_count=5, 
                   workers=4, sg=1, seed=42):
    """
    Fit a gensim Word2Vec model on an already-tokenized corpus.

    Args:
        sentences: List of token lists (one list per document)
        vector_size: Embedding dimension
        window: Context window size
        min_count: Minimum token frequency
        workers: Number of worker threads
        sg: Skip-gram (1) or CBOW (0)
        seed: Random seed passed through to Word2Vec

    Returns:
        Trained Word2Vec model

    Note:
        According to the gensim documentation, `seed` alone does not make
        training fully deterministic: that additionally requires workers=1
        (and a fixed PYTHONHASHSEED). With several workers, results may
        differ slightly between runs.
    """
    hyperparameters = {
        'vector_size': vector_size,
        'window': window,
        'min_count': min_count,
        'workers': workers,
        'sg': sg,
        'seed': seed,
    }
    # Passing `sentences` to the constructor trains the model immediately.
    return Word2Vec(sentences=sentences, **hyperparameters)


def compute_document_embedding(tokens, model, vector_size):
    """
    Mean-pool the word vectors of a document's in-vocabulary tokens.

    Tokens that are not in the model vocabulary are ignored. A document
    without any known token is represented by the all-zero vector.

    Args:
        tokens: List of token strings
        model: Trained Word2Vec model
        vector_size: Embedding dimension (length of the zero fallback vector)

    Returns:
        1D float32 numpy array of shape (vector_size,)
    """
    keyed_vectors = model.wv
    vocabulary = keyed_vectors.key_to_index
    known_vectors = [keyed_vectors[tok] for tok in tokens if tok in vocabulary]

    if known_vectors:
        pooled = np.vstack(known_vectors).mean(axis=0)
        return pooled.astype(np.float32)

    # No token is covered by the vocabulary: fall back to the zero vector.
    return np.zeros(vector_size, dtype=np.float32)


def compute_all_document_embeddings(df, model, vector_size):
    """
    Compute embeddings for all documents.

    Each entry of the 'tokens' column is embedded with
    `compute_document_embedding`, and the resulting vectors are stacked
    row-wise in the order of `df`.

    Args:
        df: DataFrame with 'tokens' column
        model: Trained Word2Vec model
        vector_size: Embedding dimension

    Returns:
        Document matrix of shape (n_docs, vector_size); an empty
        (0, vector_size) float32 matrix when `df` has no rows
    """
    token_lists = df['tokens']

    # np.vstack raises on an empty sequence, so handle "no documents" up front.
    if len(token_lists) == 0:
        return np.empty((0, vector_size), dtype=np.float32)

    return np.vstack([
        compute_document_embedding(tokens, model, vector_size)
        for tokens in token_lists
    ])


def normalize_vectors(doc_matrix):
    """
    Scale every row of the document matrix to unit L2 length.

    Rows whose norm is exactly zero are divided by 1 instead, so they stay
    all-zero rather than producing a division by zero.

    Args:
        doc_matrix: Document matrix of shape (n_docs, dim)

    Returns:
        Normalized document matrix (a new array; the input is not modified)
    """
    row_lengths = np.linalg.norm(doc_matrix, axis=1, keepdims=True)
    # Replace zero lengths in place so the division below is always defined.
    np.putmask(row_lengths, row_lengths == 0.0, 1.0)
    return doc_matrix / row_lengths


def save_embeddings(doc_matrix, df, output_dir=PROCESSED_DIR):
    """
    Write the document matrix and its metadata table to `output_dir`.

    Two files are produced (the directory is created if needed):
      - 'doc_vectors_w2v.npy': the document matrix, saved with np.save
      - 'doc_metadata.csv': columns doc_id, title, decade, where doc_id is
        the 0-based row position of the document in `df`

    Args:
        doc_matrix: Document matrix
        df: DataFrame with metadata (must contain 'title' and 'decade')
        output_dir: Output directory path

    Returns:
        Tuple of (vector_path, metadata_path)
    """
    os.makedirs(output_dir, exist_ok=True)
    vector_path = os.path.join(output_dir, 'doc_vectors_w2v.npy')
    metadata_path = os.path.join(output_dir, 'doc_metadata.csv')

    # doc_id is the row position after resetting the index of a copy.
    metadata = df.copy().reset_index(drop=True)
    metadata['doc_id'] = metadata.index

    # Vectors first, then the metadata that describes them.
    np.save(vector_path, doc_matrix)
    metadata.loc[:, ['doc_id', 'title', 'decade']].to_csv(metadata_path, index=False)

    return vector_path, metadata_path


def show_similar_movies(target_idx, doc_matrix, df, top_k=5):
    """
    Print the query movie followed by its top-k nearest neighbours.

    Similarity is the dot product between rows of `doc_matrix`, which
    equals cosine similarity when the rows are L2-normalized.

    Args:
        target_idx: Index of the query movie
        doc_matrix: Normalized document matrix
        df: DataFrame with movie metadata ('title' and 'decade' columns)
        top_k: Number of similar movies to print
    """
    similarities = doc_matrix @ doc_matrix[target_idx]

    # Best matches first; the query itself is removed before truncating.
    order = np.argsort(-similarities)
    neighbours = order[order != target_idx][:top_k]

    query = df.iloc[target_idx]
    print(f"Query movie [{target_idx}]: {query['title']} ({query['decade']})\n")
    print("Most similar movies:")
    for rank, idx in enumerate(neighbours, start=1):
        hit = df.iloc[idx]
        print(f"  {rank}. {hit['title']} ({hit['decade']})  |  sim = {similarities[idx]:.3f}")

print("Helper functions defined")


Helper functions defined


## Component 1 – Design Notes

**Tokenization.**  
We use a custom tokenizer that lowercases text, removes stopwords and applies stemming (Porter). In classical IR this is a standard preprocessing pipeline to reduce sparsity and group morphological variants into a single term, which improves generalization for similarity search (Manning et al., *Introduction to Information Retrieval*, Ch. 2–3).

**Dense document representations (Word2Vec).**  
Instead of sparse TF–IDF vectors or LSI, we represent each document as the **average of its Word2Vec word embeddings**. Distributional word vectors (Mikolov et al., 2013) capture semantic similarity (synonyms, paraphrases), which is useful for "similar movie plots" retrieval: two plots can be close even if they do not share exactly the same words.

**Why average pooling?**  
Average pooling over word vectors is a simple, fast and surprisingly strong baseline for document embeddings. It keeps the representation size fixed (here 200-D), is easy to store and index, and works well in practice for downstream similarity tasks. Heavier models (e.g. doc2vec, transformers) would be overkill for this assignment.

**Hyperparameters for Word2Vec.**  
We set `vector_size=200`, `window=5`, `min_count=5`, `sg=1` (skip-gram).  
- 200 dimensions are a common trade-off between expressiveness and memory/runtime.  
- Window size 5 focuses on local context, which is appropriate for narrative movie plots.  
- `min_count=5` filters out extremely rare words that would just add noise.  
- Skip-gram (`sg=1`) typically works better than CBOW on smaller datasets and for rare words, which is helpful with ~18k movie plots.

These choices are in line with typical Word2Vec setups reported in the literature (Mikolov et al., 2013).

**Cosine similarity and L2-normalization.**  
We L2-normalize all document vectors so that cosine similarity reduces to a dot product. Cosine similarity is standard for comparing embedding vectors, because it focuses on direction instead of magnitude and is more robust to document length differences (common IR practice).

---

## Main Pipeline


In [3]:
# Step 1: Load datasets
print("Step 1: Loading movie datasets...")
all_movies = load_movie_datasets(data_dir=DATA_DIR)
print(f"  Loaded {len(all_movies)} movies")

# Number of movies per decade, in chronological (sorted-label) order.
# The counts are converted to plain ints so the dict prints as
# {'1970s': 1770, ...} instead of showing NumPy scalar reprs such as
# np.int64(1770).
decade_counts = all_movies['decade'].value_counts().sort_index()
decade_distribution = {decade: int(count) for decade, count in decade_counts.items()}
print(f"  Distribution: {decade_distribution}")

Step 1: Loading movie datasets...
  Loaded 17830 movies
  Distribution: {'1970s': np.int64(1770), '1980s': np.int64(2338), '1990s': np.int64(3105), '2000s': np.int64(4416), '2010s': np.int64(4960), '2020s': np.int64(1241)}


In [4]:
# Step 2: Tokenize documents
# Classic IR preprocessing (lowercase + stopword removal + stemming)
# → reduces sparsity and makes the later embedding model easier to train
print("\nStep 2: Tokenizing documents...")
all_movies = tokenize_documents(all_movies, text_columns=['title', 'plot'])

# Corpus-level statistics: overall token count and distinct token types.
token_lists = all_movies['tokens']
total_tokens = sum(map(len, token_lists))
vocabulary = set().union(*token_lists)

print(f"✓ Tokenized {len(all_movies)} documents")
print(f"  Total tokens: {total_tokens:,}")
print(f"  Vocabulary size: {len(vocabulary):,} unique tokens")



Step 2: Tokenizing documents...
✓ Tokenized 17830 documents
  Total tokens: 4,577,616
  Vocabulary size: 65,429 unique tokens


## Word2Vec Training

In [5]:
# Step 3: Train Word2Vec model
# 200-D skip-gram is a common, strong baseline for semantic similarity
print("\nStep 3: Training Word2Vec model...")

# Hyperparameters
VECTOR_SIZE = 200
WINDOW = 5
MIN_COUNT = 5
WORKERS = 4
SG = 1  # Skip-gram

# One "sentence" per movie: its full token list (title + plot).
sentences = all_movies['tokens'].tolist()
w2v_model = train_word2vec(
    sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=WORKERS,
    sg=SG,
    seed=RANDOM_SEED,
)

print("✓ Word2Vec model trained")
print(f"  Vocabulary size: {len(w2v_model.wv.key_to_index):,} tokens")
print(f"  Embedding dimension: {VECTOR_SIZE}")

# Persist the trained model for reproducibility and future use.
# NOTE(review): this is written with the model's own save() method, so it is
# presumably meant to be reloaded with Word2Vec.load() — confirm downstream.
os.makedirs(PROCESSED_DIR, exist_ok=True)
model_path = os.path.join(PROCESSED_DIR, 'w2v_model.bin')
w2v_model.save(model_path)
print(f"✓ Saved Word2Vec model: {model_path}")


Step 3: Training Word2Vec model...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

✓ Word2Vec model trained
  Vocabulary size: 29,086 tokens
  Embedding dimension: 200
✓ Saved Word2Vec model: Data/processed/w2v_model.bin


In [None]:
# Step 4: Compute document embeddings
# Simple but effective document embedding: average the word vectors of all
# in-vocabulary tokens of each movie (title + plot)
print("\nStep 4: Computing document embeddings...")
doc_matrix = compute_all_document_embeddings(all_movies, w2v_model, VECTOR_SIZE)
# Completion marker restored so this step reports like the other steps.
print(f"✓ Document embeddings computed: shape {doc_matrix.shape}")


Step 4: Computing document embeddings...
✓ Document embeddings computed: shape (17830, 200)


In [None]:
# Step 5: L2-normalize vectors
# With unit-length rows, cosine similarity later reduces to a plain dot product
print("\nStep 5: L2-normalizing vectors for cosine similarity...")
doc_matrix_normalized = normalize_vectors(doc_matrix)
print("  Normalization complete")

# Sanity check on the first document: its norm should become 1 after scaling.
norm_before = norm(doc_matrix[0])
norm_after = norm(doc_matrix_normalized[0])
print(f"  Example: norm before={norm_before:.4f}, after={norm_after:.4f}")



Step 5: L2-normalizing vectors for cosine similarity...
Normalization complete
  Example: norm before=1.5255, after=1.0000


### Dataset Statistics


In [8]:
# Dataset overview
print("Dataset Overview:")
print(f"  Total documents: {len(all_movies)}")
print(f"  Columns: {list(all_movies.columns)}")
print(f"  Missing values: {all_movies.isnull().sum().sum()}")

# Plot length statistics (length = number of characters in the 'plot' text)
all_movies['plot_length'] = all_movies['plot'].str.len()
plot_lengths = all_movies['plot_length']
print("\nPlot length statistics:")
for label, value in (
    ("Mean", plot_lengths.mean()),
    ("Median", plot_lengths.median()),
    ("Min", plot_lengths.min()),
    ("Max", plot_lengths.max()),
):
    print(f"  {label}: {value:.0f} chars")


Dataset Overview:
  Total documents: 17830
  Columns: ['title', 'image', 'plot', 'decade', 'tokens']
  Missing values: 0

Plot length statistics:
  Mean: 2700 chars
  Median: 2867 chars
  Min: 3 chars
  Max: 66145 chars


### Tokenization Statistics


In [9]:
# Token count per document
all_movies['token_count'] = all_movies['tokens'].apply(len)
token_counts = all_movies['token_count']
print("Token count per document:")
print(f"  Mean: {token_counts.mean():.1f} tokens")
print(f"  Median: {token_counts.median():.1f} tokens")
print(f"  Min: {token_counts.min()} tokens")
print(f"  Max: {token_counts.max()} tokens")

# Most frequent tokens across the whole corpus
all_tokens_flat = []
for doc_tokens in all_movies['tokens']:
    all_tokens_flat.extend(doc_tokens)
token_freq = Counter(all_tokens_flat)
print("\nTop 10 most frequent tokens:")
for token, count in token_freq.most_common(10):
    print(f"  {token:20s}: {count:6,} occurrences")


Token count per document:
  Mean: 256.7 tokens
  Median: 271.0 tokens
  Min: 4 tokens
  Max: 6243 tokens

Top 10 most frequent tokens:
  kill                : 24,252 occurrences
  find                : 22,815 occurrences
  film                : 19,636 occurrences
  take                : 18,517 occurrences
  tell                : 17,799 occurrences
  one                 : 17,778 occurrences
  get                 : 17,059 occurrences
  leav                : 16,972 occurrences
  back                : 14,523 occurrences
  return              : 13,432 occurrences


### Sample Documents


In [10]:
# Show the first three tokenized documents (title, decade, leading tokens)
print("Sample tokenized documents:")
for i in range(3):
    movie = all_movies.iloc[i]
    sample_tokens = movie['tokens']
    number = i + 1
    print(f"\n{number}. {movie['title']} ({movie['decade']})")
    # Only the first 15 tokens are shown to keep the output short.
    print(f"   Tokens ({len(sample_tokens)}): {sample_tokens[:15]}...")


Sample tokenized documents:

1. 'Gator Bait (1970s)
   Tokens (59): ['bait', 'film', 'follow', 'barefoot', 'poacher', 'name', 'desire', 'thibodeau', 'live', 'deep', 'swampland', 'ben', 'bracken', 'deputi', 'billi']...

2. ...And Justice for All (film) (1970s)
   Tokens (483): ['justic', 'film', 'arthur', 'kirkland', 'baltimor', 'defens', 'attorney', 'jail', 'contempt', 'court', 'charg', 'punch', 'judg', 'henri', 'fleme']...

3. 10 (1979 film) (1970s)
   Tokens (284): ['10', '1979', 'film', 'surpris', '42nd', 'birthday', 'parti', 'wealthi', 'famou', 'compos', 'georg', 'webber', 'thrown', 'actress', 'girlfriend']...


In [11]:
# Test similarity search on one example query (the first movie).
# The ranking uses cosine *similarity* (higher = more similar), so the
# message names it as such rather than as a distance.
print("Testing semantic similarity with cosine similarity:\n")
show_similar_movies(0, doc_matrix_normalized, all_movies, top_k=5)


Testing semantic similarity with cosine distance:

Query movie [0]: 'Gator Bait (1970s)

Most similar movies:
  1. Kill or Be Killed (2015 film) (2010s)  |  sim = 0.953
  2. The Tripper (2000s)  |  sim = 0.949
  3. Bloody Mama (1970s)  |  sim = 0.949
  4. Lake Dead (2000s)  |  sim = 0.948
  5. Sin City (film) (2000s)  |  sim = 0.948
