# BBC Text Representations - Dense Methods

**Roll Number:** SE22UARI195

**Tasks:**
1. Word2Vec Skip-gram with Negative Sampling (NS)
2. Word2Vec CBOW with Negative Sampling (NS)
3. Word2Vec Skip-gram with Hierarchical Softmax (HS)
4. Word2Vec CBOW with Hierarchical Softmax (HS)
5. GloVe (pretrained 100d)
6. TF-IDF weighted pooling for document vectors
7. Compare training speed and quality

---

## 1. Setup & Load Preprocessed Data

In [1]:
# Core libraries
import pandas as pd
import numpy as np
import pickle
import time
from pathlib import Path
from collections import Counter

# Gensim for Word2Vec
from gensim.models import Word2Vec

# Sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Progress bar
from tqdm.notebook import tqdm

print("‚úÖ Imports successful!")

‚úÖ Imports successful!


In [2]:
# Notebook-wide settings: the roll number plus the three project-relative folders
# that the cells below read from and write to.
ROLL = "SE22UARI195"
CACHE_DIR, MODELS_DIR, DATA_DIR = map(Path, ("../cache", "../models", "../data"))

# Echo the configuration so it is visible in the saved notebook output.
print(
    f"Roll Number: {ROLL}",
    f"Cache Directory: {CACHE_DIR}",
    f"Models Directory: {MODELS_DIR}",
    f"Data Directory: {DATA_DIR}",
    sep="\n",
)

Roll Number: SE22UARI195
Cache Directory: ../cache
Models Directory: ../models
Data Directory: ../data


In [3]:
# Load preprocessed data
print("üìÇ Loading preprocessed data...\n")

# Unpickle the three data splits and the vocabulary counter from CACHE_DIR.
# NOTE(review): pickle.load can execute code embedded in the file — only load
# cache files that you generated yourself.
train_pickle = CACHE_DIR / 'train_processed.pkl'
with train_pickle.open('rb') as handle:
    train_df = pickle.load(handle)
print(f"‚úÖ TRAIN: {len(train_df)} documents")

dev_pickle = CACHE_DIR / 'dev_processed.pkl'
with dev_pickle.open('rb') as handle:
    dev_df = pickle.load(handle)
print(f"‚úÖ DEV: {len(dev_df)} documents")

test_pickle = CACHE_DIR / 'test_processed.pkl'
with test_pickle.open('rb') as handle:
    test_df = pickle.load(handle)
print(f"‚úÖ TEST: {len(test_df)} documents")

vocab_pickle = CACHE_DIR / 'vocab_counter.pkl'
with vocab_pickle.open('rb') as handle:
    vocab_counter = pickle.load(handle)
print(f"‚úÖ Vocabulary: {len(vocab_counter):,} unique tokens")

üìÇ Loading preprocessed data...

‚úÖ TRAIN: 1335 documents
‚úÖ DEV: 445 documents
‚úÖ TEST: 445 documents
‚úÖ Vocabulary: 20,404 unique tokens


In [4]:
# The fitted TF-IDF vectorizer supplies the per-token weights used by the
# weighted pooling helper further down.
# NOTE(review): pickle.load can execute code embedded in the file — only load
# model files that you generated yourself.
vectorizer_pickle = MODELS_DIR / 'tfidf_vectorizer.pkl'
with vectorizer_pickle.open('rb') as handle:
    tfidf_vectorizer = pickle.load(handle)

print(f"‚úÖ TF-IDF vectorizer loaded")
tfidf_vocab_size = len(tfidf_vectorizer.vocabulary_)
print(f"   Vocabulary size: {tfidf_vocab_size:,}")

‚úÖ TF-IDF vectorizer loaded
   Vocabulary size: 11,515


In [5]:
# Extract the 'tokens' column of each split as a plain Python list
# (one entry per document), in train / dev / test order.
train_sentences, dev_sentences, test_sentences = [
    frame['tokens'].tolist() for frame in (train_df, dev_df, test_df)
]

print("\nüìä Tokenized sentences:")
print(f"  TRAIN: {len(train_sentences)} documents")
print(f"  DEV: {len(dev_sentences)} documents")
print(f"  TEST: {len(test_sentences)} documents")
print(f"\n  Sample: {train_sentences[0][:15]}...")


üìä Tokenized sentences:
  TRAIN: 1335 documents
  DEV: 445 documents
  TEST: 445 documents

  Sample: ['worldcom', 'bos', 'left', 'book', 'alone', 'former', 'worldcom', 'bos', 'bernie', 'ebbers', 'accused', 'overseeing', '11bn', '8bn', 'fraud']...


## 2. Helper Functions

In [6]:
def tfidf_weighted_pooling(tokens_list, word_vectors, tfidf_vec, tfidf_vocab):
    """
    Convert token lists to document vectors using a TF-IDF weighted mean.

    For each document the tokens are joined with spaces and passed through
    ``tfidf_vec.transform`` to obtain that document's TF-IDF row. Every
    *distinct* token that has both a TF-IDF column (``tfidf_vocab``) and a
    word vector contributes ``weight * vector`` exactly once, and the sum is
    divided by the total weight used. Iterating over distinct tokens matters:
    the TF-IDF weight already reflects how often the token occurs, so adding
    it once per occurrence would count term frequency twice.

    Args:
        tokens_list: Iterable of token lists, one per document.
        word_vectors: Word embedding model exposing a ``.wv`` attribute
            (Word2Vec) or a dict mapping token -> vector (GloVe).
        tfidf_vec: Fitted vectorizer; ``transform([text])`` must return a
            matrix that supports ``toarray()``.
        tfidf_vocab: Mapping token -> column index in the TF-IDF matrix.

    Returns:
        numpy array of shape (n_docs, embedding_dim). Empty documents, and
        documents whose tokens carry no usable weight, map to the zero vector.
    """
    # Resolve the token -> vector container and the embedding dimension.
    if hasattr(word_vectors, 'wv'):
        vocab = word_vectors.wv
        vector_size = vocab.vector_size
    else:
        vocab = word_vectors
        vector_size = len(next(iter(word_vectors.values())))

    # Prefer get_vector() when the container offers it; otherwise index directly.
    lookup = vocab.get_vector if hasattr(vocab, 'get_vector') else vocab.__getitem__

    doc_vectors = []

    for tokens in tqdm(tokens_list, desc="Pooling"):
        if len(tokens) == 0:
            # Empty document - use zero vector
            doc_vectors.append(np.zeros(vector_size))
            continue

        # TF-IDF row for this single document.
        tfidf_row = tfidf_vec.transform([' '.join(tokens)]).toarray()[0]

        weighted_sum = np.zeros(vector_size)
        total_weight = 0.0

        # dict.fromkeys de-duplicates while keeping first-seen order.
        for token in dict.fromkeys(tokens):
            tfidf_idx = tfidf_vocab.get(token)
            if tfidf_idx is None or token not in vocab:
                continue
            tfidf_weight = tfidf_row[tfidf_idx]
            weighted_sum += tfidf_weight * lookup(token)
            total_weight += tfidf_weight

        if total_weight > 0:
            doc_vectors.append(weighted_sum / total_weight)
        else:
            # No token had both a weight and a vector - use zero vector
            doc_vectors.append(np.zeros(vector_size))

    # reshape keeps the (n_docs, dim) contract even when there are no documents.
    return np.array(doc_vectors, dtype=float).reshape(-1, vector_size)

print("‚úÖ Helper functions defined!")

‚úÖ Helper functions defined!


## 3. Word2Vec - Skip-gram with Negative Sampling (NS)

In [7]:
print("\nüîß Training Word2Vec Skip-gram with Negative Sampling...\n")

# Corpus size in tokens; the later training cells reuse it for their own
# throughput figures.
total_tokens_train = sum(map(len, train_sentences))
print(f"Total training tokens: {total_tokens_train:,}")

# Skip-gram + negative-sampling hyper-parameters, gathered in one place.
sg_ns_config = {
    'sg': 1,            # Skip-gram
    'vector_size': 100,
    'window': 5,
    'min_count': 3,
    'negative': 5,      # Negative sampling with k=5
    'hs': 0,            # No hierarchical softmax
    'epochs': 10,
    'workers': 4,
    'seed': 42,
}

# Time the constructor call end to end.
# NOTE(review): the throughput below divides one pass over the corpus by the
# wall time of the whole call (epochs=10) — confirm that is the intended definition.
# NOTE(review): seed=42 together with workers=4 may not give bit-identical
# runs — verify against the gensim documentation if reproducibility matters.
clock_start = time.time()
w2v_sg_ns = Word2Vec(sentences=train_sentences, **sg_ns_config)
train_time_sg_ns = time.time() - clock_start

tokens_per_sec_sg_ns = total_tokens_train / train_time_sg_ns

print(f"‚úÖ Training complete in {train_time_sg_ns:.2f}s")
print(f"   Tokens/sec: {tokens_per_sec_sg_ns:,.0f}")
print(f"   Vocabulary size: {len(w2v_sg_ns.wv):,}")
print(f"   Vector size: {w2v_sg_ns.wv.vector_size}")


üîß Training Word2Vec Skip-gram with Negative Sampling...

Total training tokens: 285,829


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


‚úÖ Training complete in 6.25s
   Tokens/sec: 45,760
   Vocabulary size: 9,848
   Vector size: 100


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [8]:
# Write the Skip-gram/NS model to MODELS_DIR.
sg_ns_model_path = MODELS_DIR / 'w2v_sg_ns.model'
w2v_sg_ns.save(str(sg_ns_model_path))
print("üíæ Model saved!")

üíæ Model saved!


In [9]:
# Test nearest neighbors
print("\nüîç Nearest neighbors (Skip-gram NS):")
# Probe words; the same list is reused by every nearest-neighbour cell below.
test_words = ['government', 'film', 'music', 'technology', 'economy']

for word in test_words:
    # Guard clause: report and move on when the model has no vector for the word.
    if word not in w2v_sg_ns.wv:
        print(f"\n  {word}: NOT IN VOCABULARY")
        continue

    neighbours = w2v_sg_ns.wv.most_similar(word, topn=5)
    print(f"\n  {word}:")
    for neighbour, similarity in neighbours:
        print(f"    {neighbour:15s} : {similarity:.4f}")


üîç Nearest neighbors (Skip-gram NS):

  government:
    rebate          : 0.6258
    curb            : 0.6149
    quango          : 0.6146
    regulation      : 0.6135
    rethink         : 0.6120

  film:
    movie           : 0.6730
    cinema          : 0.6549
    hollywood       : 0.6513
    festival        : 0.6410
    documentary     : 0.6254

  music:
    label           : 0.5955
    urban           : 0.5926
    collection      : 0.5680
    digital         : 0.5666
    downloading     : 0.5654

  technology:
    optical         : 0.6866
    uwb             : 0.6835
    intel           : 0.6748
    matsushita      : 0.6559
    nokia           : 0.6469

  economy:
    export          : 0.7806
    moderate        : 0.7782
    growth          : 0.7702
    economic        : 0.7675
    stable          : 0.7641


In [10]:
# Create document vectors using TF-IDF weighted pooling
print("\nüìä Creating document vectors with TF-IDF weighted pooling...")

# Pool every split with the Skip-gram/NS vectors (train, dev, test — in that order).
X_train_w2v_sg_ns, X_dev_w2v_sg_ns, X_test_w2v_sg_ns = [
    tfidf_weighted_pooling(
        split_tokens, w2v_sg_ns, tfidf_vectorizer, tfidf_vectorizer.vocabulary_
    )
    for split_tokens in (train_sentences, dev_sentences, test_sentences)
]

# Report the resulting matrix shapes.
print(
    f"\n‚úÖ Document vectors created:",
    f"   TRAIN: {X_train_w2v_sg_ns.shape}",
    f"   DEV: {X_dev_w2v_sg_ns.shape}",
    f"   TEST: {X_test_w2v_sg_ns.shape}",
    sep="\n",
)


üìä Creating document vectors with TF-IDF weighted pooling...


Pooling:   0%|          | 0/1335 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]


‚úÖ Document vectors created:
   TRAIN: (1335, 100)
   DEV: (445, 100)
   TEST: (445, 100)


In [11]:
# Write the three Skip-gram/NS document matrices to MODELS_DIR as .npy files.
for npy_name, matrix in (
    ('X_train_w2v_sg_ns.npy', X_train_w2v_sg_ns),
    ('X_dev_w2v_sg_ns.npy', X_dev_w2v_sg_ns),
    ('X_test_w2v_sg_ns.npy', X_test_w2v_sg_ns),
):
    np.save(MODELS_DIR / npy_name, matrix)
print("\nüíæ Document vectors saved!")


üíæ Document vectors saved!


## 4. Word2Vec - CBOW with Negative Sampling (NS)

In [12]:
print("\nüîß Training Word2Vec CBOW with Negative Sampling...\n")

# CBOW + negative-sampling hyper-parameters.
cbow_ns_config = {
    'sg': 0,            # CBOW
    'vector_size': 100,
    'window': 5,
    'min_count': 3,
    'negative': 5,
    'hs': 0,
    'epochs': 10,
    'workers': 4,
    'seed': 42,
}

# Time the constructor call end to end.
clock_start = time.time()
w2v_cbow_ns = Word2Vec(sentences=train_sentences, **cbow_ns_config)
train_time_cbow_ns = time.time() - clock_start

# total_tokens_train comes from the Skip-gram/NS training cell.
tokens_per_sec_cbow_ns = total_tokens_train / train_time_cbow_ns

print(f"‚úÖ Training complete in {train_time_cbow_ns:.2f}s")
print(f"   Tokens/sec: {tokens_per_sec_cbow_ns:,.0f}")
print(f"   Vocabulary size: {len(w2v_cbow_ns.wv):,}")


üîß Training Word2Vec CBOW with Negative Sampling...



Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


‚úÖ Training complete in 2.01s
   Tokens/sec: 142,017
   Vocabulary size: 9,848


In [13]:
# Write the CBOW/NS model to MODELS_DIR.
cbow_ns_model_path = MODELS_DIR / 'w2v_cbow_ns.model'
w2v_cbow_ns.save(str(cbow_ns_model_path))
print("üíæ Model saved!")

üíæ Model saved!


In [14]:
# Test nearest neighbors
print("\nüîç Nearest neighbors (CBOW NS):")

for word in test_words:
    # Guard clause: report and move on when the model has no vector for the word.
    if word not in w2v_cbow_ns.wv:
        print(f"\n  {word}: NOT IN VOCABULARY")
        continue

    neighbours = w2v_cbow_ns.wv.most_similar(word, topn=5)
    print(f"\n  {word}:")
    for neighbour, similarity in neighbours:
        print(f"    {neighbour:15s} : {similarity:.4f}")


üîç Nearest neighbors (CBOW NS):

  government:
    policy          : 0.9564
    pension         : 0.9528
    local           : 0.9464
    plan            : 0.9414
    proposal        : 0.9406

  film:
    actor           : 0.9144
    aviator         : 0.9063
    hollywood       : 0.8926
    award           : 0.8903
    abortionist     : 0.8807

  music:
    digital         : 0.9171
    song            : 0.9030
    mp3             : 0.8964
    screen          : 0.8952
    recorder        : 0.8951

  technology:
    network         : 0.9499
    apple           : 0.9454
    content         : 0.9358
    using           : 0.9300
    internet        : 0.9265

  economy:
    growth          : 0.9554
    unece           : 0.9527
    rise            : 0.9443
    forecast        : 0.9433
    debt            : 0.9431


In [15]:
# Create document vectors
print("\nüìä Creating document vectors with TF-IDF weighted pooling...")

# Pool every split with the CBOW/NS vectors (train, dev, test — in that order).
X_train_w2v_cbow_ns, X_dev_w2v_cbow_ns, X_test_w2v_cbow_ns = [
    tfidf_weighted_pooling(
        split_tokens, w2v_cbow_ns, tfidf_vectorizer, tfidf_vectorizer.vocabulary_
    )
    for split_tokens in (train_sentences, dev_sentences, test_sentences)
]

# Report the resulting matrix shapes.
print(
    f"\n‚úÖ Document vectors created:",
    f"   TRAIN: {X_train_w2v_cbow_ns.shape}",
    f"   DEV: {X_dev_w2v_cbow_ns.shape}",
    f"   TEST: {X_test_w2v_cbow_ns.shape}",
    sep="\n",
)


üìä Creating document vectors with TF-IDF weighted pooling...


Pooling:   0%|          | 0/1335 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]


‚úÖ Document vectors created:
   TRAIN: (1335, 100)
   DEV: (445, 100)
   TEST: (445, 100)


In [16]:
# Write the three CBOW/NS document matrices to MODELS_DIR as .npy files.
for npy_name, matrix in (
    ('X_train_w2v_cbow_ns.npy', X_train_w2v_cbow_ns),
    ('X_dev_w2v_cbow_ns.npy', X_dev_w2v_cbow_ns),
    ('X_test_w2v_cbow_ns.npy', X_test_w2v_cbow_ns),
):
    np.save(MODELS_DIR / npy_name, matrix)
print("\nüíæ Document vectors saved!")


üíæ Document vectors saved!


## 5. Word2Vec - Skip-gram with Hierarchical Softmax (HS)

In [17]:
print("\nüîß Training Word2Vec Skip-gram with Hierarchical Softmax...\n")

# Skip-gram + hierarchical-softmax hyper-parameters.
sg_hs_config = {
    'sg': 1,            # Skip-gram
    'vector_size': 100,
    'window': 5,
    'min_count': 3,
    'negative': 0,      # No negative sampling
    'hs': 1,            # Hierarchical softmax
    'epochs': 10,
    'workers': 4,
    'seed': 42,
}

# Time the constructor call end to end.
clock_start = time.time()
w2v_sg_hs = Word2Vec(sentences=train_sentences, **sg_hs_config)
train_time_sg_hs = time.time() - clock_start

# total_tokens_train comes from the Skip-gram/NS training cell.
tokens_per_sec_sg_hs = total_tokens_train / train_time_sg_hs

print(f"‚úÖ Training complete in {train_time_sg_hs:.2f}s")
print(f"   Tokens/sec: {tokens_per_sec_sg_hs:,.0f}")
print(f"   Vocabulary size: {len(w2v_sg_hs.wv):,}")


üîß Training Word2Vec Skip-gram with Hierarchical Softmax...



Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


‚úÖ Training complete in 6.38s
   Tokens/sec: 44,775
   Vocabulary size: 9,848


In [18]:
# Write the Skip-gram/HS model to MODELS_DIR.
sg_hs_model_path = MODELS_DIR / 'w2v_sg_hs.model'
w2v_sg_hs.save(str(sg_hs_model_path))
print("üíæ Model saved!")

üíæ Model saved!


In [19]:
# Test nearest neighbors
print("\nüîç Nearest neighbors (Skip-gram HS):")

for word in test_words:
    # Guard clause: report and move on when the model has no vector for the word.
    if word not in w2v_sg_hs.wv:
        print(f"\n  {word}: NOT IN VOCABULARY")
        continue

    neighbours = w2v_sg_hs.wv.most_similar(word, topn=5)
    print(f"\n  {word}:")
    for neighbour, similarity in neighbours:
        print(f"    {neighbour:15s} : {similarity:.4f}")


üîç Nearest neighbors (Skip-gram HS):

  government:
    raynsford       : 0.6564
    local           : 0.6053
    taxation        : 0.5943
    expenditure     : 0.5806
    outcry          : 0.5751

  film:
    hollywood       : 0.6844
    sundance        : 0.6640
    nod             : 0.6541
    festival        : 0.6458
    gritty          : 0.6447

  music:
    1xtra           : 0.6151
    napster         : 0.5973
    digital         : 0.5964
    showcasing      : 0.5692
    downloading     : 0.5669

  technology:
    lucent          : 0.6971
    evolution       : 0.6619
    samsung         : 0.6588
    matsushita      : 0.6579
    souped          : 0.6449

  economy:
    growth          : 0.8142
    economic        : 0.7490
    unece           : 0.7368
    export          : 0.7129
    indicator       : 0.6685


In [20]:
# Create document vectors
print("\nüìä Creating document vectors with TF-IDF weighted pooling...")

# Pool every split with the Skip-gram/HS vectors (train, dev, test — in that order).
X_train_w2v_sg_hs, X_dev_w2v_sg_hs, X_test_w2v_sg_hs = [
    tfidf_weighted_pooling(
        split_tokens, w2v_sg_hs, tfidf_vectorizer, tfidf_vectorizer.vocabulary_
    )
    for split_tokens in (train_sentences, dev_sentences, test_sentences)
]

# Report the resulting matrix shapes.
print(
    f"\n‚úÖ Document vectors created:",
    f"   TRAIN: {X_train_w2v_sg_hs.shape}",
    f"   DEV: {X_dev_w2v_sg_hs.shape}",
    f"   TEST: {X_test_w2v_sg_hs.shape}",
    sep="\n",
)


üìä Creating document vectors with TF-IDF weighted pooling...


Pooling:   0%|          | 0/1335 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]


‚úÖ Document vectors created:
   TRAIN: (1335, 100)
   DEV: (445, 100)
   TEST: (445, 100)


In [21]:
# Write the three Skip-gram/HS document matrices to MODELS_DIR as .npy files.
for npy_name, matrix in (
    ('X_train_w2v_sg_hs.npy', X_train_w2v_sg_hs),
    ('X_dev_w2v_sg_hs.npy', X_dev_w2v_sg_hs),
    ('X_test_w2v_sg_hs.npy', X_test_w2v_sg_hs),
):
    np.save(MODELS_DIR / npy_name, matrix)
print("\nüíæ Document vectors saved!")


üíæ Document vectors saved!


## 6. Word2Vec - CBOW with Hierarchical Softmax (HS)

In [22]:
print("\nüîß Training Word2Vec CBOW with Hierarchical Softmax...\n")

# CBOW + hierarchical-softmax hyper-parameters.
cbow_hs_config = {
    'sg': 0,            # CBOW
    'vector_size': 100,
    'window': 5,
    'min_count': 3,
    'negative': 0,
    'hs': 1,            # Hierarchical softmax
    'epochs': 10,
    'workers': 4,
    'seed': 42,
}

# Time the constructor call end to end.
clock_start = time.time()
w2v_cbow_hs = Word2Vec(sentences=train_sentences, **cbow_hs_config)
train_time_cbow_hs = time.time() - clock_start

# total_tokens_train comes from the Skip-gram/NS training cell.
tokens_per_sec_cbow_hs = total_tokens_train / train_time_cbow_hs

print(f"‚úÖ Training complete in {train_time_cbow_hs:.2f}s")
print(f"   Tokens/sec: {tokens_per_sec_cbow_hs:,.0f}")
print(f"   Vocabulary size: {len(w2v_cbow_hs.wv):,}")


üîß Training Word2Vec CBOW with Hierarchical Softmax...

‚úÖ Training complete in 1.98s
   Tokens/sec: 144,672
   Vocabulary size: 9,848


In [23]:
# Write the CBOW/HS model to MODELS_DIR.
cbow_hs_model_path = MODELS_DIR / 'w2v_cbow_hs.model'
w2v_cbow_hs.save(str(cbow_hs_model_path))
print("üíæ Model saved!")

üíæ Model saved!


In [24]:
# Test nearest neighbors
print("\nüîç Nearest neighbors (CBOW HS):")

for word in test_words:
    # Guard clause: report and move on when the model has no vector for the word.
    if word not in w2v_cbow_hs.wv:
        print(f"\n  {word}: NOT IN VOCABULARY")
        continue

    neighbours = w2v_cbow_hs.wv.most_similar(word, topn=5)
    print(f"\n  {word}:")
    for neighbour, similarity in neighbours:
        print(f"    {neighbour:15s} : {similarity:.4f}")


üîç Nearest neighbors (CBOW HS):

  government:
    health          : 0.7455
    reform          : 0.6846
    immigration     : 0.6764
    policy          : 0.6737
    plan            : 0.6399

  film:
    nominated       : 0.6774
    star            : 0.6751
    hollywood       : 0.6722
    movie           : 0.6621
    aviator         : 0.6564

  music:
    digital         : 0.6575
    song            : 0.5569
    artist          : 0.5523
    medium          : 0.5421
    gadget          : 0.5310

  technology:
    data            : 0.6571
    content         : 0.6557
    network         : 0.6553
    wireless        : 0.6543
    us              : 0.6477

  economy:
    growth          : 0.8156
    economic        : 0.7897
    export          : 0.7758
    forecast        : 0.7201
    overall         : 0.6969


In [25]:
# Create document vectors
print("\nüìä Creating document vectors with TF-IDF weighted pooling...")

# Pool every split with the CBOW/HS vectors (train, dev, test — in that order).
X_train_w2v_cbow_hs, X_dev_w2v_cbow_hs, X_test_w2v_cbow_hs = [
    tfidf_weighted_pooling(
        split_tokens, w2v_cbow_hs, tfidf_vectorizer, tfidf_vectorizer.vocabulary_
    )
    for split_tokens in (train_sentences, dev_sentences, test_sentences)
]

# Report the resulting matrix shapes.
print(
    f"\n‚úÖ Document vectors created:",
    f"   TRAIN: {X_train_w2v_cbow_hs.shape}",
    f"   DEV: {X_dev_w2v_cbow_hs.shape}",
    f"   TEST: {X_test_w2v_cbow_hs.shape}",
    sep="\n",
)


üìä Creating document vectors with TF-IDF weighted pooling...


Pooling:   0%|          | 0/1335 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]


‚úÖ Document vectors created:
   TRAIN: (1335, 100)
   DEV: (445, 100)
   TEST: (445, 100)


In [26]:
# Write the three CBOW/HS document matrices to MODELS_DIR as .npy files.
for npy_name, matrix in (
    ('X_train_w2v_cbow_hs.npy', X_train_w2v_cbow_hs),
    ('X_dev_w2v_cbow_hs.npy', X_dev_w2v_cbow_hs),
    ('X_test_w2v_cbow_hs.npy', X_test_w2v_cbow_hs),
):
    np.save(MODELS_DIR / npy_name, matrix)
print("\nüíæ Document vectors saved!")


üíæ Document vectors saved!


## 7. GloVe Embeddings (Pretrained)

In [27]:
print("\nüì• Loading GloVe embeddings...\n")

glove_path = DATA_DIR / 'glove.6B.100d.txt'

# Fail fast, with download instructions, when the embeddings file is absent.
if not glove_path.exists():
    print("‚ùå ERROR: GloVe file not found!")
    print(f"   Expected location: {glove_path}")
    print("\n   Please download from: http://nlp.stanford.edu/data/glove.6B.zip")
    print("   Extract glove.6B.100d.txt to the data/ folder")
    raise FileNotFoundError(f"GloVe embeddings not found at {glove_path}")

# Parse the text file into a token -> float32 vector dictionary. Each line is
# split on whitespace: the first field is the token, the rest are the coefficients.
glove_embeddings = {}

with glove_path.open('r', encoding='utf-8') as glove_file:
    # total=400000 only sizes the progress bar; it does not limit how many lines are read.
    for raw_line in tqdm(glove_file, desc="Loading GloVe", total=400000):
        fields = raw_line.split()
        glove_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

glove_dim = len(next(iter(glove_embeddings.values())))
print(f"\n‚úÖ Loaded {len(glove_embeddings):,} word vectors")
print(f"   Vector dimension: {glove_dim}")


üì• Loading GloVe embeddings...



Loading GloVe:   0%|          | 0/400000 [00:00<?, ?it/s]


‚úÖ Loaded 400,000 word vectors
   Vector dimension: 100


In [28]:
# Test nearest neighbors for GloVe
print("\nüîç Testing GloVe embeddings:")

def get_most_similar_glove(word, embeddings, topn=5, max_words=50000):
    """Find the words most similar to ``word`` by cosine similarity.

    Args:
        word: Query token.
        embeddings: Dict mapping token -> 1-D numpy vector.
        topn: Number of neighbours to return.
        max_words: Only the first ``max_words`` entries of ``embeddings``
            (in insertion order) are scanned, to bound the search cost.
            Pass ``None`` to scan every entry.

    Returns:
        ``None`` when ``word`` is not in ``embeddings``; otherwise a list of
        ``(other_word, similarity)`` tuples ordered by decreasing similarity.
        Zero-norm vectors have no defined cosine and are skipped; a zero-norm
        query vector yields an empty list.
    """
    # Local imports keep this cell self-contained without touching the import cell.
    import heapq
    from itertools import islice

    if word not in embeddings:
        return None

    word_vec = embeddings[word]
    # The query norm is loop-invariant, so compute it once.
    word_norm = np.linalg.norm(word_vec)
    if word_norm == 0:
        return []

    def _scored_candidates():
        # islice walks the dict lazily instead of copying every item into a list.
        for other_word, other_vec in islice(embeddings.items(), max_words):
            if other_word == word:
                continue
            other_norm = np.linalg.norm(other_vec)
            if other_norm == 0:
                # Would divide by zero and put NaN into the ranking.
                continue
            yield other_word, np.dot(word_vec, other_vec) / (word_norm * other_norm)

    # nlargest(n, it, key) is equivalent to sorted(it, key=key, reverse=True)[:n].
    return heapq.nlargest(topn, _scored_candidates(), key=lambda pair: pair[1])

for word in test_words:
    # Guard clause: report and move on when GloVe has no vector for the word.
    if word not in glove_embeddings:
        print(f"\n  {word}: NOT IN GLOVE VOCABULARY")
        continue

    neighbours = get_most_similar_glove(word, glove_embeddings, topn=5)
    print(f"\n  {word}:")
    # A None or empty result prints only the header line.
    for neighbour, similarity in (neighbours or []):
        print(f"    {neighbour:15s} : {similarity:.4f}")


üîç Testing GloVe embeddings:

  government:
    administration  : 0.7937
    governments     : 0.7701
    officials       : 0.7590
    authorities     : 0.7442
    opposition      : 0.7372

  film:
    movie           : 0.9055
    films           : 0.8914
    directed        : 0.8124
    documentary     : 0.8076
    drama           : 0.7929

  music:
    musical         : 0.8128
    songs           : 0.7978
    dance           : 0.7897
    pop             : 0.7863
    recording       : 0.7651

  technology:
    technologies    : 0.8506
    computer        : 0.7642
    tech            : 0.7489
    software        : 0.7359
    systems         : 0.7293

  economy:
    economic        : 0.8279
    growth          : 0.7947
    recession       : 0.7692
    economies       : 0.7545
    recovery        : 0.7491


In [29]:
# Create document vectors using GloVe
print("\nüìä Creating document vectors with GloVe + TF-IDF pooling...")

# Pool every split with the GloVe vectors (train, dev, test — in that order).
X_train_glove, X_dev_glove, X_test_glove = [
    tfidf_weighted_pooling(
        split_tokens, glove_embeddings, tfidf_vectorizer, tfidf_vectorizer.vocabulary_
    )
    for split_tokens in (train_sentences, dev_sentences, test_sentences)
]

# Report the resulting matrix shapes.
print(
    f"\n‚úÖ Document vectors created:",
    f"   TRAIN: {X_train_glove.shape}",
    f"   DEV: {X_dev_glove.shape}",
    f"   TEST: {X_test_glove.shape}",
    sep="\n",
)


üìä Creating document vectors with GloVe + TF-IDF pooling...


Pooling:   0%|          | 0/1335 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]

Pooling:   0%|          | 0/445 [00:00<?, ?it/s]


‚úÖ Document vectors created:
   TRAIN: (1335, 100)
   DEV: (445, 100)
   TEST: (445, 100)


In [30]:
# Write the three GloVe document matrices to MODELS_DIR as .npy files.
for npy_name, matrix in (
    ('X_train_glove.npy', X_train_glove),
    ('X_dev_glove.npy', X_dev_glove),
    ('X_test_glove.npy', X_test_glove),
):
    np.save(MODELS_DIR / npy_name, matrix)
print("\nüíæ GloVe document vectors saved!")


üíæ GloVe document vectors saved!


## 8. Calculate Health Metrics for Dense Methods

In [31]:
print("\nüìä Calculating health metrics for dense representations...\n")

# Calculate OOV rates
def calculate_oov_rate(tokens_list, vocab):
    """Return the percentage of token occurrences that are missing from ``vocab``.

    Args:
        tokens_list: Iterable of token lists, one per document.
        vocab: Any container supporting ``token in vocab``.

    Returns:
        Out-of-vocabulary rate in percent (0-100), or 0.0 when there are no tokens.
    """
    seen = 0
    missing = 0

    # Single pass over every token occurrence across all documents.
    for token in (tok for doc in tokens_list for tok in doc):
        seen += 1
        if token not in vocab:
            missing += 1

    if seen == 0:
        return 0.0
    return missing / seen * 100

# Calculate coverage
def calculate_coverage(tokens_list, vocab, k):
    """Return the percentage of the ``k`` most frequent tokens present in ``vocab``.

    Args:
        tokens_list: Iterable of token lists, one per document.
        vocab: Any container supporting ``token in vocab``.
        k: How many of the most frequent distinct tokens to check.

    Returns:
        Coverage in percent (0-100). The denominator is the number of tokens
        actually ranked, i.e. ``min(k, number of distinct tokens)``, so a
        corpus with fewer than ``k`` distinct tokens is not penalised.
        Returns 0.0 when ``k <= 0`` or when there are no tokens at all.
    """
    if k <= 0:
        return 0.0

    token_counts = Counter()
    for tokens in tokens_list:
        token_counts.update(tokens)

    top_k_tokens = [token for token, _ in token_counts.most_common(k)]
    if not top_k_tokens:
        return 0.0

    covered = sum(1 for token in top_k_tokens if token in vocab)

    # Divide by the tokens actually ranked, not by k (which may exceed them).
    return covered / len(top_k_tokens) * 100

print("‚úÖ Helper functions defined!")


üìä Calculating health metrics for dense representations...

‚úÖ Helper functions defined!


In [32]:
# Health metrics for the Skip-gram/NS vocabulary, measured on the TEST tokens.
sg_ns_vocab = w2v_sg_ns.wv
oov_sg_ns = calculate_oov_rate(test_sentences, sg_ns_vocab)
cov100_sg_ns, cov500_sg_ns = [
    calculate_coverage(test_sentences, sg_ns_vocab, top_k) for top_k in (100, 500)
]

print(
    "Word2Vec Skip-gram NS:",
    f"  Vocabulary size: {len(w2v_sg_ns.wv):,}",
    f"  OOV rate (TEST): {oov_sg_ns:.2f}%",
    f"  Top-100 coverage: {cov100_sg_ns:.2f}%",
    f"  Top-500 coverage: {cov500_sg_ns:.2f}%",
    f"  Training time: {train_time_sg_ns:.2f}s",
    f"  Tokens/sec: {tokens_per_sec_sg_ns:,.0f}",
    sep="\n",
)

Word2Vec Skip-gram NS:
  Vocabulary size: 9,848
  OOV rate (TEST): 8.28%
  Top-100 coverage: 100.00%
  Top-500 coverage: 100.00%
  Training time: 6.25s
  Tokens/sec: 45,760


In [33]:
# Health metrics for the CBOW/NS vocabulary, measured on the TEST tokens.
cbow_ns_vocab = w2v_cbow_ns.wv
oov_cbow_ns = calculate_oov_rate(test_sentences, cbow_ns_vocab)
cov100_cbow_ns, cov500_cbow_ns = [
    calculate_coverage(test_sentences, cbow_ns_vocab, top_k) for top_k in (100, 500)
]

print(
    "\nWord2Vec CBOW NS:",
    f"  Vocabulary size: {len(w2v_cbow_ns.wv):,}",
    f"  OOV rate (TEST): {oov_cbow_ns:.2f}%",
    f"  Top-100 coverage: {cov100_cbow_ns:.2f}%",
    f"  Top-500 coverage: {cov500_cbow_ns:.2f}%",
    f"  Training time: {train_time_cbow_ns:.2f}s",
    f"  Tokens/sec: {tokens_per_sec_cbow_ns:,.0f}",
    sep="\n",
)


Word2Vec CBOW NS:
  Vocabulary size: 9,848
  OOV rate (TEST): 8.28%
  Top-100 coverage: 100.00%
  Top-500 coverage: 100.00%
  Training time: 2.01s
  Tokens/sec: 142,017


In [34]:
# Health metrics for the Skip-gram/HS vocabulary, measured on the TEST tokens.
sg_hs_vocab = w2v_sg_hs.wv
oov_sg_hs = calculate_oov_rate(test_sentences, sg_hs_vocab)
cov100_sg_hs, cov500_sg_hs = [
    calculate_coverage(test_sentences, sg_hs_vocab, top_k) for top_k in (100, 500)
]

print(
    "\nWord2Vec Skip-gram HS:",
    f"  Vocabulary size: {len(w2v_sg_hs.wv):,}",
    f"  OOV rate (TEST): {oov_sg_hs:.2f}%",
    f"  Top-100 coverage: {cov100_sg_hs:.2f}%",
    f"  Top-500 coverage: {cov500_sg_hs:.2f}%",
    f"  Training time: {train_time_sg_hs:.2f}s",
    f"  Tokens/sec: {tokens_per_sec_sg_hs:,.0f}",
    sep="\n",
)


Word2Vec Skip-gram HS:
  Vocabulary size: 9,848
  OOV rate (TEST): 8.28%
  Top-100 coverage: 100.00%
  Top-500 coverage: 100.00%
  Training time: 6.38s
  Tokens/sec: 44,775


In [35]:
# Health metrics for the CBOW/HS vocabulary, measured on the TEST tokens.
cbow_hs_vocab = w2v_cbow_hs.wv
oov_cbow_hs = calculate_oov_rate(test_sentences, cbow_hs_vocab)
cov100_cbow_hs, cov500_cbow_hs = [
    calculate_coverage(test_sentences, cbow_hs_vocab, top_k) for top_k in (100, 500)
]

print(
    "\nWord2Vec CBOW HS:",
    f"  Vocabulary size: {len(w2v_cbow_hs.wv):,}",
    f"  OOV rate (TEST): {oov_cbow_hs:.2f}%",
    f"  Top-100 coverage: {cov100_cbow_hs:.2f}%",
    f"  Top-500 coverage: {cov500_cbow_hs:.2f}%",
    f"  Training time: {train_time_cbow_hs:.2f}s",
    f"  Tokens/sec: {tokens_per_sec_cbow_hs:,.0f}",
    sep="\n",
)


Word2Vec CBOW HS:
  Vocabulary size: 9,848
  OOV rate (TEST): 8.28%
  Top-100 coverage: 100.00%
  Top-500 coverage: 100.00%
  Training time: 1.98s
  Tokens/sec: 144,672


In [36]:
# Health metrics for the GloVe vocabulary, measured on the TEST tokens.
# Training time and throughput are printed as fixed placeholder text.
oov_glove = calculate_oov_rate(test_sentences, glove_embeddings)
cov100_glove, cov500_glove = [
    calculate_coverage(test_sentences, glove_embeddings, top_k) for top_k in (100, 500)
]

print(
    "\nGloVe (pretrained):",
    f"  Vocabulary size: {len(glove_embeddings):,}",
    f"  OOV rate (TEST): {oov_glove:.2f}%",
    f"  Top-100 coverage: {cov100_glove:.2f}%",
    f"  Top-500 coverage: {cov500_glove:.2f}%",
    "  Training time: 0.00s (pretrained)",
    "  Tokens/sec: N/A (pretrained)",
    sep="\n",
)


GloVe (pretrained):
  Vocabulary size: 400,000
  OOV rate (TEST): 0.61%
  Top-100 coverage: 100.00%
  Top-500 coverage: 100.00%
  Training time: 0.00s (pretrained)
  Tokens/sec: N/A (pretrained)


## 9. Save Results Summary

In [37]:
import json


def _summarize_dense(vocab_size, vector_dim, oov, cov100, cov500,
                     train_time, tokens_per_sec, train_X, dev_X, test_X):
    """Build the JSON-serialisable metrics record for one dense representation.

    ``tokens_per_sec`` may be ``None`` (no training happened); every other
    numeric input is coerced to a built-in ``int``/``float`` so ``json.dump``
    never sees a numpy scalar. The three matrices contribute only their shapes.
    """
    return {
        'vocab_size': int(vocab_size),
        'vector_dim': int(vector_dim),
        'oov_rate': float(oov),
        'top100_coverage': float(cov100),
        'top500_coverage': float(cov500),
        'train_time_sec': float(train_time),
        'tokens_per_sec': None if tokens_per_sec is None else float(tokens_per_sec),
        'train_shape': list(train_X.shape),
        'dev_shape': list(dev_X.shape),
        'test_shape': list(test_X.shape),
    }


# Compile dense methods results (one record per representation, same key order
# for every method so the summary table can iterate uniformly).
dense_results = {
    'w2v_sg_ns': _summarize_dense(
        len(w2v_sg_ns.wv), w2v_sg_ns.wv.vector_size,
        oov_sg_ns, cov100_sg_ns, cov500_sg_ns,
        train_time_sg_ns, tokens_per_sec_sg_ns,
        X_train_w2v_sg_ns, X_dev_w2v_sg_ns, X_test_w2v_sg_ns,
    ),
    'w2v_cbow_ns': _summarize_dense(
        len(w2v_cbow_ns.wv), w2v_cbow_ns.wv.vector_size,
        oov_cbow_ns, cov100_cbow_ns, cov500_cbow_ns,
        train_time_cbow_ns, tokens_per_sec_cbow_ns,
        X_train_w2v_cbow_ns, X_dev_w2v_cbow_ns, X_test_w2v_cbow_ns,
    ),
    'w2v_sg_hs': _summarize_dense(
        len(w2v_sg_hs.wv), w2v_sg_hs.wv.vector_size,
        oov_sg_hs, cov100_sg_hs, cov500_sg_hs,
        train_time_sg_hs, tokens_per_sec_sg_hs,
        X_train_w2v_sg_hs, X_dev_w2v_sg_hs, X_test_w2v_sg_hs,
    ),
    'w2v_cbow_hs': _summarize_dense(
        len(w2v_cbow_hs.wv), w2v_cbow_hs.wv.vector_size,
        oov_cbow_hs, cov100_cbow_hs, cov500_cbow_hs,
        train_time_cbow_hs, tokens_per_sec_cbow_hs,
        X_train_w2v_cbow_hs, X_dev_w2v_cbow_hs, X_test_w2v_cbow_hs,
    ),
    'glove': _summarize_dense(
        len(glove_embeddings),
        # Derived from the pooled matrix instead of a hard-coded 100, so the
        # record stays correct if a different GloVe file is loaded.
        X_train_glove.shape[1],
        oov_glove, cov100_glove, cov500_glove,
        0.0,    # pretrained: no training time measured in this notebook
        None,   # pretrained: no throughput to report
        X_train_glove, X_dev_glove, X_test_glove,
    ),
}

# Save results
with open(CACHE_DIR / 'dense_results.json', 'w') as f:
    json.dump(dense_results, f, indent=2)

print("\nüíæ Dense methods results saved to cache/dense_results.json")


üíæ Dense methods results saved to cache/dense_results.json


## 10. Summary Comparison

In [38]:
# Fixed-width comparison table built from dense_results.
heavy_rule = "=" * 80
light_rule = "-" * 80

print("\n" + heavy_rule)
print("DENSE METHODS SUMMARY")
print(heavy_rule)

print(f"\n{'Method':<20} {'Vocab':<10} {'OOV%':<8} {'Top100%':<9} {'Top500%':<9} {'Time(s)':<10} {'Tok/s':<12}")
print(light_rule)

for method_name, metrics in dense_results.items():
    # A falsy throughput (e.g. None) is rendered as "N/A".
    throughput = metrics['tokens_per_sec']
    if throughput:
        tok_s = f"{throughput:,.0f}"
    else:
        tok_s = "N/A"
    table_row = (
        f"{method_name:<20} {metrics['vocab_size']:<10,} {metrics['oov_rate']:<8.2f} "
        f"{metrics['top100_coverage']:<9.2f} {metrics['top500_coverage']:<9.2f} "
        f"{metrics['train_time_sec']:<10.2f} {tok_s:<12}"
    )
    print(table_row)

print("\n" + heavy_rule)
print("‚úÖ Dense methods training complete!")
print(heavy_rule)


DENSE METHODS SUMMARY

Method               Vocab      OOV%     Top100%   Top500%   Time(s)    Tok/s       
--------------------------------------------------------------------------------
w2v_sg_ns            9,848      8.28     100.00    100.00    6.25       45,760      
w2v_cbow_ns          9,848      8.28     100.00    100.00    2.01       142,017     
w2v_sg_hs            9,848      8.28     100.00    100.00    6.38       44,775      
w2v_cbow_hs          9,848      8.28     100.00    100.00    1.98       144,672     
glove                400,000    0.61     100.00    100.00    0.00       N/A         

‚úÖ Dense methods training complete!


In [39]:
print("\nüéâ Notebook 03: Dense Methods - COMPLETE!")
# Closing pointer to the follow-up notebook.
print(
    "\nNext steps:",
    "  1. Run notebook 04: Classification",
    "  2. Train classifiers on all representations",
    "  3. Compare performance on TEST set",
    sep="\n",
)


üéâ Notebook 03: Dense Methods - COMPLETE!

Next steps:
  1. Run notebook 04: Classification
  2. Train classifiers on all representations
  3. Compare performance on TEST set
