# Exploring Static Word Embeddings with FastText

This script demonstrates key concepts of static word embeddings using a pre-trained FastText model (wiki-news-300d-1M-subword, ~600 MB compressed, 300-dimensional vectors). We focus on the word 'mercury', which has two meanings (a metal and a planet), to explore how embeddings capture semantic relationships.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from pathlib import Path
import urllib.request
import zipfile
import os
import random
# Set random seeds for reproducibility. Both generators are seeded because
# the vocabulary sample for the t-SNE plot is drawn with the standard-library
# `random` module, which np.random.seed() does not affect.
np.random.seed(42)
random.seed(42)

# Create output directory for saving plots
Path("plots").mkdir(exist_ok=True)

## Section 1: Download and Load Pre-trained FastText Model

We use the wiki-news-300d-1M-subword model, a pre-trained FastText model containing one million English word vectors.
The download is ~600 MB (compressed) and the vectors are 300-dimensional. We download the .vec.zip archive from FastText's official repository, unzip it, and load the text-format vectors with Gensim's KeyedVectors.load_word2vec_format.
The vectors were trained with subword information, but the .vec file loaded here stores only whole-word vectors, so out-of-vocabulary words cannot be looked up in this script.

In [None]:
# Download location of the zipped text-format vectors and where they are
# stored locally.
model_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip"
model_zip_path = Path("models/wiki-news-300d-1M-subword.vec.zip")
# This must be the name of the file the archive extracts to. If it differs,
# the existence check below never succeeds (forcing a fresh download on every
# run) and the load step cannot find the vectors.
model_vec_path = Path("models/wiki-news-300d-1M-subword.vec")

# Create models directory if it doesn't exist
Path("models").mkdir(exist_ok=True)

# Download and unzip the model only if the .vec file is not already present
if not model_vec_path.exists():
    print("Downloading wiki-news-300d-1M-subword FastText model (~600 MB)...")
    urllib.request.urlretrieve(model_url, model_zip_path)
    print(f"Model downloaded to {model_zip_path}")
    print("Unzipping model...")
    with zipfile.ZipFile(model_zip_path, 'r') as zip_ref:
        zip_ref.extractall(Path("models"))
    os.remove(model_zip_path)  # Clean up .zip file
    # Fail with a clear message here instead of an obscure error from the
    # loader if the archive layout is not what we expect.
    if not model_vec_path.exists():
        raise FileNotFoundError(
            f"Expected {model_vec_path} after extracting {model_zip_path.name}, "
            "but it was not found"
        )
    print(f"Model saved to {model_vec_path}")

# Load the model (word2vec text format: one word and its vector per line)
print("Loading FastText model...")
model = KeyedVectors.load_word2vec_format(str(model_vec_path), binary=False)

## t-SNE Visualization of the Enormous Map of Relations

This section visualizes the enormous 'map of relations' in the FastText model, which contains 1 million words, each represented by a 300-dimensional vector (300 million total values).
To emphasize the vastness and complexity, we sample 10,000 words (1% of the vocabulary) and create a deliberately dense scatter plot using t-SNE to reduce the 300-dimensional vectors to 3D.
Only the core words (mercury, metals, planets) are highlighted and labeled to anchor the visualization, while the remaining points are plotted without labels to show the overwhelming density of relationships.
This illustrates the massive scale of the embedding space, where words are positioned based on semantic relationships, but the sheer number of points makes individual relations hard to discern.

In [None]:
def plot_tsne_space_of_meaning(core_words, sample_size=10000):
    """Save a dense 3D t-SNE scatter plot of sampled word vectors.

    Looks words up in the module-level ``model`` (anything supporting
    ``model[word]`` and exposing a ``key_to_index`` mapping). The core words
    are drawn in red with text labels; the randomly sampled remainder is
    drawn as small, faint, unlabeled points.

    Args:
        core_words: Words to highlight and label. Duplicates are dropped and
            words missing from the vocabulary are skipped with a printed
            notice instead of raising ``KeyError``.
        sample_size: Target total number of plotted words (core words plus
            random sample). Capped by the vocabulary size.

    Side effects:
        Writes ``plots/tsne_map_of_relations_3d.png`` (creating ``plots/`` if
        needed) and prints progress messages.
    """
    key_to_index = model.key_to_index

    # Keep only core words the model can actually look up, preserving order.
    known_core, unknown_core = [], []
    for word in dict.fromkeys(core_words):
        (known_core if word in key_to_index else unknown_core).append(word)
    if unknown_core:
        print(f"Skipping core words not in vocabulary: {unknown_core}")
    core_words = known_core

    # Sample additional random words, excluding the core words so that no
    # word is plotted twice.
    core_set = set(core_words)
    candidates = [word for word in key_to_index if word not in core_set]
    # Clamp to [0, len(candidates)] so random.sample never gets a negative
    # or too-large count.
    n_random = max(0, min(sample_size - len(core_words), len(candidates)))
    random_words = random.sample(candidates, n_random)
    sampled_words = core_words + random_words
    print(f"Sampled {len(sampled_words)} words for 3D t-SNE visualization")

    # Get word vectors (core words occupy the first rows)
    vectors = np.array([model[word] for word in sampled_words])

    # Apply t-SNE for 3D visualization. The iteration count is left at the
    # library default (1000): the keyword for it was renamed between
    # scikit-learn releases, so passing it explicitly breaks on some versions.
    # Perplexity is kept below the sample count, which newer releases require.
    tsne = TSNE(
        n_components=3,
        random_state=42,
        perplexity=min(30, len(sampled_words) - 1),
    )
    vectors_3d = tsne.fit_transform(vectors)

    # Create 3D scatter plot
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')

    # Plot all sampled words in blue with low alpha for a dense, cluttered effect
    ax.scatter(
        vectors_3d[:, 0], vectors_3d[:, 1], vectors_3d[:, 2],
        c='blue', alpha=0.1, s=5
    )

    # Highlight core words in red (they are the first len(core_words) rows)
    n_core = len(core_words)
    ax.scatter(
        vectors_3d[:n_core, 0], vectors_3d[:n_core, 1], vectors_3d[:n_core, 2],
        c='red', s=50, label='Core Words (Mercury, Metals, Planets)'
    )

    # Label only core words
    for (x, y, z), word in zip(vectors_3d, core_words):
        ax.text(x, y, z, word, fontsize=12, color='black')

    # Report the number of points actually plotted rather than a fixed figure.
    ax.set_title(
        "3D t-SNE Visualization of the Enormous Map of Relations\n"
        f"(1M Words × 300 Dimensions, Sample of {len(sampled_words):,} Words)"
    )
    ax.set_xlabel("t-SNE Dimension 1")
    ax.set_ylabel("t-SNE Dimension 2")
    ax.set_zlabel("t-SNE Dimension 3")
    ax.legend()
    fig.tight_layout()
    output_path = Path("plots/tsne_map_of_relations_3d.png")
    # Make the function usable even if the setup cell was not run.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path)
    plt.close(fig)
    print(f"Saved 3D map of relations t-SNE plot to {output_path}")

# Main Execution

In [None]:
if __name__ == "__main__":
    # Anchor words to highlight: the ambiguous "mercury" plus one metal
    # ("gold") and one planet ("venus").
    words = ["mercury", "gold", "venus"]
    # Render and save the t-SNE map of relations around those anchors
    plot_tsne_space_of_meaning(words)

# Exploring Sequential Embeddings and Hidden States

This script introduces sequential embeddings, focusing on the concept of the hidden state in Recurrent Neural Networks (RNNs) and their limitations. Unlike static embeddings (e.g., FastText), sequential embeddings process text sequentially, updating a hidden state to capture context.

We use a simple RNN to process a sentence containing "mercury" and generate two plots:

1. Hidden State Evolution Plot: Shows how the hidden state changes as the RNN processes each word, illustrating how it encodes temporal dependencies.
2. Context Limitation Plot: Demonstrates the limitation of sequential embeddings by showing how cosine similarities between hidden states decrease for distant words, highlighting issues like vanishing gradients or limited context retention.

Learning Objective: Understand the role of the hidden state in sequential models and recognize their limitations in capturing long-range dependencies.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import urllib.request
import zipfile
from pathlib import Path
import os

# Set random seeds for reproducibility: NumPy's global generator and
# PyTorch's default generator are both fixed to 42.
np.random.seed(42)
torch.manual_seed(42)

## Section 1: Download and Load Pre-trained GloVe Embeddings

We use a small pre-trained GloVe model (glove.6B.50d, ~160 MB; it ships inside the ~800 MB glove.6B.zip archive, which is what gets downloaded) to embed words in our input sentence. GloVe provides static word embeddings, which we feed into an RNN to generate sequential embeddings. Unlike a static embedding, the RNN's hidden state captures the context of the sequence.

In [None]:
# Source archive and local paths for the GloVe vectors.
glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = Path("models/glove.6B.zip")
glove_txt_path = Path("models/glove.6B.50d.txt")

# Create models directory
Path("models").mkdir(exist_ok=True)

# Download and unzip GloVe if not already present
if not glove_txt_path.exists():
    if not glove_zip_path.exists():
        print("Downloading GloVe 6B.50d model (~800 MB)...")
        urllib.request.urlretrieve(glove_url, glove_zip_path)
        print(f"Model downloaded to {glove_zip_path}")
    print("Unzipping model...")
    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        # Extract only the file this script reads. The archive also bundles
        # other vector sizes, and unpacking all of them wastes disk space
        # since the zip is deleted right afterwards.
        zip_ref.extract(glove_txt_path.name, path=glove_txt_path.parent)
    os.remove(glove_zip_path)
    print(f"Model saved to {glove_txt_path}")

# Load GloVe embeddings into a dictionary mapping word -> float32 vector.
# Each line of the file is a word followed by its vector components.
print("Loading GloVe embeddings...")
word_to_vec = {}
with open(glove_txt_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype=np.float32)
        word_to_vec[word] = vector

## Section 2: Prepare Input Sentence

 We use a sample sentence containing "mercury" to demonstrate the hidden state. The sentence is tokenized, and each word is mapped to its GloVe embedding. Words not in GloVe are assigned a zero vector. This sentence ties into the theme of "mercury" as a metal and planet.

In [None]:
# Tokenize the demo sentence on whitespace.
sentence = "mercury is a planet and a metal".lower().split()
# Unique words in sorted order. Iterating a bare set of strings can yield a
# different order on every interpreter run (string hash randomization), which
# would make word_to_idx irreproducible despite the fixed seeds.
vocab = sorted(set(sentence))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
# Look up each token's 50-d GloVe vector; unknown words fall back to a
# float32 zero vector so every row has the same dtype.
input_vectors = [word_to_vec.get(word, np.zeros(50, dtype=np.float32)) for word in sentence]
# Stack into a single ndarray first: building a tensor directly from a list
# of ndarrays is slow and triggers a PyTorch warning.
input_tensor = torch.tensor(np.stack(input_vectors), dtype=torch.float32).unsqueeze(0)  # Shape: (1, seq_len, 50)

## Section 3: Define Simple RNN Model

We define a basic RNN to process the sequence. The hidden state is a vector that updates at each timestep, encoding the context of the sequence up to that point. The hidden state is the key component of sequential embeddings, capturing temporal dependencies.

In [None]:
class SimpleRNN(nn.Module):
    """Single recurrent layer followed by a linear read-out.

    The recurrent layer is batch-first, so inputs are laid out as
    (batch, seq_len, input_size).
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the recurrent layer and the linear projection on top of it."""
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        """Run ``x`` through the RNN starting from ``hidden``.

        Returns a tuple of the projected per-timestep outputs and the
        hidden state produced by the recurrent layer.
        """
        rnn_out, next_hidden = self.rnn(x, hidden)
        projected = self.fc(rnn_out)
        return projected, next_hidden
# Model dimensions
input_size = 50  # one input feature per GloVe dimension (50d vectors)
hidden_size = 20  # deliberately small so the hidden state is easy to plot
output_size = len(vocab)  # one output per unique word (toy next-word task)

# Build the network and an all-zero starting hidden state
model = SimpleRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
initial_hidden = torch.zeros((1, 1, hidden_size))

## Section 4: Hidden State Evolution Plot

This section visualizes how the hidden state evolves as the RNN processes each word in the sentence. We reduce the hidden state (20 dimensions) to 2D using PCA for plotting. This shows how the hidden state captures the sequence's context, changing with each word.

In [None]:
def plot_hidden_state_evolution(sentence, input_tensor, model, initial_hidden):
    """Plot the RNN hidden state after each token as a 2D PCA trajectory.

    Args:
        sentence: Sequence of token strings, one per timestep; used for the
            point labels, legend and title.
        input_tensor: Input tensor indexed as ``[:, t:t+1, :]`` per timestep
            (batch-first, with timesteps along dimension 1).
        model: Callable as ``model(x, hidden) -> (out, hidden)``; switched to
            eval mode before use.
        initial_hidden: Hidden state passed to the first timestep.

    Side effects:
        Writes ``plots/hidden_state_evolution.png`` (creating ``plots/`` if
        needed) and prints the output path.
    """
    model.eval()
    hidden_states = []

    # Feed one timestep at a time so the hidden state after every token can
    # be recorded; gradients are not needed for visualization.
    hidden = initial_hidden
    with torch.no_grad():
        for t in range(input_tensor.size(1)):
            _, hidden = model(input_tensor[:, t:t+1, :], hidden)
            hidden_states.append(hidden.squeeze().numpy())

    # Reduce hidden states to 2D using PCA
    hidden_2d = PCA(n_components=2).fit_transform(np.array(hidden_states))

    # Plot hidden state trajectory: one labelled point per token, joined to
    # the previous point by a line segment.
    fig = plt.figure(figsize=(10, 6))
    for i, (word, (x, y)) in enumerate(zip(sentence, hidden_2d)):
        plt.scatter(x, y, s=100, label=word)
        if i > 0:
            plt.plot(hidden_2d[i-1:i+1, 0], hidden_2d[i-1:i+1, 1], 'b-')
        plt.text(x + 0.1, y, word, fontsize=12)

    # Build the title from the sentence actually passed in, so it stays
    # correct when the function is called with a different sentence.
    sentence_text = " ".join(sentence)
    plt.title(f"Hidden State Evolution Across Sequence\n(RNN Processing '{sentence_text}')")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend()
    plt.tight_layout()
    output_path = Path("plots/hidden_state_evolution.png")
    # Do not rely on the caller having created the output directory.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path)
    plt.close(fig)
    print(f"Saved hidden state evolution plot to {output_path}")

## Section 5: Context Limitation Plot

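This section illustrates a key limitation of sequential embeddings: difficulty capturing long-range dependencies. We compute the cosine similarity between the hidden state at each timestep and the final hidden state. Lower similarity for earlier timesteps indicates that the final state retains little of the early context. Note that the RNN here is untrained, so the plot shows how the hidden state drifts as later words overwrite it; during training, the related difficulty of propagating signal across many timesteps shows up as vanishing gradients.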

In [None]:
def plot_context_limitation(sentence, input_tensor, model, initial_hidden):
    """Plot cosine similarity of each timestep's hidden state to the final one.

    Args:
        sentence: Sequence of token strings, one per timestep; used to label
            the points.
        input_tensor: Input tensor indexed as ``[:, t:t+1, :]`` per timestep
            (batch-first, with timesteps along dimension 1).
        model: Callable as ``model(x, hidden) -> (out, hidden)``; switched to
            eval mode before use.
        initial_hidden: Hidden state passed to the first timestep.

    Side effects:
        Writes ``plots/context_limitation.png`` (creating ``plots/`` if
        needed) and prints the output path.
    """
    model.eval()
    hidden_states = []

    # Collect the hidden state after every token; no gradients needed.
    hidden = initial_hidden
    with torch.no_grad():
        for t in range(input_tensor.size(1)):
            _, hidden = model(input_tensor[:, t:t+1, :], hidden)
            hidden_states.append(hidden.squeeze().numpy())

    hidden_states = np.array(hidden_states)
    final_hidden = hidden_states[-1].reshape(1, -1)

    # Compute cosine similarities with the final hidden state in one call:
    # the result has one row per timestep and a single column.
    similarities = cosine_similarity(hidden_states, final_hidden)[:, 0]

    # Plot similarities against word position, labelling each point with
    # its word slightly above the marker.
    fig = plt.figure(figsize=(10, 6))
    plt.plot(range(len(similarities)), similarities, marker='o')
    for i, (word, similarity) in enumerate(zip(sentence, similarities)):
        plt.text(i, similarity + 0.02, word, fontsize=12)
    plt.title("Context Limitation: Cosine Similarity to Final Hidden State\n(Decreasing Similarity Shows Loss of Early Context)")
    plt.xlabel("Word Position in Sequence")
    plt.ylabel("Cosine Similarity")
    plt.tight_layout()
    output_path = Path("plots/context_limitation.png")
    # Do not rely on the caller having created the output directory.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path)
    plt.close(fig)
    print(f"Saved context limitation plot to {output_path}")

# Main Execution

Run the visualizations to generate the plots, saved in the 'plots' directory. The GloVe model is downloaded and loaded if not already present.

In [None]:
if __name__ == "__main__":
    # Both plotting functions save into plots/, so make sure it exists first
    Path("plots").mkdir(exist_ok=True)

    # Produce the hidden state evolution plot, then the context limitation
    # plot, from the same sentence, inputs, model and starting hidden state
    for make_plot in (plot_hidden_state_evolution, plot_context_limitation):
        make_plot(sentence, input_tensor, model, initial_hidden)