In [None]:
# Example: Document embeddings with Doc2Vec
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# # Prepare tagged documents
# tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]

# # Train Doc2Vec model
# doc_model = Doc2Vec(
#     tagged_docs,
#     vector_size=100,
#     window=5,
#     min_count=5,
#     workers=4,
#     epochs=20
# )

# # Find similar documents
# doc_id = 0  # Document to query
# similar_docs = doc_model.dv.most_similar(doc_id, topn=5)
# print(f"Documents most similar to document {doc_id}:")
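# for similar_id, similarity in similar_docs:  # separate name so the query doc_id is not overwritten
#     print(f"  Document {similar_id}: {similarity:.3f}")
#     # Note: tags index `corpus`; if empty documents were dropped from `corpus`,
#     # positions no longer line up with `df` rows, so re-align before using df.iloc.
#     print(f"    Preview: {df.iloc[similar_id]['text_column'][:100]}...")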

# # Infer vector for new document
# new_doc = "Your new document text here".split()
# new_vector = doc_model.infer_vector(new_doc)
# similar_to_new = doc_model.dv.most_similar([new_vector], topn=3)

### 4. Document Embeddings with Doc2Vec

**Challenge**: Move beyond word embeddings to represent entire documents as vectors.

**Tasks:**
- Implement Doc2Vec on your corpus
- Compare document similarities
- Build a document recommendation system
- Cluster documents based on their embeddings
- Create a semantic search engine for your documents

**Applications:**
- Find the most similar documents to a query document
- Classify documents based on their embeddings
- Track how document themes evolve over time

In [None]:
# Example: Working with multilingual embeddings
# # If you have text in multiple languages
# from langdetect import detect

# # Detect language of documents
# df['language'] = df['text_column'].apply(lambda x: detect(x) if len(x) > 20 else 'unknown')

# # Separate by language
# english_docs = df[df['language'] == 'en']['processed_text'].tolist()
# spanish_docs = df[df['language'] == 'es']['processed_text'].tolist()

# # Train separate models
# model_en = Word2Vec(english_docs, vector_size=100, window=5, min_count=5)
# model_es = Word2Vec(spanish_docs, vector_size=100, window=5, min_count=5)

# # Compare similar words across languages
# test_word_en = 'computer'
# if test_word_en in model_en.wv:
#     similar_en = model_en.wv.most_similar(test_word_en, topn=5)
#     print(f"English - similar to '{test_word_en}':")
#     for word, score in similar_en:
#         print(f"  {word}: {score:.3f}")

### 3. Cross-lingual Word Embeddings

**Challenge**: If you have multilingual data, explore cross-lingual embeddings.

**Tasks:**
- Train separate embeddings for different languages in your data
- Use aligned embeddings to find translations
- Compare semantic spaces across languages
- Identify language-specific concepts
- Build a simple cross-lingual similarity finder

**Visualization ideas:**
- Plot the same concepts in different language spaces
- Show how well concepts align across languages
- Identify culture-specific terms with no direct translation

In [None]:
# Reduce topic overlap if needed
# # If topics are too granular, merge similar ones
# target_num_topics = 10  # Desired number of topics
# topic_model.reduce_topics(documents, nr_topics=target_num_topics)

# # Re-visualize after reduction
# fig_reduced = topic_model.visualize_topics()
# fig_reduced.show()

# # Extract all documents from a specific topic
# topic_of_interest = 2
# topic_docs_indices = [i for i, t in enumerate(topics) if t == topic_of_interest]
# df_topic_subset = df.iloc[topic_docs_indices].copy()
# print(f"Found {len(df_topic_subset)} documents in Topic {topic_of_interest}")

In [None]:
# Visualize topics with BERTopic
# # Create interactive visualizations
# fig_topics = topic_model.visualize_topics()
# fig_topics.show()

# # Visualize topic hierarchy
# fig_hierarchy = topic_model.visualize_hierarchy()
# fig_hierarchy.show()

# # Find representative documents for a topic
# topic_num = 0  # Choose your topic
# representative_docs = topic_model.get_representative_docs(topic_num)
# print(f"\nRepresentative documents for Topic {topic_num}:")
# for i, doc in enumerate(representative_docs[:3], 1):
#     print(f"\n{i}. {doc[:200]}...")  # First 200 chars

In [None]:
# Example: Topic modeling with BERTopic
# Note: This may require installation: pip install bertopic

# from bertopic import BERTopic

# # Prepare your documents (use your cleaned text)
# documents = df['text_column'].tolist()  # Replace with your text column

# # Create and fit BERTopic model
# topic_model = BERTopic(verbose=True)
# topics, probabilities = topic_model.fit_transform(documents)

# # Get topic information
# topic_info = topic_model.get_topic_info()
# print(f"Number of topics found: {len(topic_info) - 1}")  # -1 for outlier topic
# print("\nTop 5 topics by frequency:")
# print(topic_info.head(6))  # Including outlier topic (-1)

# # Get top words for a specific topic
# topic_words = topic_model.get_topic(0)  # Topic 0
# print(f"\nTop words for Topic 0:")
# for word, score in topic_words[:10]:
#     print(f"  {word}: {score:.4f}")

### 2. Topic Modeling with BERTopic

**Challenge**: Use modern neural topic modeling to discover and analyze themes in your corpus.

**Tasks:**
- Install and configure BERTopic for your dataset
- Extract topics using transformer-based embeddings
- Compare topics to your word embedding clusters
- Create interactive visualizations of topic distributions
- Find representative documents for each topic

**Analysis ideas:**
- How do BERTopic themes compare to word embedding clusters?
- Which topics are most prevalent in your dataset?
- Can you track topic evolution over time (if temporal data)?
- How do different documents get classified into topics?

In [None]:
# Example: Compare Skip-gram vs CBOW
# from gensim.models import Word2Vec

# # Skip-gram model (sg=1)
# model_skipgram = Word2Vec(
#     sentences=corpus,
#     vector_size=100,
#     window=5,
#     min_count=5,
#     sg=1,  # Skip-gram
#     epochs=10,
#     seed=42
# )

# # CBOW model (sg=0)
# model_cbow = Word2Vec(
#     sentences=corpus,
#     vector_size=100,
#     window=5,
#     min_count=5,
#     sg=0,  # CBOW
#     epochs=10,
#     seed=42
# )

# # Compare performance on analogies or similarity tasks
# test_word = 'your_test_word'
# if test_word in model_skipgram.wv and test_word in model_cbow.wv:
#     print(f"Skip-gram similar to '{test_word}':")
#     for word, score in model_skipgram.wv.most_similar(test_word, topn=5):
#         print(f"  {word}: {score:.3f}")
#     
#     print(f"\nCBOW similar to '{test_word}':")
#     for word, score in model_cbow.wv.most_similar(test_word, topn=5):
#         print(f"  {word}: {score:.3f}")

### 1. Advanced Word2Vec/FastText Parameters

Experiment with different model architectures and training parameters:

**Tasks:**
- Compare Skip-gram vs CBOW architectures
- Test different vector dimensions (50, 100, 200, 300)
- Vary the context window size
- Use hierarchical softmax vs negative sampling
- Compare Word2Vec with FastText on out-of-vocabulary words

**Questions to explore:**
- How does model performance change with vector dimensionality?
- Which architecture works better for your specific dataset?
- How do rare words perform with different parameter settings?

# Word Embeddings Lab: Apply to Your Own Dataset

---

<div class="alert alert-info">
    
### Lab Objectives

* Train word embeddings on your chosen dataset
* Explore semantic relationships specific to your domain
* Analyze biases and patterns in your text corpus
* Practice interpreting embedding results in context
</div>

### Getting Started

Choose a text dataset that interests you. This could be:
- News articles from a specific time period or topic
- Social media posts (Twitter, Reddit, etc.)
- Academic papers or book reviews
- Product reviews or customer feedback
- Historical documents or literature

**Requirements**: Your dataset should have at least 1,000 documents with meaningful text content.

### Sections
1. [Data Loading and Exploration](#section1)
2. [Text Preprocessing](#section2)
3. [Training Word Embeddings](#section3)
4. [Exploring Semantic Relationships](#section4)
5. [Word Analogies in Your Domain](#section5)
6. [Bias Analysis](#section6)
7. [Visualization and Interpretation](#section7)

## Setup and Imports

Import the necessary libraries for text processing and embedding analysis.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import gensim.downloader as api
from gensim.models import FastText, Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import re
import string
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

<a id='section1'></a>
## 1. Data Loading and Exploration

Load your dataset and explore its basic properties. Understanding your data is crucial for effective preprocessing and analysis.

### Load Your Dataset

Load your chosen dataset. Identify which column(s) contain the text you want to analyze.

In [None]:
# Load your dataset here
# df = pd.read_csv('your_data.csv')
# or df = pd.read_json('your_data.json')
# or however you load your data

# Display basic information about your dataset


### Explore Text Properties

Examine the characteristics of your text data: length distribution, common words, etc.

In [None]:
# Explore your text column
# Replace 'text_column' with the actual name of your text column

# Calculate text lengths
# df['text_length'] = df['text_column'].str.len()

# Display statistics
# print(f"Average text length: {df['text_length'].mean():.0f} characters")
# print(f"Median text length: {df['text_length'].median():.0f} characters")

# Show sample texts
# print("\nSample texts:")
# for i in range(3):
#     print(f"{i+1}. {df['text_column'].iloc[i][:200]}...")


### Visualize Data Distribution

Create visualizations to understand your data better.

In [None]:
# Create visualizations relevant to your dataset
# Examples:
# - Text length histogram
# - Distribution of categories/labels (if applicable)
# - Time series of posts (if you have dates)



<a id='section2'></a>
## 2. Text Preprocessing

Clean and tokenize your text data. The preprocessing steps will depend on your specific dataset.

### Define Preprocessing Function

Create a function to clean and tokenize your text. Consider what domain-specific terms or patterns you want to preserve.

In [None]:
def preprocess_text(text):
    """Clean and tokenize text for embedding training.

    Args:
        text: Raw document text. Missing values (``None``, ``NaN``,
            ``pd.NA``) produce an empty token list. Any other non-string
            scalar (for example a number read from a mixed-type column)
            is converted with ``str()`` before tokenizing.

    Returns:
        A list of lowercase tokens made of the ASCII letters a-z. A token
        may carry one apostrophe-joined suffix (``"don't"``); digits,
        punctuation and non-ASCII letters act as separators.
    """
    if not isinstance(text, str):
        # Missing values carry no text at all.
        if pd.isna(text):
            return []
        # Stray non-string scalars would otherwise fail on .lower() below.
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Add domain-specific preprocessing here
    # Examples:
    # - Remove URLs: text = re.sub(r'http\S+|www\S+', '', text)
    # - Remove mentions: text = re.sub(r'@\w+', '', text)
    # - Handle special characters specific to your domain

    # Basic tokenization: the first alternative keeps contractions together;
    # the second picks up letter runs glued to digits/underscores, where no
    # word boundary exists.
    tokens = re.findall(r"\b[a-z]+(?:'[a-z]+)?\b|[a-z]+", text)

    # Filter tokens based on your needs
    # - Remove very short words: tokens = [t for t in tokens if len(t) > 2]
    # - Keep domain-specific short terms

    return tokens

# Test your preprocessing function
# sample_text = "Your sample text here"
# print(f"Original: {sample_text}")
# print(f"Processed: {preprocess_text(sample_text)}")

### Process Your Corpus

Apply preprocessing to your entire dataset and prepare it for training.

In [None]:
# Apply preprocessing to your entire dataset
# corpus = df['text_column'].apply(preprocess_text).tolist()

# Remove empty documents
# corpus = [doc for doc in corpus if len(doc) > 0]

# Display corpus statistics
# print(f"Total documents: {len(corpus)}")
# print(f"Average tokens per document: {np.mean([len(doc) for doc in corpus]):.0f}")
# print(f"\nSample processed document:")
# print(corpus[0][:20])  # First 20 tokens


<a id='section3'></a>
## 3. Training Word Embeddings

Train a word embeddings model on your preprocessed corpus. Choose between Word2Vec and FastText based on your data characteristics.

### Choose Your Model

**Word2Vec** vs **FastText**:
- Use **FastText** if your text has many misspellings, abbreviations, or domain-specific terms
- Use **Word2Vec** for cleaner, more formal text

Adjust the parameters based on your corpus size and characteristics.

In [None]:
# Train your embedding model
# NOTE: the status message below prints as soon as this cell runs, even while
# both options are still commented out. Uncomment exactly one option so that
# `model` is actually created before running the later cells.
print("Training word embedding model...")

# Choose one of these approaches:

# Option 1: FastText (good for noisy text)
# model = FastText(
#     sentences=corpus,
#     vector_size=100,      # Dimensionality of embeddings
#     window=5,             # Context window size
#     min_count=5,          # Ignore words appearing fewer than 5 times
#     min_n=3,              # Min character n-gram
#     max_n=6,              # Max character n-gram
#     sg=1,                 # Use Skip-gram
#     epochs=10,            # Training iterations
#     seed=42
# )

# Option 2: Word2Vec (good for clean text)
# model = Word2Vec(
#     sentences=corpus,
#     vector_size=100,      # Dimensionality of embeddings
#     window=5,             # Context window size
#     min_count=5,          # Ignore words appearing fewer than 5 times
#     sg=1,                 # Use Skip-gram
#     epochs=10,            # Training iterations
#     seed=42
# )

# Print model statistics
# print(f"Model trained! Vocabulary size: {len(model.wv)}")
# print(f"Vector dimensions: {model.wv.vector_size}")


### Test Your Model

Verify that your model learned meaningful representations by testing some domain-relevant words.

In [None]:
# Test your model with relevant terms from your domain
# test_words = ['word1', 'word2', 'word3']  # Replace with your domain terms

# Check which words are in vocabulary
# print("Words in vocabulary:")
# for word in test_words:
#     if word in model.wv:
#         print(f"✓ '{word}' - vector shape: {model.wv[word].shape}")
#     else:
#         print(f"✗ '{word}' - not in vocabulary")


<a id='section4'></a>
## 4. Exploring Semantic Relationships

Investigate how your model captures relationships between domain-specific terms.

### Find Similar Words

Explore what words your model considers similar to key terms in your domain.

In [None]:
# Explore similar words for key terms in your domain
# key_terms = ['term1', 'term2', 'term3']  # Replace with your terms

# for term in key_terms:
#     if term in model.wv:
#         print(f"\nMost similar to '{term}':")
#         for similar_word, score in model.wv.most_similar(term, topn=5):
#             print(f"  {similar_word}: {score:.3f}")


### Calculate Semantic Similarities

Compute similarities between related concepts in your domain.

In [None]:
# Calculate similarities between domain-specific term pairs
# term_pairs = [('word1', 'word2'), ('word3', 'word4')]  # Your term pairs

# print("Semantic similarities:")
# for term1, term2 in term_pairs:
#     if term1 in model.wv and term2 in model.wv:
#         similarity = model.wv.similarity(term1, term2)
#         print(f"{term1} ↔ {term2}: {similarity:.3f}")


### Create Similarity Heatmap

Visualize relationships between multiple terms using a heatmap.

In [None]:
# Create a similarity heatmap for important terms in your domain
# important_terms = ['term1', 'term2', 'term3', 'term4']  # Your terms

# Filter to terms that exist in vocabulary
# available_terms = [t for t in important_terms if t in model.wv]

# Calculate similarity matrix
# n_terms = len(available_terms)
# similarity_matrix = np.zeros((n_terms, n_terms))

# for i, term1 in enumerate(available_terms):
#     for j, term2 in enumerate(available_terms):
#         similarity_matrix[i, j] = model.wv.similarity(term1, term2)

# Plot heatmap
# plt.figure(figsize=(8, 6))
# sns.heatmap(similarity_matrix, 
#             xticklabels=available_terms,
#             yticklabels=available_terms,
#             cmap='YlOrRd',
#             annot=True,
#             fmt='.2f',
#             cbar_kws={'label': 'Cosine Similarity'})
# plt.title('Similarity Matrix of Key Terms')
# plt.tight_layout()
# plt.show()


<a id='section5'></a>
## 5. Word Analogies in Your Domain

Test word analogies that are relevant to your specific domain.

### Create Domain-Specific Analogies

Think about relationships that might exist in your domain. What patterns would you expect?

In [None]:
def test_analogy(model, positive, negative, description):
    """Print the three nearest words for an analogy query.

    Args:
        model: Object whose ``wv.most_similar`` accepts ``positive``,
            ``negative`` and ``topn`` keyword arguments.
        positive: Words added to the query.
        negative: Words subtracted from the query.
        description: Label printed above the results.

    Prints a fallback message instead of raising when the lookup raises
    ``KeyError``. Returns nothing.
    """
    try:
        vectors = model.wv
        nearest = vectors.most_similar(
            positive=positive,
            negative=negative,
            topn=3,
        )
        # Header first, then one indented line per candidate.
        print(f"\n{description}:")
        for candidate, closeness in nearest:
            print(f"  {candidate}: {closeness:.3f}")
    except KeyError:
        print(f"\n{description}: Some words not in vocabulary")

# Test domain-specific analogies
# Examples to adapt to your domain:
# test_analogy(model, ['word1', 'word2'], ['word3'], 'word1 - word3 + word2')
# test_analogy(model, ['word4', 'word5'], ['word6'], 'word4 - word6 + word5')


### Explore Conceptual Dimensions

Create vectors that represent conceptual differences and test them on other terms.

In [None]:
# Create conceptual vectors by subtracting related terms
# For example: concept_vector = model.wv['positive_term'] - model.wv['negative_term']

# Test how other words align with this conceptual dimension
# test_terms = ['term1', 'term2', 'term3']  # Terms to test

# for term in test_terms:
#     if term in model.wv:
#         similarity = np.dot(concept_vector, model.wv[term]) / (
#             np.linalg.norm(concept_vector) * np.linalg.norm(model.wv[term]))
#         print(f"{term}: {similarity:.3f}")


<a id='section6'></a>
## 6. Bias Analysis

Investigate potential biases in your embeddings using semantic axes.

### Define Semantic Axes

Create semantic axes relevant to your domain. Think about what opposing concepts might exist in your data.

In [None]:
def create_semantic_axis(positive_words, negative_words, model):
    """Build an axis vector pointing from the negative pole to the positive pole.

    Args:
        positive_words: Words defining the positive pole.
        negative_words: Words defining the negative pole.
        model: Object whose ``wv`` supports ``in`` and ``[]`` lookups.

    Returns:
        The mean vector of the positive words minus the mean vector of the
        negative words, using only words found in ``model.wv``. ``None`` when
        either pole has no word in the vocabulary.
    """
    def _known_vectors(words):
        # Silently skip words the model has no entry for.
        return [model.wv[token] for token in words if token in model.wv]

    positive_pole = _known_vectors(positive_words)
    negative_pole = _known_vectors(negative_words)

    # Both poles need at least one vector for the difference to be defined.
    if not (positive_pole and negative_pole):
        return None

    return np.mean(positive_pole, axis=0) - np.mean(negative_pole, axis=0)

def project_on_axis(word, model, axis):
    """Return the cosine similarity between a word's vector and a semantic axis.

    Args:
        word: Token to look up in ``model.wv``.
        model: Object whose ``wv`` supports ``in`` and ``[]`` lookups.
        axis: Axis vector (for example the result of
            ``create_semantic_axis``), or ``None``.

    Returns:
        The cosine of the angle between the word vector and ``axis``, or
        ``None`` when the word is not in ``model.wv``, ``axis`` is ``None``,
        or either vector has zero length (cosine similarity is undefined
        there and would otherwise come back as NaN).
    """
    if word not in model.wv or axis is None:
        return None
    word_vector = model.wv[word]
    scale = np.linalg.norm(word_vector) * np.linalg.norm(axis)
    # A zero-length vector (e.g. an axis built from identical pole sets)
    # would give 0/0; report "no projection" like the other failure cases.
    if scale == 0:
        return None
    return np.dot(word_vector, axis) / scale

# Define semantic axes relevant to your domain
# Example: positive vs negative sentiment
# positive_terms = ['good', 'great', 'excellent', 'amazing']
# negative_terms = ['bad', 'terrible', 'awful', 'horrible']
# sentiment_axis = create_semantic_axis(positive_terms, negative_terms, model)

# Define other axes relevant to your domain
# formal_terms = ['formal', 'professional', 'official']
# informal_terms = ['casual', 'informal', 'relaxed']
# formality_axis = create_semantic_axis(formal_terms, informal_terms, model)


### Test Words Against Your Axes

Project domain-relevant terms onto your semantic axes to reveal patterns.

In [None]:
# Test how different terms project onto your semantic axes
# test_words = ['word1', 'word2', 'word3', 'word4']  # Domain-relevant terms

# print("Projections onto semantic axis:")
# print("(Positive values = closer to positive pole, Negative = closer to negative pole)")
# print("-" * 60)

# projections = {}
# for word in test_words:
#     projection = project_on_axis(word, model, sentiment_axis)  # Use your axis
#     if projection is not None:
#         projections[word] = projection
#         print(f"{word:15} {projection:+.3f}")


### Visualize Bias Patterns

Create visualizations to show how terms cluster along your semantic dimensions.

In [None]:
# Create a bar plot showing projections
# if projections:
#     plt.figure(figsize=(10, 6))
#     words = list(projections.keys())
#     values = list(projections.values())
#     colors = ['green' if v > 0 else 'red' for v in values]
    
#     plt.barh(words, values, color=colors)
#     plt.xlabel('Semantic Axis Projection')
#     plt.title('Word Projections on Semantic Axis')
#     plt.axvline(x=0, color='gray', linestyle='--', alpha=0.5)
#     plt.tight_layout()
#     plt.show()


<a id='section7'></a>
## 7. Visualization and Interpretation

Create comprehensive visualizations to understand your embedding space.

### 2D Embedding Visualization

Use PCA or t-SNE to visualize your embeddings in 2D space.

In [None]:
# Select important terms from your domain for visualization
# visualization_terms = ['term1', 'term2', 'term3']  # Your important terms

# Get vectors for available terms
# available_viz_terms = [t for t in visualization_terms if t in model.wv]
# if available_viz_terms:
#     term_vectors = np.array([model.wv[term] for term in available_viz_terms])

#     # Reduce to 2D using PCA
#     pca = PCA(n_components=2, random_state=42)
#     coords_2d = pca.fit_transform(term_vectors)

#     # Create visualization
#     plt.figure(figsize=(12, 8))
#     plt.scatter(coords_2d[:, 0], coords_2d[:, 1], s=100, alpha=0.6)

#     # Add labels
#     for i, term in enumerate(available_viz_terms):
#         plt.annotate(term, (coords_2d[i, 0], coords_2d[i, 1]), 
#                     xytext=(5, 5), textcoords='offset points', fontsize=10)

#     plt.title('2D Visualization of Domain Terms')
#     plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
#     plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
#     plt.grid(True, alpha=0.3)
#     plt.tight_layout()
#     plt.show()


### Multi-Dimensional Analysis

If you created multiple semantic axes, create a 2D plot showing how terms position along two dimensions.

In [None]:
# Create a 2D plot with two semantic axes
# analysis_terms = ['term1', 'term2', 'term3']  # Terms to analyze

# Calculate projections on both axes (if you have two)
# axis1_projections = []
# axis2_projections = []
# plot_terms = []

# for term in analysis_terms:
#     proj1 = project_on_axis(term, model, your_first_axis)
#     proj2 = project_on_axis(term, model, your_second_axis)
    
#     if proj1 is not None and proj2 is not None:
#         axis1_projections.append(proj1)
#         axis2_projections.append(proj2)
#         plot_terms.append(term)

# if plot_terms:
#     plt.figure(figsize=(10, 8))
#     plt.scatter(axis1_projections, axis2_projections, s=100, alpha=0.6)
    
#     for i, term in enumerate(plot_terms):
#         plt.annotate(term, (axis1_projections[i], axis2_projections[i]),
#                     xytext=(5, 5), textcoords='offset points', fontsize=10)
    
#     plt.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
#     plt.axvline(x=0, color='gray', linestyle='--', alpha=0.5)
#     plt.xlabel('First Semantic Dimension')
#     plt.ylabel('Second Semantic Dimension')
#     plt.title('Terms in 2D Semantic Space')
#     plt.grid(True, alpha=0.3)
#     plt.tight_layout()
#     plt.show()


### Neighborhood Exploration

Visualize neighborhoods around key terms to understand local semantic structures.

In [None]:
# Select anchor words and their neighborhoods
# anchors = ['anchor1', 'anchor2', 'anchor3']  # Your key terms
# points = {}

# for anchor in anchors:
#     if anchor in model.wv:
#         points[anchor] = model.wv[anchor]
#         # Add similar words to the anchor
#         for neighbor, _ in model.wv.most_similar(anchor, topn=5):
#             if neighbor not in points:
#                 points[neighbor] = model.wv[neighbor]

# if points:
#     words = list(points.keys())
#     X = np.vstack([points[w] for w in words])

#     # Use t-SNE for neighborhood visualization
#     perp = min(30, max(5, len(words)//5))
#     tsne = TSNE(n_components=2, perplexity=perp, random_state=42)
#     X2 = tsne.fit_transform(X)

#     plt.figure(figsize=(12, 8))
#     plt.scatter(X2[:, 0], X2[:, 1], alpha=0.7)
#     for i, word in enumerate(words):
#         plt.annotate(word, (X2[i, 0], X2[i, 1]), 
#                     xytext=(3, 3), textcoords='offset points', fontsize=9)
#     plt.title('Semantic Neighborhoods in Your Domain')
#     plt.show()


## Group Discussion and Reflection

Work with your group to discuss these questions. Be prepared to share your insights with the class:

1. **Semantic Patterns**: What were the most interesting patterns you discovered in how words cluster in semantic space?

2. **Bias and Assumptions**: What biases did you notice in the word embeddings? How might these affect analysis of social discourse?

3. **Topic Differences**: How do different topics in your data differ in their semantic positioning?

4. **Methodological Insights**: What are the strengths and limitations of using word embeddings for analyzing your genre of discourse?

### Discussion Notes

Use this space to document your group's findings and insights:

**Most interesting semantic patterns:**
- [Write your observations here]

**Biases discovered:**
- [Document any biases you found]

**Topic differences:**
- [Note how different topics cluster or separate]

**Methodological insights:**
- Strengths: [What worked well?]
- Limitations: [What were the challenges or limitations?]
- Future directions: [What would you explore next?]

## 🚀 Stretch Goals

For students who complete the lab early and want to explore further: