# S07 - Word Embeddings & Neural Networks
## Exercises

### Exercise 1 (Easy)
Load pre-trained Word2Vec embeddings and find similar words.

In [1]:
import gensim.downloader as api

# Load pre-trained word2vec (google-news-300 or glove-wiki-gigaword-100)
# Find top 5 most similar words to 'king'

# Fetch the pre-trained 100-dimensional GloVe vectors through the gensim
# downloader (the first call downloads the data, later calls reuse the cache).
pretrained_name = 'glove-wiki-gigaword-100'
model = api.load(pretrained_name)

# Ask for the five nearest neighbours of 'king' and show them as
# (word, similarity) pairs.
similar_words = model.most_similar('king', topn=5)
print(similar_words)

[('prince', 0.7682328820228577), ('queen', 0.7507690787315369), ('son', 0.7020888328552246), ('brother', 0.6985775232315063), ('monarch', 0.6977890729904175)]


### Exercise 2 (Easy)
Perform word analogy: king - man + woman = ?

In [2]:
# Use the model to solve: king - man + woman = ?
# Also try: paris - france + spain = ?

# Each analogy "a - b + c" is expressed as positive=[a, c], negative=[b];
# topn=1 keeps only the single best candidate.
result = model.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=1,
)
result2 = model.most_similar(
    positive=['paris', 'spain'],
    negative=['france'],
    topn=1,
)

print("king - man + woman =", result)
print("paris - france + spain =", result2)

king - man + woman = [('queen', 0.7698540687561035)]
paris - france + spain = [('madrid', 0.8061118125915527)]


### Exercise 3 (Medium)
Train your own Word2Vec model on a custom corpus.

In [3]:
from gensim.models import Word2Vec

# Tiny tokenised corpus: one list of tokens per sentence.
corpus = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "ran", "in", "the", "park"],
    ["cats", "and", "dogs", "are", "pets"],
    ["the", "cat", "chased", "the", "dog"],
    ["pets", "need", "food", "and", "water"]
]

# Train Word2Vec model (vector_size=50, window=3, min_count=1).
# The trained model gets its own name so it does not overwrite the pre-trained
# `model` loaded in Exercise 1 (re-running Exercises 1-2 afterwards keeps working).
# seed + workers=1 make the training run repeatable; per the gensim docs, exact
# reproducibility across interpreter launches may additionally need PYTHONHASHSEED.
w2v_model = Word2Vec(
    sentences=corpus,
    vector_size=50,
    window=3,
    min_count=1,
    seed=42,
    workers=1,
)

# Look up the learned vectors for 'cat' and 'dog' in the trained KeyedVectors.
cat_vector = w2v_model.wv['cat']
dog_vector = w2v_model.wv['dog']

print("Vector for 'cat':", cat_vector)
print("Vector for 'dog':", dog_vector)

Vector for 'cat': [ 0.0001904   0.00615311 -0.01362527 -0.00274993  0.01533661  0.01469298
 -0.00734422  0.0052875  -0.0166348   0.01241069 -0.00927603 -0.00632881
  0.01862341  0.00174694  0.01498011 -0.01214779  0.0103222   0.0198463
 -0.01691667 -0.01027177 -0.01413098 -0.00972589 -0.00755462 -0.01707052
  0.01591139 -0.00969053  0.01684555  0.01052465 -0.013102    0.00791471
  0.01093842 -0.01485397 -0.01480907 -0.00495215 -0.01725133 -0.00316367
 -0.00080567  0.0065959   0.0028836  -0.00176379 -0.01119006  0.00346125
 -0.00179504  0.01358557  0.00794846  0.00906098  0.00286886 -0.00539834
 -0.0087339  -0.00206309]
Vector for 'dog': [ 1.56351421e-02 -1.90203730e-02 -4.11062239e-04  6.93839323e-03
 -1.87794445e-03  1.67635437e-02  1.80215668e-02  1.30730132e-02
 -1.42324204e-03  1.54208085e-02 -1.70686692e-02  6.41421322e-03
 -9.27599426e-03 -1.01779103e-02  7.17923651e-03  1.07406788e-02
  1.55390287e-02 -1.15330126e-02  1.48667218e-02  1.32509926e-02
 -7.41960062e-03 -1.74912829e-

### Exercise 4 (Medium)
Build a simple neural network for text classification using embeddings.

In [None]:
import torch
import torch.nn as nn

class TextClassifier(nn.Module):
    """Bag-of-embeddings text classifier.

    Token ids are embedded, the embeddings are averaged along dimension 1,
    and the averaged vector is passed through a single linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        num_classes: number of output scores produced per example.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        # Lookup table from token id to embedding vector.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Projects the pooled embedding onto the class scores.
        self.classifier = nn.Linear(in_features=embed_dim, out_features=num_classes)

    def forward(self, x):
        """Return class scores for the integer token-id tensor `x`.

        Dimension 1 of `x` is the axis that gets averaged away (mean pooling).
        """
        token_vectors = self.embedding(x)
        pooled_vector = torch.mean(token_vectors, dim=1)
        return self.classifier(pooled_vector)

# Smoke test: push one random batch through the classifier and check the shape.
vocab_size, embed_dim, num_classes = 1000, 128, 2

model = TextClassifier(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_classes=num_classes,
)

# Shape (1, 10): batch size 1, sequence length 10; ids drawn from [0, vocab_size).
dummy_input = torch.randint(low=0, high=vocab_size, size=(1, 10))
output = model(dummy_input)
print("Output shape:", output.shape)

Output shape: torch.Size([1, 2])


### Exercise 5 (Hard)
Implement the Skip-gram model from scratch (forward pass only).

*Research: Skip-gram predicts the surrounding context words given the center word.*

In [None]:
import torch
import torch.nn as nn

class SkipGram(nn.Module):
    """Skip-gram scorer (forward pass only).

    Keeps two separate embedding tables, one used when a word is the center
    word and one used when it is a context word. The score of a
    (center, context) pair is the dot product of the two vectors.

    Args:
        vocab_size: number of words in the vocabulary (rows per table).
        embed_dim: dimensionality of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Two embedding matrices: center and context
        self.center_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center, context):
        """Return dot-product scores between center and context words.

        Args:
            center: integer tensor of center-word ids, e.g. shape (batch,).
            context: integer tensor of context-word ids, either the same shape
                as `center` (one context word per center word) or with one
                extra trailing dimension, e.g. (batch, k), for k context or
                negative-sample words per center word.

        Returns:
            Float tensor of raw (unnormalised) scores with the same shape as
            `context`.
        """
        center_vecs = self.center_embeddings(center)
        context_vecs = self.context_embeddings(context)
        # When several context words are given per center word, add an axis so
        # the single center vector broadcasts against all of its context vectors.
        if context_vecs.dim() == center_vecs.dim() + 1:
            center_vecs = center_vecs.unsqueeze(-2)
        # Element-wise product summed over the embedding axis == dot product.
        return (center_vecs * context_vecs).sum(dim=-1)
