In [14]:
sentences = [
"I don’t like the PhD",
"I don’t like my current job",
"I want to get a better job",
"I want to get married",
"I love to travel",
"I want to get a scholarship",
"Duolingo is stupid",
"Do you like your university ?",
]

In [15]:
import pandas as pd
df = pd.DataFrame(sentences, columns=["sentences"])

word_to_id = {}

def create_vocabulary(sentences):
    id= 0
    for sentence in sentences:
        words = sentence.lower().split()
        for word in words:
            if word not in word_to_id:
                word_to_id[word] = id
                id += 1 
    return word_to_id

vocabulary = create_vocabulary(df['sentences'])
print("Vocabulary:", vocabulary)

Vocabulary: {'i': 0, 'don’t': 1, 'like': 2, 'the': 3, 'phd': 4, 'my': 5, 'current': 6, 'job': 7, 'want': 8, 'to': 9, 'get': 10, 'a': 11, 'better': 12, 'married': 13, 'love': 14, 'travel': 15, 'scholarship': 16, 'duolingo': 17, 'is': 18, 'stupid': 19, 'do': 20, 'you': 21, 'your': 22, 'university': 23, '?': 24}


In [16]:
import numpy as np

target_words = ['i', 'like', 'job', 'want', 'travel', 'university']
n = len(target_words)

# Create a mapping from word -> matrix index (0 to 5)
word_to_idx = {word: i for i, word in enumerate(target_words)}

# 2. Initialize Matrix (6x6)
co_occurrence_matrix = np.zeros((n, n), dtype=int)

# 3. Build the Matrix
window_size = 1

for sentence in sentences:
    tokens = sentence.lower().split()
    
    for i, token in enumerate(tokens):
        # Only process if the current word is in our target list
        if token in target_words:
            current_idx = word_to_idx[token]
            
            # Check context window (left and right)
            # Range: from i - window to i + window
            start = max(0, i - window_size)
            end = min(len(tokens), i + window_size + 1)
            
            for j in range(start, end):
                if i == j: continue # Skip the word itself
                
                neighbor = tokens[j]
                
                # If the neighbor is ALSO in our target list, increment count
                if neighbor in target_words:
                    neighbor_idx = word_to_idx[neighbor]
                    co_occurrence_matrix[current_idx][neighbor_idx] += 1

# 4. Convert to DataFrame for readability
df_co_oc = pd.DataFrame(co_occurrence_matrix, index=target_words, columns=target_words)

print("--- Co-occurrence Matrix (Window=1) ---")
print(df_co_oc)

--- Co-occurrence Matrix (Window=1) ---
            i  like  job  want  travel  university
i           0     0    0     3       0           0
like        0     0    0     0       0           0
job         0     0    0     0       0           0
want        3     0    0     0       0           0
travel      0     0    0     0       0           0
university  0     0    0     0       0           0


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim

# ==========================================
# 1. Data Preparation
# ==========================================
raw_text = "i don’t like my current job"
tokens = raw_text.lower().split()


# Inverse map (ID -> Word) for printing results
id_to_word = {v: k for k, v in vocabulary.items()}

print(f"vocabularyulary: {vocabulary}")

# Step 2: Generate Training Data (Context -> Target)
WINDOW_SIZE = 2
data = []

for i in range(len(tokens)):
    target_word = tokens[i]
    target_id = vocabulary[target_word]
    
    # Get neighbors
    start = max(0, i - WINDOW_SIZE)
    end = min(len(tokens), i + WINDOW_SIZE + 1)
    
    context_words = [tokens[j] for j in range(start, end) if j != i]
    context_ids = [vocabulary[w] for w in context_words]
    
    # PADDING: Fill with 0s so all inputs are length 4 (2*window)
    max_len = WINDOW_SIZE * 2
    while len(context_ids) < max_len:
        context_ids.append(0) # Append <PAD> ID
        
    data.append((context_ids, target_id))

# ==========================================
# 2. Define the CBOW Model
# ==========================================
class CBOW(nn.Module):
    def __init__(self, vocabulary_size, embedding_dim):
        super(CBOW, self).__init__()
        
        # 1. Embedding Layer
        # padding_idx=0 tells PyTorch not to learn a vector for the <PAD> token
        self.embeddings = nn.Embedding(vocabulary_size, embedding_dim, padding_idx=0)
        
        # 2. Linear Layer (Hidden -> Output)
        self.linear = nn.Linear(embedding_dim, vocabulary_size)
        
    def forward(self, inputs):
        # Input shape: [batch_size, context_window_size] -> e.g., [1, 4]
        
        # Look up embeddings: [1, 4] -> [1, 4, embedding_dim]
        embeds = self.embeddings(inputs)
        
        # Aggregation: Take the MEAN of the context vectors
        # Shape becomes: [1, embedding_dim]
        hidden = torch.mean(embeds, dim=1)
        
        # Prediction: Project back to vocabularyulary size
        output = self.linear(hidden)
        return output

# ==========================================
# 3. Training Loop
# ==========================================
EMBED_DIM = 10
LEARNING_RATE = 0.05
EPOCHS = 500

model = CBOW(len(vocabulary), EMBED_DIM)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Convert data to tensors
context_data = torch.tensor([item[0] for item in data]) # Shape: [6, 4]
target_data = torch.tensor([item[1] for item in data])  # Shape: [6]

print("\nStarting Training...")
for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    log_probs = model(context_data)
    
    # Compute loss
    loss = loss_function(log_probs, target_data)
    
    # Backward pass
    loss.backward()
    optimizer.step()
    
    if epoch % 100 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item():.4f}")


vocabularyulary: {'i': 0, 'don’t': 1, 'like': 2, 'the': 3, 'phd': 4, 'my': 5, 'current': 6, 'job': 7, 'want': 8, 'to': 9, 'get': 10, 'a': 11, 'better': 12, 'married': 13, 'love': 14, 'travel': 15, 'scholarship': 16, 'duolingo': 17, 'is': 18, 'stupid': 19, 'do': 20, 'you': 21, 'your': 22, 'university': 23, '?': 24}

Starting Training...
Epoch 0: Loss = 3.0324
Epoch 100: Loss = 1.7065
Epoch 200: Loss = 1.1251
Epoch 300: Loss = 0.7673
Epoch 400: Loss = 0.5438


In [26]:
def interference(test_words):
    # Testing (Inference)
    print("\n--- Testing Specific Case ---")
    # Let's test the "job" example: Context = "my current"
    test_ids = [vocabulary[w] for w in test_words]

    # Pad to length 4
    while len(test_ids) < 4:
        test_ids.append(0)

    # Convert to tensor and predict
    input_tensor = torch.tensor([test_ids]) # Shape [1, 4]
    with torch.no_grad():
        output = model(input_tensor)
        predicted_id = torch.argmax(output).item()

    print(f"Context: {test_words}")
    print(f"Predicted ID: {predicted_id}")
    print(f"Predicted Word: '{id_to_word[predicted_id]}'")

test_words1 = ["i", "don’t","my","current"]
test_words2 = ["my", "current"]

for test_words in [test_words1, test_words2]:
    interference(test_words)


--- Testing Specific Case ---
Context: ['i', 'don’t', 'my', 'current']
Predicted ID: 2
Predicted Word: 'like'

--- Testing Specific Case ---
Context: ['my', 'current']
Predicted ID: 7
Predicted Word: 'job'


In [27]:
import pandas as pd

# 1. Setup Data
# IDs matching your Exercise 1 & 4
word_to_id = {
    "i": 0, "want": 8, "to": 11, "get": 12, "married": 13
}
id_to_word = {v: k for k, v in word_to_id.items()}

sentence = "i want to get married"
tokens = sentence.lower().split()
token_ids = [word_to_id[t] for t in tokens]
window_size = 2

# ==========================================
# Generate Skip-gram Samples
# ==========================================
skipgram_pairs = []

print(f"Sentence: {tokens}\n")
print("--- Skip-gram Generation ---")

for i, center_id in enumerate(token_ids):
    # Determine window range
    start = max(0, i - window_size)
    end = min(len(token_ids), i + window_size + 1)
    
    # Get context words (excluding self)
    context_ids = [token_ids[j] for j in range(start, end) if j != i]
    
    # Create pairs: (Center, Context)
    for ctx_id in context_ids:
        skipgram_pairs.append((center_id, ctx_id))
        
    # Print specific examples requested in the prompt
    center_word = id_to_word[center_id]
    if center_word in ["want", "married"]:
        print(f"Center: '{center_word}' ({center_id}) -> Contexts: {context_ids}")

# ==========================================
# Comparison Logic
# ==========================================
cbow_count = len(token_ids) # One sample per center word
skipgram_count = len(skipgram_pairs)

print("\n--- Step 3: Comparison ---")
print(f"Total CBOW Samples:      {cbow_count}")
print(f"Total Skip-gram Samples: {skipgram_count}")
print(f"Ratio: {skipgram_count / cbow_count:.1f}x more samples in Skip-gram")

Sentence: ['i', 'want', 'to', 'get', 'married']

--- Skip-gram Generation ---
Center: 'want' (8) -> Contexts: [0, 11, 12]
Center: 'married' (13) -> Contexts: [11, 12]

--- Step 3: Comparison ---
Total CBOW Samples:      5
Total Skip-gram Samples: 14
Ratio: 2.8x more samples in Skip-gram


In [36]:
import numpy as np

words_vecs = {
    "phd": np.array([0.9, 0.1]),
    "job": np.array([0.8, 0.2]),
    "better": np.array([0.1, 0.9]),
    "married": np.array([0.2, 0.8])
}

result = words_vecs["phd"] - words_vecs["job"] + words_vecs["better"]
print("Resulting Vector:", result)

def closest_word(vector, words_vecs):
    best_word = None
    best_dist = float("inf")

    for word, vec in words_vecs.items():
        dist = np.linalg.norm(vector - vec)
        if dist < best_dist:
            best_dist = dist
            best_word = word
    return best_word

print("Closest word:", closest_word(result, words_vecs))


Resulting Vector: [0.2 0.8]
Closest word: married


In [None]:
def ngrams(word, n=3):
    return [word[i:i+n] for i in range(len(word)-n+1)]