# INFO 159/259

# <center> Homework 1: Word Embeddings </center>
<center> Due: February 3, 2026 @ 11:59pm </center>

# HW1: Word Embeddings

In this homework, you will implement _word2vec_ with skip-grams and negative sampling, training on a small slice of Wikipedia data.

*Learning objectives*:
- Understand the implementation details of _word2vec_
- Gain familiarity with `numpy` for matrix math
- Gain familiarity with training a classifier using stochastic gradient descent.

You may want to consult SLP chapter 5 (_Embeddings_) as a reference for the implementation. This homework is designed to run on the CPU only, so if you are using Google Colab, you may want to ensure that your CPU is selected (under `Runtime > Change runtime type` in the top bar) so that you save your GPU allocation for later assignments in the semester.

In [6]:
import itertools
from collections import Counter

import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# nltk.download("punkt_tab")

## Data loading

We will begin by loading and tokenizing the data. The file contains a list of paragraphs from Wikipeda, separated by newlines. Because each document (a paragraph) is sampled independently, we want to maintain the document boundaries when we sample contexts later.

Inside `FileDataLoader`:
- `idx2vocab` is a list of unique word types
- `vocab2idx` is a dict mapping from a word type to its index in `idx2vocab`
- `word_freqs` is a dict mapping from a word type to its frequency in the corpus

You should implement:
1. The `negative_sample_weights()` function

   This function should calculate the weighted sample probabilities for each of the words in our vocabulary.
   Recall SLP3 eq. 5.19:
    $$
     P_{\alpha}(w) = \frac{\text{count}(w)^{\alpha}}{\sum_{w'}\text{count}(w')^{\alpha}}
    $$
   We calculate and store the sample weights to save time when generating contexts later.
3. The `negative_sample()` function

   This function should sample `num_samples` negative context words given a target word. Recall from SLP3 5.5.2
   > A noise word is a random word from the lexicon, **constrained not to be the target word $w$**. (_emph added_)

   So, when sampling, you will want to copy the original `.sample_weights` numpy array, set the probability of the target word to 0, and renormalize the weights before sampling.

   You may want to consult the numpy documentation for [`numpy.random.Generator.choice()`](https://numpy.org/doc/stable/reference/random/generated/numpy.random.Generator.choice.html#numpy.random.Generator.choice). We have instantiated a random generator for your convenience in `self.rng`.

_Learning objectives_:
> - Understand the implementation details of _word2vec_


In [8]:
corpus_path = "./en_wiki_sample.txt"

In [121]:
class FileDataLoader():
    def __init__(self, filepath, negative_sample_alpha=0.75, min_threshold=5):
        self.negative_sample_alpha = negative_sample_alpha
        self.min_threshold = min_threshold

        self.tokenized_documents = self.load_data(filepath)
        self.word_freqs = self.get_word_freqs(self.tokenized_documents)

        # replace words that appear fewer than min_threshold times with an [UNK] token
        for word, freq in list(self.word_freqs.items()):
            if freq < min_threshold:
                self.word_freqs["[UNK]"] += freq
                del self.word_freqs[word]

        self.idx2vocab = list(self.word_freqs.keys())
        self.vocab2idx = {word: index for index, word in enumerate(self.idx2vocab)}

        # set up a random number generator we can use for sampling
        self.rng = np.random.default_rng(159259)
        self.sample_weights = self.negative_sample_weights(alpha=negative_sample_alpha)

        ...

    def tokenize_and_lowercase(self, doc):
        """Tokenize a doc and lowercase all the words."""
        return [word.lower() for word in word_tokenize(doc)]

    def get_word_freqs(self, tokenized_documents):
        """Return a dictionary mapping each word to its frequency."""
        return Counter(itertools.chain.from_iterable(tokenized_documents))

    def load_data(self, filepath):
        return [self.tokenize_and_lowercase(doc) for doc in tqdm(open(corpus_path, "r").readlines())]

    def negative_sample_weights(self, alpha):
        """Calculate the weighted probabilities of each word.

        Return a (v,)-shaped numpy array, where v is the size of the vocabulary.
        """
        freq_arr = np.array(list(self.word_freqs.values()))
        num = freq_arr**alpha
        den = np.sum(num)
        wt_prob = num/den
        return wt_prob

    def negative_sample(self, target_word_idx, num_samples):
        """Sample num_samples noise words from the lexicon that is not the target word.
    
        The sample probabilities should be proportional to their weighted unigram probability if the target word probability is set to 0.

        Return a (num_samples,)-shaped numpy array of sampled indices.
        """
        freq_arr = np.array(self.sample_weights)
        freq_arr[target_word_idx] = 0
        freq_arr_norm = freq_arr/sum(freq_arr)
        negative_idx = np.random.choice(len(freq_arr_norm), size=num_samples, p=freq_arr_norm)
        return negative_idx

    def sample_contexts(self, window_size, sample_k):
        for doc in self.tokenized_documents:
            if len(doc) < (2 * window_size) + 1:
                # the doc is too short for our desired window size; we skip it
                continue
            for word_idx in range(window_size, len(doc) - window_size):
                target_word_idx = self.vocab2idx[doc[word_idx]] if doc[word_idx] in self.vocab2idx else self.vocab2idx["[UNK]"]
                # sample positive words from the window
                positive_word_idxs = np.array([
                    self.vocab2idx[word] if word in self.vocab2idx else self.vocab2idx["[UNK]"] for word in doc[word_idx - window_size:word_idx] + doc[word_idx + 1:word_idx + 1 + window_size]
                    
                ])
                # sample len(positive_word_idxs) * sample_k number of negative words
                negative_word_idxs = self.negative_sample(target_word_idx, sample_k * len(positive_word_idxs))
                yield (target_word_idx, positive_word_idxs, negative_word_idxs)


In [122]:
# this should take roughly 30 seconds
dataloader = FileDataLoader(corpus_path)

100%|█████████████████████████████████| 100000/100000 [00:17<00:00, 5765.93it/s]


**Quick check**: The unweighted probability for "the" should be 0.063; the weighted probability should be 0.016.

In [129]:
dataloader.word_freqs['the']

511984

In [144]:
print(f"Unweighted probability for `the`: \t\t{dataloader.word_freqs['the'] / sum(dataloader.word_freqs.values()):.3f}")
print(f"Weighted (alpha=0.75) probability for `the`: \t{dataloader.sample_weights[dataloader.vocab2idx['the']]:.3f}")

Unweighted probability for `the`: 		0.063
Weighted (alpha=0.75) probability for `the`: 	0.016


In [123]:
print(f"Unweighted probability for `the`: \t\t{dataloader.word_freqs['the'] / dataloader.word_freqs.total():.3f}")
print(f"Weighted (alpha=0.75) probability for `the`: \t{dataloader.sample_weights[dataloader.vocab2idx['the']]:.3f}")

AttributeError: 'Counter' object has no attribute 'total'

## Setting up the model

The word2vec model consists of two matrices: the target (or input) embedding and the context (or output) embedding. We set those up here.

You should implement:
- The `nearest_neighbors()` function

  This given a $d$-dimensional $\vec{v}$ and a $(v \times d)$-dimensional matrix $M$ of vectors to query against, we want to calculate the cosine similarity of $\vec{v}$ with each row of $M$ and return the indices (and the corresponding similarities) of the most similar rows in $M$.

  As a reminder, the cosine similarity of two vectors $\vec{a}$ and $\vec{b}$ is
  $$
    \text{cosine\_sim}(\vec{a}, \vec{b}) = \frac{\vec{a} \cdot \vec{b}}{\|{\vec{a}}\|\|\vec{b}\|}
  $$

  This is derived from one of the formulations for the dot product:
  $$
    \vec{a} \cdot \vec{b} = \|\vec{a}\| \|\vec{b}\| \cos({\theta})
  $$

  $\|\vec{a}\|$ denotes the $l_2$-norm of a vector, or its magnitude.

  You might want to consult the numpy documentation for [`numpy.matmul`](https://numpy.org/doc/2.1/reference/generated/numpy.matmul.html), [`numpy.argsort`](https://numpy.org/doc/2.1/reference/generated/numpy.argsort.html#numpy-argsort), and [`numpy.linalg.norm`](https://numpy.org/doc/2.1/reference/generated/numpy.linalg.norm.html)


_Learning objectives_:
> - Gain familiarity with `numpy` for matrix math


In [310]:
class Word2Vec():
    def __init__(self, dataloader, hidden_dim=100):
        self.dataloader = dataloader
        self.vocab_size = len(self.dataloader.idx2vocab)
        self.hidden_dim = hidden_dim

        np.random.seed(159259)
        # We initialize the model weights to be uniformly randomly distributed and centered around 0.
        self.target_embs = (np.random.random((self.vocab_size, hidden_dim)) - 0.5) / hidden_dim
        self.context_embs = (np.random.random((self.vocab_size, hidden_dim)) - 0.5) / hidden_dim

    def nearest_neighbors(self, query_vector, vectors, n=10):
        """Finds the `n` indices of the rows in `vectors` that have the highest cosine similarity to `query_vector`.

        query_vector: (d,)-shaped numpy array
        vectors: (v, d)-shaped numpy array
        n: int
        
        Return a tuple of (indices, similarities), where both are (n,)-shaped ndarrays.
        """
        query_vector_norm = (query_vector/np.sqrt((query_vector**2).sum()))[:, np.newaxis]
        vectors_norm = vectors/(np.sqrt((vectors**2).sum(axis=1))[:, np.newaxis])

        vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()

        idx = np.argsort(vector_sim)[-n:][::-1]
        sims = vector_sim[idx]
        
        return (idx, sims)

    def print_nearest_neighbors(self, word, n=5):
        """Prints the `n` nearest neighbors for a word using the context embeddings.

        word: str

        Return None
        """
        query_vector = self.context_embs[self.dataloader.vocab2idx[word]]
        closest_inds, similarities = self.nearest_neighbors(query_vector, self.context_embs, n)
        words = [self.dataloader.idx2vocab[ind] for ind in closest_inds]

        print(words)


In [311]:
w2v_model = Word2Vec(dataloader)

**Quick check**: you can check your function against this toy example. The output should be:

- `(array([4, 5, 0, 6, 3]), array([0.91347529, 0.87409283, 0.84518755, 0.83396453, 0.8111933 ]))`

In [319]:
def quick_check():
    np.random.seed(159259)
    query_vec = np.random.random(size=(5,))
    other_vecs = np.random.random(size=(10, 5))
    print(w2v_model.nearest_neighbors(query_vec, other_vecs, n=5))

quick_check()

(array([4, 5, 0, 6, 3]), array([0.91347529, 0.87409283, 0.84518755, 0.83396453, 0.8111933 ]))


**Quick check**: the nearest neighbors for "the" should be random at this point; if you did not edit the `__init__` function, the nearest neighbors should be:

- `['the', 'asian', 'habilitation', 'toward', 'capacity-building']`

In [320]:
w2v_model.print_nearest_neighbors("the")

['the', 'asian', 'habilitation', 'toward', 'capacity-building']


  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()
  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()
  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()


## Setting up the training loop

### Calculating gradients

To update the weights using gradient descent, we have to find the partial derivatives of the loss with respect to the parameters. You can find the loss function and its partial derivatives in SLP 5.5.2 (eqs. 5.22 - 5.24); we've also reproduced them for you below. While we give you the derivatives, it can be a good exercise to try to derive them yourself!

These rely on the sigmoid function, which we've implemented for you as an example.

You should implement:
- `loss_fn`
- `c_pos_grad`
- `c_neg_grad`
- `w_grad`

In each of these functions, you should expect:
- `w` to be a `d`-dimensional vector,
- `c_pos` to be a `(n_pos, d)`-dimensional matrix (where `n_pos` is the number of positive context examples)
- `c_neg` to be a `(n_neg, d)`-dimensional matrix (where `n_neg` is the number of negative context examples)

As a reminder, the sigmoid function is defined as
$$
\sigma(x) = \frac{1}{1 + e^{-x}}
$$

For filling out the rest of the functions, you may want to use [`np.log`](https://numpy.org/devdocs/reference/generated/numpy.log.html#numpy.log), [`np.sum`](https://numpy.org/devdocs/reference/generated/numpy.sum.html), [`np.newaxis`](https://numpy.org/devdocs/reference/constants.html#numpy.newaxis), [`np.matmul`](https://numpy.org/devdocs/reference/generated/numpy.matmul.html#numpy-matmul), and of course, the `sigmoid` function that we have implemented for you.

In [328]:
# we wrap these functions in the @njit decorator to speed up calculations
# using just-in-time compilation
# you don't have to worry about this
from numba import njit

@njit
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [441]:
@njit
def loss_fn(w, c_pos, c_neg):
    pos_prob = sigmoid(c_pos@w[:, np.newaxis])
    neg_prob = sigmoid(-c_neg@w[:, np.newaxis])
    loss = -(np.log(pos_prob).sum() + np.log(neg_prob).sum())
    return loss

In [442]:
@njit
def c_pos_grad(w, c_pos):
    grad = ((sigmoid(c_pos@w[:, np.newaxis])-1)@w[:, np.newaxis].T)
    return grad

In [443]:
@njit
def c_neg_grad(w, c_neg):
    grad = ((sigmoid(c_neg@w[:, np.newaxis]))@w[:, np.newaxis].T)
    return grad

In [449]:
@njit
def w_grad(w, c_pos, c_neg):
    pos_grad = (c_pos.T@(sigmoid(c_pos@w[:, np.newaxis])-1))
    neg_grad = (c_neg.T@(sigmoid(c_neg@w[:, np.newaxis])))
    grad = (pos_grad+neg_grad).ravel()
    return grad

**(Not so) Quick check**: We can check the correctness of the loss function and gradient calculations by numerically approximating the gradients using neighboring points and seeing if they match up. Recall from your calculus class:

$$
\frac{d}{dx} f(x) = \lim_{h \to 0} \frac{f(x + h) - f(x - h)}{2h}
$$

We implement this in the `approximate_gradient` function so that we can estimate the local gradient and see if the closed-form solution that you implemented in the functions above are accurate. However, we never numerically approximate the gradient during training because we have a closed-form solution that is both more accurate and more efficient to calculate.

> **Aside**: In this assignment, we have you manually calculate the loss and gradients. If you have taken other deep learning classes, you may have experience with libraries like Pytorch, which implement automatic differentiation so that you can just specify the loss function and not have to work out the gradients manually.
>
> These libraries _don't_ use numerical approximation for the gradients. Instead, they rely on the chain rule:
>
> $$
    \frac{d}{dx} f(g(x)) = f'(g(x)) g'(x)
  $$
> As long as all of the functions you apply to an input are differentiable, and the closed-form derivatives are known (which they often are, since most functions break down into basic differentiable operations like addition, multiplication, or exponentiation), the library can construct a graph to track all of the applications of the functions and calculate the partial derivatives using this graph.\
>
> You can read more about this in the [Pytorch autograd tutorial](https://docs.pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#computational-graph).

Your loss should be roughly 8.05; if it is not, all of the assertions in the `quick_check` will likely fail even if (especially if) your gradients are implemented correctly.

In [452]:
def quick_check():
    np.random.seed(159259)

    w = np.random.random((5,))
    c_pos = np.random.random((2, 5))
    c_neg = np.random.random((4, 5))

    eps = 1e-5

    def approximate_gradient(func, vec, eps=1e-5):
        est_grad = np.zeros(vec.shape)
        for ind, el in np.ndenumerate(vec):
            perturb = np.zeros(vec.shape)
            perturb[ind] = eps
            est_grad[ind] = (func(vec + perturb) - func(vec - perturb)) / (2 * eps)
        return est_grad

    print("loss:", loss_fn(w, c_pos, c_neg))

    assert np.allclose(w_grad(w, c_pos, c_neg), approximate_gradient(lambda x: loss_fn(x, c_pos, c_neg), w)), "c_pos_grad is not correct for loss_fn"
    assert np.allclose(c_pos_grad(w, c_pos), approximate_gradient(lambda x: loss_fn(w, x, c_neg), c_pos)), "c_pos_grad is not correct for loss_fn"
    assert np.allclose(c_neg_grad(w, c_neg), approximate_gradient(lambda x: loss_fn(w, c_pos, x), c_neg)), "c_neg_grad is not correct for loss_fn"

quick_check()

loss: 8.052986383619253


### Updating weights in the training loop

The training loop for SGD consists of sampling one instance of the data (in our case, a target word and its positive and negative contexts), and calculating the partial derivatives of the loss.

We then update the parameters using these partial derivatives, multiplying each gradient by the learning rate $\eta$. When we perform gradient descent, we subtract the gradients from the weights in order to shift the weights in a direction that decreases the loss (locally, at least). Here are the updates we make:
$$
c_{\text{pos}}^{t + 1} = c_{\text{pos}}^{t} - \eta \frac{\partial L}{\partial {c}_{\text{pos}}^t},
$$
$$
c_{\text{neg}}^{t + 1} = c_{\text{neg}}^{t} - \eta \frac{\partial L}{\partial {c}_{\text{neg}}^t}
,$$
$$
w^{t + 1} = w^{t} - \eta \frac{\partial L}{\partial w^t}
,$$
where $t + 1$ is the next timestep in the stochastic gradient descent loop.

**Note**: We print some diagnostic information, including the loss, to help you monitor the training. You should convince yourself that, though we calculate the loss and print it here to track our training, SGD doesn't actually require that we compute the loss as such; we really only need the gradients.

You implement:
- the section of the code where you calculate the gradients
- the section of the code where you use the gradients to update the embedding

You may want to read about [numpy indexing](https://numpy.org/doc/2.2/user/basics.indexing.html#), since the `.sample_contexts()` returns lists of indices; you might also want to look into [`np.subtract.at()`](https://numpy.org/doc/2.2/reference/generated/numpy.ufunc.at.html) (see the usage of `np.add.at()` in the starter code as another example).

With a learning rate of 0.01, you should see some nearest neighbors start to make sense after about the loss drops under 60 or so. This took around 60K steps and 1m21s on our solution code; we recommend running for at least 10 minutes.

_Learning objectives_:
> - Gain familiarity with training a classifier using stochastic gradient descent.


In [454]:
NUM_EPOCHS = 1
LEARNING_RATE = 0.01

def train(model, dataloader):

    num_target_updates = np.zeros((model.target_embs.shape[0],))
    num_context_updates = np.zeros((model.context_embs.shape[0],))

    def print_diagnostic(word):
        print(f"`{word}` was updated {int(num_target_updates[dataloader.vocab2idx[word]])} times in target and {int(num_context_updates[dataloader.vocab2idx[word]])} times in context")
        model.print_nearest_neighbors(word, 4)

    for i in range(NUM_EPOCHS):
        losses = []
        for i, (target, pos, neg) in enumerate(tqdm(dataloader.sample_contexts(window_size=2, sample_k=100))):

            if i % 10_000 == 0:
                # Print diagnostic info every 10_000 steps.
                print("avg loss:", sum(losses) / len(losses) if losses else "")
                losses = []
                print_diagnostic("he")
                print_diagnostic("original")
                print_diagnostic("january")

            # Get the vectors from the model
            w = model.target_embs[target]
            c_pos = model.context_embs[pos]
            c_neg = model.context_embs[neg]

            # Calculate and store the loss
            losses.append(loss_fn(w, c_pos, c_neg))

            # TODO: Calculate the gradients and implement the gradient update.
            w_new = w-LEARNING_RATE*w_grad(w, c_pos, c_neg)
            c_neg_new = c_neg-LEARNING_RATE*c_neg_grad(w, c_neg)
            c_pos_new = c_pos-LEARNING_RATE*c_pos_grad(w, c_pos)

            w = w_new
            c_neg = c_neg_new
            c_pos = c_pos_new
            
            # Tally up how many times each word has been seen, just for fun.
            np.add.at(num_target_updates, target, 1)
            np.add.at(num_context_updates, pos, 1)
            np.add.at(num_context_updates, neg, 1)

w2v_model = Word2Vec(dataloader)
train(w2v_model, dataloader)

  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()
  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()
  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()
96it [00:00, 489.45it/s]

avg loss: 
`he` was updated 0 times in target and 0 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 0 times in target and 0 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 0 times in target and 0 times in context
['january', 'dailey', 'neutron', 'apogee']


10067it [00:18, 527.34it/s]

avg loss: 280.03156064652
`he` was updated 62 times in target and 10611 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 0 times in target and 1099 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 5 times in target and 1686 times in context
['january', 'dailey', 'neutron', 'apogee']


20057it [00:36, 487.76it/s]

avg loss: 280.03154296061166
`he` was updated 115 times in target and 21116 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 3 times in target and 2208 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 9 times in target and 3225 times in context
['january', 'dailey', 'neutron', 'apogee']


30082it [00:55, 522.37it/s]

avg loss: 280.0315253753248
`he` was updated 167 times in target and 31796 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 4 times in target and 3275 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 18 times in target and 4856 times in context
['january', 'dailey', 'neutron', 'apogee']


40078it [01:14, 512.23it/s]

avg loss: 280.03156137022694
`he` was updated 240 times in target and 42343 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 7 times in target and 4240 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 25 times in target and 6522 times in context
['january', 'dailey', 'neutron', 'apogee']


50054it [01:33, 502.20it/s]

avg loss: 280.0315563759385
`he` was updated 295 times in target and 52852 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 10 times in target and 5207 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 29 times in target and 8107 times in context
['january', 'dailey', 'neutron', 'apogee']


60090it [01:52, 509.69it/s]

avg loss: 280.0315331277815
`he` was updated 358 times in target and 63553 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 11 times in target and 6261 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 32 times in target and 9738 times in context
['january', 'dailey', 'neutron', 'apogee']


70088it [02:11, 498.09it/s]

avg loss: 280.03154338647005
`he` was updated 423 times in target and 74161 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 13 times in target and 7313 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 38 times in target and 11341 times in context
['january', 'dailey', 'neutron', 'apogee']


80088it [02:30, 504.29it/s]

avg loss: 280.03152634301057
`he` was updated 472 times in target and 84699 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 15 times in target and 8343 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 43 times in target and 12937 times in context
['january', 'dailey', 'neutron', 'apogee']


90068it [02:49, 497.83it/s]

avg loss: 280.0315451633482
`he` was updated 524 times in target and 95044 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 22 times in target and 9393 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 49 times in target and 14583 times in context
['january', 'dailey', 'neutron', 'apogee']


100067it [03:09, 498.34it/s]

avg loss: 280.0315518184826
`he` was updated 581 times in target and 105553 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 26 times in target and 10475 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 52 times in target and 16161 times in context
['january', 'dailey', 'neutron', 'apogee']


110092it [03:28, 494.97it/s]

avg loss: 280.0315336327573
`he` was updated 617 times in target and 116189 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 31 times in target and 11503 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 59 times in target and 17812 times in context
['january', 'dailey', 'neutron', 'apogee']


120042it [03:47, 486.18it/s]

avg loss: 280.03155113914374
`he` was updated 682 times in target and 126782 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 34 times in target and 12570 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 61 times in target and 19484 times in context
['january', 'dailey', 'neutron', 'apogee']


130087it [04:07, 484.05it/s]

avg loss: 280.0315524725839
`he` was updated 747 times in target and 137361 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 36 times in target and 13607 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 70 times in target and 21095 times in context
['january', 'dailey', 'neutron', 'apogee']


140084it [04:27, 483.47it/s]

avg loss: 280.0315494403289
`he` was updated 805 times in target and 147898 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 38 times in target and 14703 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 71 times in target and 22715 times in context
['january', 'dailey', 'neutron', 'apogee']


150076it [04:46, 488.90it/s]

avg loss: 280.0315523743929
`he` was updated 852 times in target and 158632 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 40 times in target and 15756 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 79 times in target and 24340 times in context
['january', 'dailey', 'neutron', 'apogee']


160087it [05:06, 486.74it/s]

avg loss: 280.0315516375063
`he` was updated 910 times in target and 169165 times in context
['he', 'transponders', 'dusty', 'lse']
`original` was updated 42 times in target and 16795 times in context
['original', 'quivering', 'saltire', 'bae']
`january` was updated 87 times in target and 26011 times in context
['january', 'dailey', 'neutron', 'apogee']


168556it [05:23, 521.37it/s]


KeyboardInterrupt: 

Once you are satisfied with the training (you can stop it whenever you want), experiment with printing out some nearest neighbors. Do these align with your expectations? Do any surprise you?

In [495]:
model = w2v_model

In [496]:
model.print_nearest_neighbors("paris", 4)

['paris', 'settled', 'suppresses', 'wan']


  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()
  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()
  vector_sim = ((vectors_norm)@(query_vector_norm)).ravel()


## Submission

Congratulations on finishing HW1!

Please ensure that you submit a PDF of this notebook onto [Gradescope](https://www.gradescope.com/courses/1238346) before February 3 at 11:59pm.

You can run the cell below to generate a PDF if you are using Google Colab.

In [None]:
#EXPORT_EXCLUDE#

#@markdown This is a helper function to generate a PDF in Colab.
#@markdown If you are using Jupyter notebook, you can do `File > Save and Export Notebook as HTML`, then save the resulting HTML file as a PDF.
#@markdown Alternatively, in Juypter notebook, you might try `File > Save and Export Notebook as PDF`, but just make sure you already have `pandoc` installed.

def colab_export_pdf():
    # Modified from: https://medium.com/@jonathanagustin/convert-colab-notebook-to-pdf-0ccd8f847dd6
    try:
        import google.colab
        IN_COLAB = True
    except:
        IN_COLAB = False
        print("This cell only works in Google Colab!")
        print("If you are running locally, click File > Export as HTML. Then open the HTML file and save it as a PDF.")

    if IN_COLAB:
        print("Generating PDF. This may take a few seconds.")
        import os, datetime, json, locale, pathlib, urllib, requests, werkzeug, nbformat, google, yaml, warnings
        locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
        NAME = pathlib.Path(werkzeug.utils.secure_filename(urllib.parse.unquote(requests.get(f"http://{os.environ['COLAB_JUPYTER_IP']}:{os.environ['KMP_TARGET_PORT']}/api/sessions").json()[0]["name"])))
        TEMP = pathlib.Path("/content/pdfs") / f"{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}_{NAME.stem}"; TEMP.mkdir(parents=True, exist_ok=True)
        NB = [cell for cell in nbformat.reads(json.dumps(google.colab._message.blocking_request("get_ipynb", timeout_sec=30)["ipynb"]), as_version=4).cells if "--Colab2PDF" not in cell.source]
        warnings.filterwarnings('ignore', category=nbformat.validator.MissingIDFieldWarning)
        with (TEMP / f"{NAME.stem}.ipynb").open("w", encoding="utf-8") as nb_copy: nbformat.write(nbformat.v4.new_notebook(cells=NB or [nbformat.v4.new_code_cell("#")]), nb_copy)
        if not pathlib.Path("/usr/local/bin/quarto").exists():
            !wget -q "https://quarto.org/download/latest/quarto-linux-amd64.deb" -P {TEMP} && dpkg -i {TEMP}/quarto-linux-amd64.deb > /dev/null && quarto install tinytex --update-path --quiet
        with (TEMP / "config.yml").open("w", encoding="utf-8") as file: yaml.dump({'include-in-header': [{"text": r"\usepackage{fvextra}\DefineVerbatimEnvironment{Highlighting}{Verbatim}{breaksymbolleft={},showspaces=false,showtabs=false,breaklines,breakanywhere,commandchars=\\\{\}}"}],'include-before-body': [{"text": r"\DefineVerbatimEnvironment{verbatim}{Verbatim}{breaksymbolleft={},showspaces=false,showtabs=false,breaklines}"}]}, file)
        !quarto render {TEMP}/{NAME.stem}.ipynb --metadata-file={TEMP}/config.yml --to pdf -M latex-auto-install -M margin-top=1in -M margin-bottom=1in -M margin-left=1in -M margin-right=1in --quiet
        google.colab.files.download(str(TEMP / f"{NAME.stem}.pdf"))

colab_export_pdf()