In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel (mode 2 reloads all modules
# before each cell executes).
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from imdb.data import get_vocab
import torchtext as tt
import torch as t

In [3]:
# Root directory for every dataset / vector cache this notebook reads:
# the "mldata" folder inside the current user's home directory.
DATAROOT = Path.home().joinpath("mldata")

# Glove Vectors

This notebook explains how to use GloVe vectors as a frozen embedding table when training. First, let's get the GloVe vectors, which are a `Vectors` object, and see how to create an embedding layer from them.

In [4]:
# Load the 50-dimensional "6B" GloVe vectors, using DATAROOT/glove as the
# cache directory.
glove_vecs = tt.vocab.GloVe(name="6B", dim=50, cache=DATAROOT / "glove")
# Number of tokens that have a vector.
len(glove_vecs)

400000

In [5]:
# Wrap the GloVe weight matrix in an embedding layer; row i of the layer is
# row i of `glove_vecs.vectors`.
glove_emb = t.nn.Embedding.from_pretrained(glove_vecs.vectors)
glove_emb

Embedding(400000, 50)

We expect a sequence that reads `["all", "hand"]` to produce a tensor that looks like this:

$$
\begin{bmatrix}
\left [ \text{vector of all} \right ] \\
\left [ \text{vector of hand} \right ] \\
\end{bmatrix}
$$

Let's verify this. First we read the vector values directly from the GloVe vectors; then we pass a tensor with the indices of these two words to the embedding layer and expect to get the same two vectors stacked on top of each other.

In [6]:
# Reference values: read the vectors for "all" and "hand" straight from GloVe.
glove_vecs.get_vecs_by_tokens(["all", "hand"])

tensor([[ 0.1925,  0.1001,  0.0638, -0.0877,  0.5222,  0.3911, -0.4198, -0.4567,
         -0.3405, -0.1117,  0.0148,  0.3173, -0.5085, -0.1156,  0.7430,  0.0976,
          0.3441, -0.1213, -0.1694, -0.8409, -0.1123,  0.4060,  0.7680,  0.0911,
          0.1078, -1.2673, -0.5771, -0.3621,  0.3482, -0.7546,  4.0426,  0.9497,
         -0.2267, -0.3578,  0.3413,  0.1307,  0.2305, -0.0370, -0.2589,  0.1298,
         -0.3903, -0.0496,  0.4577,  0.5678, -0.4617,  0.4193, -0.5492,  0.0812,
         -0.3049, -0.3051],
        [ 0.0881, -0.4270,  0.2128, -0.4614,  0.8865,  0.3196, -0.0095,  0.1226,
         -0.0112, -0.2113, -0.1177,  0.0859, -0.5400,  0.2767, -0.0742,  0.1130,
         -0.3136, -0.3067,  0.1383, -0.9979, -0.1051,  0.5650,  0.3011, -0.6091,
          0.2153, -1.9955, -0.2307,  0.3617,  0.3657, -0.8359,  3.1593,  0.3848,
         -0.5879,  0.3027, -0.0801,  0.7723,  0.1453,  0.5484,  0.1391, -0.1582,
          0.3756,  0.6432, -0.3582,  0.2687,  0.3704, -0.1284,  0.1405, -0.3739,


In [7]:
# Resolve both tokens to their row indices in the GloVe table in one pass.
all_idx, hand_idx = (glove_vecs.stoi[tok] for tok in ("all", "hand"))
print(all_idx, hand_idx)

64 823


In [8]:
# Feed the indices looked up from `glove_vecs.stoi` (not hand-copied literals)
# to the embedding layer, so this check stays valid if the vocab or its
# ordering ever changes. The result should match the vectors read directly
# from GloVe for "all" and "hand".
glove_emb(t.tensor([all_idx, hand_idx]))

tensor([[ 0.1925,  0.1001,  0.0638, -0.0877,  0.5222,  0.3911, -0.4198, -0.4567,
         -0.3405, -0.1117,  0.0148,  0.3173, -0.5085, -0.1156,  0.7430,  0.0976,
          0.3441, -0.1213, -0.1694, -0.8409, -0.1123,  0.4060,  0.7680,  0.0911,
          0.1078, -1.2673, -0.5771, -0.3621,  0.3482, -0.7546,  4.0426,  0.9497,
         -0.2267, -0.3578,  0.3413,  0.1307,  0.2305, -0.0370, -0.2589,  0.1298,
         -0.3903, -0.0496,  0.4577,  0.5678, -0.4617,  0.4193, -0.5492,  0.0812,
         -0.3049, -0.3051],
        [ 0.0881, -0.4270,  0.2128, -0.4614,  0.8865,  0.3196, -0.0095,  0.1226,
         -0.0112, -0.2113, -0.1177,  0.0859, -0.5400,  0.2767, -0.0742,  0.1130,
         -0.3136, -0.3067,  0.1383, -0.9979, -0.1051,  0.5650,  0.3011, -0.6091,
          0.2153, -1.9955, -0.2307,  0.3617,  0.3657, -0.8359,  3.1593,  0.3848,
         -0.5879,  0.3027, -0.0801,  0.7723,  0.1453,  0.5484,  0.1391, -0.1582,
          0.3756,  0.6432, -0.3582,  0.2687,  0.3704, -0.1284,  0.1405, -0.3739,


Everything is working as it should.

Now let's get the IMDB vocab. However, it is possible that some words in the IMDB vocab are missing from the GloVe vocab. Let's check this.

In [9]:
# `get_vocab` returns a pair; only the first element (the IMDB vocab) is used
# here and the second is discarded.
# NOTE(review): what the second element holds is not visible from this
# notebook - check `imdb.data.get_vocab`.
imdb_vocab, _ = get_vocab(DATAROOT)
# Number of tokens in the IMDB vocab.
len(imdb_vocab)

147156

In [10]:
# IMDB tokens (in IMDB index order) that have no entry in the GloVe vocab.
missing_words = [
    tok for tok in imdb_vocab.get_itos() if tok not in glove_vecs.stoi
]
# Show how many are missing, then a small sample of them.
print(len(missing_words), missing_words[:10], sep="\n")

66922
['\x96', 'hadn', '****', 'camera-work', '100%', '*****', '*1/2', '#1', '$1', 'it`s']


Out of roughly 147K words in the IMDB vocab, around 67K are missing from the GloVe vocab, i.e., nearly half (about 45%) of the vocab is missing. If I get a sequence that reads "all hadn", what should I pass to the embedding layer? The first number is the index of "all", which is 64, but what about "hadn"? It does not have any index at all in the GloVe vectors, and therefore there is no corresponding entry in the embedding table for it. How do I handle such unknown tokens? I can do one of two things:
  1. The `<unk>` token will have a vector of all zeros.
  2. The `<unk>` token will have a vector that is the average of all the other vectors in the vocab.
  
The constructor of the `Vectors` class lets us specify this via its `unk_init` argument: `None` means use all zeros; otherwise it takes a callback that produces the vector. Now, when I call `get_vecs_by_tokens` with some unknown tokens, it will return zeros (or whatever else I configure) for them.

In [11]:
# "all" is in GloVe, "hadn" is not: the second row shows what an unknown token
# maps to (all zeros in the output below).
glove_vecs.get_vecs_by_tokens(["all", "hadn"])

tensor([[ 0.1925,  0.1001,  0.0638, -0.0877,  0.5222,  0.3911, -0.4198, -0.4567,
         -0.3405, -0.1117,  0.0148,  0.3173, -0.5085, -0.1156,  0.7430,  0.0976,
          0.3441, -0.1213, -0.1694, -0.8409, -0.1123,  0.4060,  0.7680,  0.0911,
          0.1078, -1.2673, -0.5771, -0.3621,  0.3482, -0.7546,  4.0426,  0.9497,
         -0.2267, -0.3578,  0.3413,  0.1307,  0.2305, -0.0370, -0.2589,  0.1298,
         -0.3903, -0.0496,  0.4577,  0.5678, -0.4617,  0.4193, -0.5492,  0.0812,
         -0.3049, -0.3051],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,


But this still does not solve the problem of how to call the embedding table with unknown tokens: they don't even have an index! The solution is to create IMDB vectors, where every unknown token gets the `<unk>` value from GloVe and every known token simply gets its corresponding GloVe value. E.g.,

```
imdb_vecs["the"] = glove_vecs["the"]
imdb_vecs["hadn"] = <unk> from glove
imdb_vecs["camera-work"] = <unk> from glove
...
```

This means that word indices will differ between the two vocabs, i.e., a word like "minutes" that exists in both might have a different index in each, but its vector values will be the same.

In [12]:
# Build a vector table in IMDB index order: row i is the GloVe vector for the
# i-th IMDB token, or GloVe's unknown-token vector when GloVe lacks the token.
imdb_vecs = glove_vecs.get_vecs_by_tokens(imdb_vocab.get_itos())

# Fetch the token -> index mapping once; calling `get_stoi()` inside the loop
# would rebuild the mapping for the whole vocab on every iteration.
imdb_stoi = imdb_vocab.get_stoi()

# Compare, per token, the row stored in the IMDB table against the row GloVe
# holds for the same token (if any).
for tok in ["all", "hadn", "minutes"]:
    imdb_idx = imdb_stoi[tok]
    imdb_vec = imdb_vecs[imdb_idx]

    # -1 marks a token GloVe does not know; such a token has no GloVe row,
    # so None is printed for its GloVe vector.
    glove_idx = glove_vecs.stoi.get(tok, -1)
    glove_vec = glove_vecs.vectors[glove_idx] if glove_idx >= 0 else None

    print(f"Token: {tok}")
    print(f"IMDB index: {imdb_idx}  Glove index: {glove_idx}")
    print("IMDB vector: ", imdb_vec)
    print("Glove vector: ", glove_vec)
    print("\n")

Token: all
IMDB index: 36  Glove index: 64
IMDB vector:  tensor([ 0.1925,  0.1001,  0.0638, -0.0877,  0.5222,  0.3911, -0.4198, -0.4567,
        -0.3405, -0.1117,  0.0148,  0.3173, -0.5085, -0.1156,  0.7430,  0.0976,
         0.3441, -0.1213, -0.1694, -0.8409, -0.1123,  0.4060,  0.7680,  0.0911,
         0.1078, -1.2673, -0.5771, -0.3621,  0.3482, -0.7546,  4.0426,  0.9497,
        -0.2267, -0.3578,  0.3413,  0.1307,  0.2305, -0.0370, -0.2589,  0.1298,
        -0.3903, -0.0496,  0.4577,  0.5678, -0.4617,  0.4193, -0.5492,  0.0812,
        -0.3049, -0.3051])
Glove vector:  tensor([ 0.1925,  0.1001,  0.0638, -0.0877,  0.5222,  0.3911, -0.4198, -0.4567,
        -0.3405, -0.1117,  0.0148,  0.3173, -0.5085, -0.1156,  0.7430,  0.0976,
         0.3441, -0.1213, -0.1694, -0.8409, -0.1123,  0.4060,  0.7680,  0.0911,
         0.1078, -1.2673, -0.5771, -0.3621,  0.3482, -0.7546,  4.0426,  0.9497,
        -0.2267, -0.3578,  0.3413,  0.1307,  0.2305, -0.0370, -0.2589,  0.1298,
        -0.3903, -0.0

Now I can just pass these vectors to the embedding table and have a valid vector for any token - whether or not it is found in Glove.

In [13]:
# Embedding layer indexed by IMDB vocab indices, initialised from the
# IMDB-ordered vector table built above from GloVe.
imdb_emb = t.nn.Embedding.from_pretrained(imdb_vecs)

In [14]:
# Look up one token GloVe knows ("all") and one it does not ("hadn") by their
# IMDB indices and run both through the IMDB embedding layer.
imdb_stoi = imdb_vocab.get_stoi()
x = t.tensor([imdb_stoi[tok] for tok in ("all", "hadn")])
imdb_emb(x)

tensor([[ 0.1925,  0.1001,  0.0638, -0.0877,  0.5222,  0.3911, -0.4198, -0.4567,
         -0.3405, -0.1117,  0.0148,  0.3173, -0.5085, -0.1156,  0.7430,  0.0976,
          0.3441, -0.1213, -0.1694, -0.8409, -0.1123,  0.4060,  0.7680,  0.0911,
          0.1078, -1.2673, -0.5771, -0.3621,  0.3482, -0.7546,  4.0426,  0.9497,
         -0.2267, -0.3578,  0.3413,  0.1307,  0.2305, -0.0370, -0.2589,  0.1298,
         -0.3903, -0.0496,  0.4577,  0.5678, -0.4617,  0.4193, -0.5492,  0.0812,
         -0.3049, -0.3051],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,


An embedding layer created from pre-trained vectors is frozen by default, i.e., its parameters have `requires_grad` set to `False`.

In [15]:
# Print the `requires_grad` flag of every parameter of the layer, one per line.
print(*(p.requires_grad for p in imdb_emb.parameters()), sep="\n")

False
