# Using the Glove Embeddings

Here we use pre-trained GloVe embeddings instead of one-hot encoding.

We will also have a look at cosine similarity.

## Downloading pre-trained glove embeddings

In [1]:
# !cd ~/data && wget https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip
# !cd ~/data && unzip glove.6B.zip

In [2]:
from pathlib import Path
import torch

In [3]:
# Location of the unzipped 50-dimensional GloVe vectors under ~/data.
glove_path = Path.home().joinpath("data", "glove.6B.50d.txt")

In [4]:
!head {glove_path} -n 2

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392


In [5]:
# Count the lines (one word vector per line) in pure Python rather than via `wc`:
# `wc` counts newline characters, so it under-counts by one when the last line
# has no trailing newline, and it is not available on every platform.
# Binary mode is used so lines are split on b"\n" only.
with open(glove_path, 'rb') as f:
    n_words = sum(1 for _ in f)
n_words

400001

## Loading glove into PyTorch

In [6]:
# Parse the GloVe text file. Each line has the form "<token> <v1> <v2> ... <v50>".
words = []        # row index -> token
word2idx = {}     # token -> row index
embs = torch.zeros(n_words, 50)  # one row per line; 50 is the vector width of glove.6B.50d
with open(glove_path, 'rb') as f:
    for raw in f:
        # Split the raw bytes, not the decoded string: bytes.split() only breaks on
        # ASCII whitespace, so a token that contains a non-ASCII space character
        # stays in one piece instead of shifting the numeric columns.
        fields = raw.split()
        if not fields:
            # Blank line: nothing to load.
            continue
        row = len(words)
        word = fields[0].decode()
        emb = [float(v.decode()) for v in fields[1:]]
        embs[row] = torch.tensor(emb, dtype=torch.float32)
        word2idx[word] = row
        words.append(word)
# Number of rows actually loaded.
n_line = len(words)

In [7]:
from lib.glove import GloveEmbeddings
# class GloveEmbeddings(torch.nn.Module):
#     def __init__(self, words, word2idx, embs):
#         super().__init__()
#         self.words = words
#         self.word2idx = word2idx
#         self.embs = embs
        
#     def to_id(self, word):
#         if word in self.words:
#             return self.word2idx[word]
#         else:
#             return self.word2idx['<unk>']
        
#     def to_token(self, id):
#         return self.words[id]
        
#     def forward(self, idx_list):
#         device = idx_list.device
#         embs = [self.embs[id].to(device) for id in idx_list]
#         return torch.stack(embs)
    
#     def make(self, input, device="cpu"):
#         curr_words = input.strip().lower().split()
#         token_ids = [self.to_id(word) for word in curr_words]
#         result = self.forward(torch.tensor(token_ids).to(device))
        
#         return result
    
#     def make_one(self, input, device="cpu"):
#         curr_words = input.strip().lower().split()
#         token_id  = self.to_id(curr_words[0])
#         result = self.forward(torch.tensor([token_id]).to(device))
        
#         return result.reshape(-1)
        
        
# g = GloveEmbeddings(words, word2idx, embs)

In [8]:
# Wrap the parsed vocabulary (words, word2idx) and the embedding matrix in the
# GloveEmbeddings helper imported from lib.glove.
g = GloveEmbeddings(words, word2idx, embs)

In [9]:
g.make_one("the").shape

torch.Size([50])

In [10]:
g.make("This is srilanka").shape

torch.Size([3, 50])

## Comparing Word Embeddings

Here we use the cosine similarity of the embeddings: the dot product of the two vectors after each has been normalized to unit length, i.e. $\cos(a, b) = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}$.

![](https://i.imgur.com/2nQ4nud.png)

In [11]:
def cosine_sim(a, b, eps=1e-8):
    """Return the cosine similarity of two 1-D tensors as a 0-dim tensor.

    Computes ``a . b / (||a|| * ||b||)``.

    Args:
        a: 1-D tensor.
        b: 1-D tensor of the same length as ``a``.
        eps: Lower bound applied to the denominator so that an all-zero
            vector yields a similarity of 0 instead of NaN (0 / 0).

    Returns:
        A scalar tensor in [-1, 1] (up to floating-point rounding).
    """
    norm_a = torch.linalg.norm(a)
    norm_b = torch.linalg.norm(b)
    # Clamp only the denominator: results for ordinary vectors are unchanged.
    denom = torch.clamp(norm_a * norm_b, min=eps)
    return a.dot(b) / denom

### Let's try some

In [12]:
cosine_sim(g.make_one("india"), g.make_one("india"))

tensor(1.)

In [13]:
cosine_sim(g.make_one("india"), g.make_one("srilanka"))

tensor(0.2476)

In [14]:
cosine_sim(g.make_one("india"), g.make_one("pakistan"))

tensor(0.8530)

In [15]:
cosine_sim(g.make_one("india"), g.make_one("usa"))

tensor(0.3199)

In [16]:
cosine_sim(g.make_one("jam"), g.make_one("butter"))

tensor(0.4013)

In [17]:
cosine_sim(g.make_one("colombo"), g.make_one("usa"))

tensor(0.0224)

### We can try the difference of these embeddings as well

In [18]:
cosine_sim(g.make_one("delhi") - g.make_one("india"), g.make_one("paris") - g.make_one("france"))

tensor(0.6959)

See: the two difference vectors are quite similar.

In [19]:
cosine_sim(g.make_one("delhi") - g.make_one("india"), g.make_one("paris") - g.make_one("jam"))

tensor(0.1494)

But not in the cell above: replacing "france" with an unrelated word ("jam") gives a much lower similarity.

In [20]:
cosine_sim(g.make_one("delhi") - g.make_one("india"), g.make_one("france") - g.make_one("paris"))

tensor(-0.6959)

In this case we reversed the order of the second difference ("france - paris" instead of "paris - france"), so the similarity has the same magnitude but the opposite sign.
This makes sense.

## Dump it to a Pickle Object

In [22]:
import pickle
# Serialize the GloveEmbeddings object `g` to ~/data/glove.pkl.
output_path = Path.home()/"data"/"glove.pkl"
# The context manager flushes and closes the file even if pickling raises,
# so no open handle or partially buffered write is left behind.
with open(output_path, 'wb') as filehandler:
    pickle.dump(g, filehandler)
print(f"glove embeddings object saved to: {output_path}")

glove embeddings object saved to: /home/data/glove.pkl
