# GloVE


In [23]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

## 1. Load data

In [24]:
with open ("./data.txt", "r") as doc:
    corpus = doc.read()

In [25]:
corpus

'It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered as the rightful property of some one or other of their daughters.'

In [26]:
#1. tokenization
corpus = [corpus.split(" ") for sent in corpus]
corpus[0:5000]

[['It',
  'is',
  'a',
  'truth',
  'universally',
  'acknowledged,',
  'that',
  'a',
  'single',
  'man',
  'in',
  'possession',
  'of',
  'a',
  'good',
  'fortune,',
  'must',
  'be',
  'in',
  'want',
  'of',
  'a',
  'wife.',
  'However',
  'little',
  'known',
  'the',
  'feelings',
  'or',
  'views',
  'of',
  'such',
  'a',
  'man',
  'may',
  'be',
  'on',
  'his',
  'first',
  'entering',
  'a',
  'neighbourhood,',
  'this',
  'truth',
  'is',
  'so',
  'well',
  'fixed',
  'in',
  'the',
  'minds',
  'of',
  'the',
  'surrounding',
  'families,',
  'that',
  'he',
  'is',
  'considered',
  'as',
  'the',
  'rightful',
  'property',
  'of',
  'some',
  'one',
  'or',
  'other',
  'of',
  'their',
  'daughters.'],
 ['It',
  'is',
  'a',
  'truth',
  'universally',
  'acknowledged,',
  'that',
  'a',
  'single',
  'man',
  'in',
  'possession',
  'of',
  'a',
  'good',
  'fortune,',
  'must',
  'be',
  'in',
  'want',
  'of',
  'a',
  'wife.',
  'However',
  'little',
  'know

In [27]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab

['It',
 'must',
 'this',
 'want',
 'that',
 'families,',
 'single',
 'known',
 'little',
 'well',
 'surrounding',
 'his',
 'fixed',
 'universally',
 'neighbourhood,',
 'However',
 'minds',
 'so',
 'other',
 'possession',
 'man',
 'daughters.',
 'one',
 'truth',
 'on',
 'considered',
 'as',
 'first',
 'be',
 'property',
 'rightful',
 'the',
 'a',
 'some',
 'he',
 'is',
 'such',
 'entering',
 'in',
 'views',
 'good',
 'may',
 'fortune,',
 'feelings',
 'or',
 'acknowledged,',
 'their',
 'wife.',
 'of']

In [28]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)

{'It': 0, 'must': 1, 'this': 2, 'want': 3, 'that': 4, 'families,': 5, 'single': 6, 'known': 7, 'little': 8, 'well': 9, 'surrounding': 10, 'his': 11, 'fixed': 12, 'universally': 13, 'neighbourhood,': 14, 'However': 15, 'minds': 16, 'so': 17, 'other': 18, 'possession': 19, 'man': 20, 'daughters.': 21, 'one': 22, 'truth': 23, 'on': 24, 'considered': 25, 'as': 26, 'first': 27, 'be': 28, 'property': 29, 'rightful': 30, 'the': 31, 'a': 32, 'some': 33, 'he': 34, 'is': 35, 'such': 36, 'entering': 37, 'in': 38, 'views': 39, 'good': 40, 'may': 41, 'fortune,': 42, 'feelings': 43, 'or': 44, 'acknowledged,': 45, 'their': 46, 'wife.': 47, 'of': 48}


In [29]:
#vocab size
voc_size = len(vocab)
print(voc_size)

49


In [30]:
#append UNK
vocab.append('<UNK>')

In [31]:
vocab[:10]

['It',
 'must',
 'this',
 'want',
 'that',
 'families,',
 'single',
 'known',
 'little',
 'well']

In [32]:
word2index['<UNK>'] = 0

In [33]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Build Co-occurence Matrix X

In [34]:
from collections import Counter

X_i = Counter(flatten(corpus))

In [35]:
skip_grams = []

for doc in corpus:
    for i in range(1, len(doc)-1):
        center = doc[i]
        outside = [doc[i-1], doc[i+1]]
        for each_out in outside:
            skip_grams.append((center, each_out))

In [36]:
X_ik_skipgrams = Counter(skip_grams)

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [37]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [38]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

In [40]:
for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [41]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

## 4. Model

<img src ="../figures/glove.png" width=400>

In [44]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

In [45]:
def get_embed(self, word):
    # Assuming word2index is a dictionary mapping words to indices
    id_tensor = torch.LongTensor([word2index[word]])
    
    v_embed = self.center_embedding(id_tensor)
    u_embed = self.outside_embedding(id_tensor)
    
    word_embed = (v_embed + u_embed) / 2

    return word_embed


In [46]:
#test our system
voc_size = len(vocab)
emb_size = 2
model = Glove(voc_size, emb_size)

In [47]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [48]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [49]:
loss

tensor(124.8764, grad_fn=<SumBackward0>)

## 5. Training

In [50]:
batch_size     = 5 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [51]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [52]:
import time

# Training
num_epochs = 5000
losses = []
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    losses.append(loss)
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 1000 | cost: 34.730404 | time: 0m 0s
Epoch: 2000 | cost: 88.764748 | time: 0m 0s
Epoch: 3000 | cost: 61.813717 | time: 0m 0s
Epoch: 4000 | cost: 21.116520 | time: 0m 0s
Epoch: 5000 | cost: 12.924250 | time: 0m 0s


In [53]:
print(f'Training Loss: {loss}')

Training Loss: 12.924249649047852


In [54]:
# Saving the model for testing
torch.save(model.state_dict(), 'app/models/GloVe.pt')

In [60]:
GloVeData = {
    'corpus': corpus,
    'vocab': vocab,
    'word2index': word2index,
    'voc_size': voc_size,
    'embedding_size': embedding_size
}

In [61]:
import pickle
pickle.dump(GloVeData,open('./app/models/GloVeData.pkl', 'wb'))