In [335]:
import sys
import torch
import numpy as np
import pandas as pd

try:
    import torchtext
except ImportError:
    sys.path.append('/usr/local/lib/python3.8/site-packages/')
    import torchtext

In [18]:
# Notebook configuration: input paths, relative to the notebook's directory.
# Tab-separated reverse-dictionary file read by the data-loading cell.
dict_data_path = '../data/dictionary/reverse-dict-singleton.tsv'

# Width of the GloVe vectors to load; it selects which glove.6B.<dim>d.txt file is read.
glove_embed_dim = 300 # other glove.6B options are 50, 100, 200
glove_embed_path = f'../data/glove_embed/glove.6B.{glove_embed_dim}d.txt'

In [12]:
# Load dictionary data
# Assuming the .tsv files from Prof Oflazer are placed in the data/dictionary folder
# The file has no header row; column 0 is the headword and column 1 its definition.
# dtype=str + keep_default_na=False keep every field as literal text: with the
# pandas defaults, headwords such as "null", "nan" or "NA" are parsed as missing
# values (float NaN) and purely numeric headwords may be parsed as numbers.
dict_data = pd.read_csv(
    dict_data_path,
    sep='\t',
    header=None,
    dtype=str,
    keep_default_na=False,
)

In [None]:
# Peek at the first five rows (head() defaults to n=5).
dict_data.head()

In [19]:
# Pretrained GloVe vectors.
# Fetch http://nlp.stanford.edu/data/glove.6B.zip and unpack it into the
# data/glove_embed folder before running this cell.
glove_embed = torchtext.vocab.Vectors(name=glove_embed_path)

100%|█████████▉| 400000/400001 [00:11<00:00, 35331.98it/s]


In [368]:
# Sanity check: raw (unnormalised) dot products between pairs of GloVe vectors.
s, p = glove_embed.get_vecs_by_tokens(['ice', 'gorilla'])
print(torch.dot(s, p))

d, p = glove_embed.get_vecs_by_tokens(['ice', 'cold'])
print(torch.dot(d, p))

tensor(3.3170)
tensor(16.4342)


In [222]:
class DictDataset(torch.utils.data.Dataset):
    """Dataset of (definition, headword) pairs expressed as embedding tensors.

    Each item is a tuple ``(definition_vectors, word_vector)`` where
    ``definition_vectors`` is ``embeddings.get_vecs_by_tokens`` applied to the
    tokenised definition (one row per token) and ``word_vector`` is the single
    row returned for the headword.

    Args:
        definitions: DataFrame whose column ``0`` holds the headword and whose
            column ``1`` holds the definition text.
        embeddings: object exposing ``get_vecs_by_tokens(list_of_tokens)``
            (for example ``torchtext.vocab.Vectors``). The filter below relies
            on unknown tokens coming back as all-zero vectors.
        embedding_dim: stored on ``self.embedding_dim``; this class neither
            uses nor validates it.
        tokenizer: callable mapping a string to a list of tokens. Defaults to
            torchtext's ``"basic_english"`` tokenizer.

    Rows are dropped at construction time when the headword has no (non-zero)
    embedding, or when the definition is not a string or tokenises to nothing:
    such rows cannot be packed into a batch and would otherwise only fail
    later, in the middle of training.
    """

    def __init__(self, definitions, embeddings, embedding_dim, tokenizer=None):
        super().__init__()
        if tokenizer is None:
            tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
        self.tokenizer = tokenizer

        self.embedding_dim = embedding_dim
        self.embeddings = embeddings

        # Boolean mask over the input rows; a plain list keeps this working
        # for an empty frame as well.
        keep = [
            self._has_embedding(word) and self._has_tokens(text)
            for word, text in zip(definitions[0], definitions[1])
        ]
        # reset_index gives a fresh 0..n-1 index on a new frame instead of
        # assigning an index onto a slice of the caller's DataFrame.
        self.definitions = definitions.loc[keep].reset_index(drop=True)

    def _has_embedding(self, word):
        """Return True when ``word`` is a string with a non-zero embedding."""
        if not isinstance(word, str):
            return False
        vec = self.embeddings.get_vecs_by_tokens([word])
        return not bool(torch.all(vec == 0))

    def _has_tokens(self, text):
        """Return True when ``text`` is a string yielding at least one token."""
        return isinstance(text, str) and len(self.tokenizer(text)) > 0

    def __getitem__(self, i):
        """Return ``(definition_vectors, word_vector)`` for row ``i``."""
        word = self.definitions.at[i, 0]
        def_text = self.definitions.at[i, 1]  # definition, in plain text form
        tokens = self.tokenizer(def_text)
        definition_vecs = self.embeddings.get_vecs_by_tokens(tokens)
        # Index the single row instead of squeeze(): squeeze() would also
        # collapse the feature axis of a width-1 embedding.
        word_vec = self.embeddings.get_vecs_by_tokens([word])[0]
        return definition_vecs, word_vec

    def __len__(self):
        """Number of rows that survived the construction-time filter."""
        return len(self.definitions)

    @staticmethod
    def collate_fn(batch):
        """Collate items into ``(PackedSequence of definitions, stacked words)``.

        Items are ordered by decreasing definition length, as required by
        ``pack_sequence`` with its default ``enforce_sorted=True``; the stacked
        word vectors follow the same order. The caller's ``batch`` list is left
        untouched (a sorted copy is used).
        """
        ordered = sorted(batch, key=lambda elem: len(elem[0]), reverse=True)
        Xs = [x for x, _ in ordered]
        Ys = [y for _, y in ordered]
        return (torch.nn.utils.rnn.pack_sequence(Xs),
                torch.stack(Ys))

In [194]:
# Pass the configured embedding width instead of a hard-coded 50, so the
# dataset's embedding_dim agrees with the GloVe file that was actually loaded.
data = DictDataset(dict_data, glove_embed, glove_embed_dim)

In [195]:
# Display the (word, definition) rows held by the dataset after its filtering.
data.definitions

Unnamed: 0,0,1
0,1000th,the ordinal number of one thousand in counting...
1,100th,the ordinal number of one hundred in counting ...
2,101,being one more than one hundred
3,101st,the ordinal number of one hundred one in count...
4,105,being five more than one hundred
...,...,...
30821,winterize,prepare for winter
30822,woosh,move with a sibilant sound
30823,wreak,cause to happen or to occur as a consequence
30824,wrest,"obtain by seizing forcibly or violently, also ..."


In [332]:
# Shuffled mini-batches of 16 items; DictDataset.collate_fn packs the
# variable-length definitions of each batch into a PackedSequence.
loader = torch.utils.data.DataLoader(
    data,
    batch_size=16,
    shuffle=True,
    collate_fn=DictDataset.collate_fn,
)

In [370]:
# The below code is attempting to learn the word embedding from
# the definition, which isn't exactly what we want to do.
#
# The LSTM's input and hidden sizes are tied to glove_embed_dim: its inputs are
# GloVe vectors and its last output is regressed onto a GloVe vector, so a
# hard-coded width fails as soon as a different GloVe file is configured.
model = torch.nn.LSTM(glove_embed_dim, glove_embed_dim)
criterion = torch.nn.MSELoss()
optim = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
# At most 1000 optimisation steps (fewer if the loader runs out of batches).
for i, (x, y) in zip(range(1000), loader):
    optim.zero_grad()

    out, (h, c) = model(x)
    out_pad, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(out)

    # out_pad is (time, batch, hidden). Select each sequence's last valid
    # time step with a single advanced-indexing gather rather than a Python
    # loop followed by torch.stack.
    batch_idx = torch.arange(len(out_lengths))
    out_embeds = out_pad[out_lengths - 1, batch_idx]

    loss = criterion(out_embeds, y)
    if i % 100 == 0:
        print(i, loss.detach())
    loss.backward()

    optim.step()

0 tensor(0.4699)
100 tensor(0.4191)
200 tensor(0.3966)
300 tensor(0.4262)
400 tensor(0.3349)
500 tensor(0.3054)
600 tensor(0.3089)
700 tensor(0.3683)
800 tensor(0.3783)
900 tensor(0.3236)


In [283]:
# Run the model once more on the batch `x` left over from the training loop
# and unpack the result into a padded tensor plus per-sequence lengths.
out, (h, c) = model(x)
out_pad, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(sequence=out)

In [284]:
# Show the padded output's shape next to the length of each sequence in the batch.
out_pad.shape, out_lengths

(torch.Size([40, 50, 100]),
 tensor([40, 36, 33, 24, 24, 23, 21, 21, 20, 19, 19, 19, 17, 16, 16, 15, 15, 14,
         14, 14, 13, 13, 12, 11, 11, 11, 11,  9,  9,  9,  9,  9,  8,  8,  8,  7,
          6,  6,  6,  6,  6,  5,  5,  5,  5,  4,  3,  3,  2,  2]))

In [314]:
for q in list(zip(out_lengths-1, range(len(out_lengths)))):
    r = out_pad[q]