In [None]:
import os
import zipfile

# Location of the zipped corpus; its contents are unpacked into a sibling
# directory that later cells read from.
glov_dir = '/content/sample_data/text8.zip'
with zipfile.ZipFile(glov_dir, mode="r") as zip_ref:
    zip_ref.extractall(path='/content/sample_data/reception_gloe')

In [None]:
import re
from collections import Counter

def preprocess(text, rare_threshold=5):
    """
    Lower-case ``text``, turn punctuation into standalone tokens and drop rare words.

    :param text: Raw input string.
    :param rare_threshold: Words occurring this many times or fewer are removed
        (default 5, i.e. a word must appear at least 6 times to be kept).
    :return: List of the remaining tokens, in their original order.
    """
    # Each punctuation mark becomes a space-padded token so that it is split
    # off as its own "word". None of the tokens contains a character that a
    # later rule matches, so every rule needs to run exactly once.
    replacements = (
        ('.', ' <PERIOD> '),
        (',', ' <COMMA> '),
        ('"', ' <QUOTATION_MARK> '),
        (';', ' <SEMICOLON> '),
        ('!', ' <EXCLAMATION_MARK> '),
        ('?', ' <QUESTION_MARK> '),
        ('(', ' <LEFT_PAREN> '),
        (')', ' <RIGHT_PAREN> '),
        ('--', ' <HYPHENS> '),
        (':', ' <COLON> '),
    )

    # Lower-case first so the upper-case placeholder tokens are left intact.
    text = text.lower()
    for pattern, token in replacements:
        text = text.replace(pattern, token)

    # split() with no argument treats any whitespace run (including newlines)
    # as a separator, so newlines get no token of their own.
    words = text.split()

    # Keep only words that occur more than `rare_threshold` times.
    word_counts = Counter(words)
    return [word for word in words if word_counts[word] > rare_threshold]


def create_lookup_tables(words):
    """
    Build the two vocabulary mappings for a tokenised corpus.

    Ids are assigned by descending frequency, so id 0 is the most common
    word; words with equal counts keep their first-seen order.

    :param words: Input list of words
    :return: Tuple (vocab_to_int, int_to_vocab) of dictionaries
    """
    # Stable sort on the count only: ties stay in first-occurrence order.
    ranked = sorted(Counter(words).items(), key=lambda pair: pair[1], reverse=True)

    vocab_to_int = {}
    int_to_vocab = {}
    for index, (word, _) in enumerate(ranked):
        int_to_vocab[index] = word
        vocab_to_int[word] = index

    return vocab_to_int, int_to_vocab


In [None]:
# File inside the directory that the archive was extracted into.
os_dir='/content/sample_data/reception_gloe/text8'
with open(os_dir) as f:
  # Read the whole corpus into memory as a single string.
  data=f.read()
# Tokenise the raw text and drop rare words; later cells read `text_processed`.
text_processed=preprocess(data)


In [None]:
# NOTE: a bare expression is only displayed when it is the last statement of
# a cell, so this slice produces no output here.
text_processed[:10]
# Total number of tokens kept after preprocessing.
print(len(text_processed))
# Number of distinct tokens, i.e. the vocabulary size.
print(len(set(text_processed)))
# Word <-> integer id mappings used by the rest of the notebook.
vocab_to_int, int_to_vocab= create_lookup_tables(text_processed)


16680599
63641


In [None]:
# Encode every token of the corpus as its integer vocabulary id.
int_words = [vocab_to_int[token] for token in text_processed]
int_words[:30]

[5233,
 3080,
 11,
 5,
 194,
 1,
 3133,
 45,
 58,
 155,
 127,
 741,
 476,
 10571,
 133,
 0,
 27349,
 1,
 0,
 102,
 854,
 2,
 0,
 15067,
 58112,
 1,
 0,
 150,
 854,
 3580]

In [None]:
from collections import Counter
import random
import numpy as np

threshold = 1e-5
word_count = Counter(int_words)

# Relative frequency of every word id in the corpus.
freqs = {word: word_count[word] / len(int_words) for word in word_count}

# Probability of discarding a single occurrence of a word. It is negative for
# words rarer than `threshold`, which are therefore never dropped.
drop_proba = {word: 1 - np.sqrt(threshold / freq) for word, freq in freqs.items()}

# Keep each occurrence with probability sqrt(threshold / freq).
train_words = [token for token in int_words if random.random() < (1 - drop_proba[token])]
print(train_words[:30])

[5233, 3080, 3133, 741, 10571, 27349, 2, 15067, 58112, 854, 10712, 371, 539, 2757, 7088, 1052, 44611, 2877, 5233, 1134, 2621, 25, 8983, 279, 4147, 59, 6437, 4186, 153, 5233]


In [None]:
def get_target(words, idx, window_size=5):
    """
    Return the context words surrounding position ``idx``.

    The window radius R is drawn uniformly from 1..window_size on every call,
    so the number of returned words varies between calls.

    :param words: Sequence of tokens (a batch of word ids).
    :param idx: Position of the centre word inside ``words``.
    :param window_size: Maximum number of words taken on each side.
    :return: List with up to R words before and up to R words after ``idx``
        (fewer near either end of ``words``); the centre word is excluded.
    """
    R = np.random.randint(1, window_size + 1)  # upper bound is exclusive
    # Clamp at 0: a negative start would make the slice wrap around.
    start = max(idx - R, 0)
    stop = idx + R
    # Convert each slice separately so that `+` always concatenates, whatever
    # sequence type `words` is.
    return list(words[start:idx]) + list(words[idx + 1:stop + 1])

In [None]:
def get_batches(words, batch_size, window_size=5):
    """
    Generate training batches as ``(inputs, targets)`` tuples.

    Only full batches are produced; trailing words that do not fill a whole
    batch are ignored. For every word of a batch, its context words (from
    ``get_target``) are appended to ``targets`` and the word itself is repeated
    in ``inputs`` once per context word, so both lists have the same length.
    """
    n_batches = len(words) // batch_size

    for batch_number in range(n_batches):
        offset = batch_number * batch_size
        batch = words[offset:offset + batch_size]

        x, y = [], []
        for position, center in enumerate(batch):
            # Context is taken from inside the batch only.
            context = get_target(batch, position, window_size)
            x.extend([center] * len(context))
            y.extend(context)
        yield x, y

In [None]:
# Smoke test of the batch generator on a tiny synthetic corpus.
int_text = list(range(20))
x, y = next(get_batches(int_text, batch_size=4, window_size=5))
print(x)


3
1
4
1
[0, 0, 0, 1, 1, 2, 2, 2, 3]


In [None]:
import torch
# Sanity check: build an integer tensor from a nested Python list and print it.
tt=[[1,2,3]]
l=torch.LongTensor(tt)
print(l)

tensor([[1, 2, 3]])


In [None]:
def cosine_similarity(embedding,valid_size=16,valid_windows=100,device='cpu'):
    """
    Cosine similarity between random validation words and the whole vocabulary.

    Half of the validation ids are sampled without replacement from
    ``range(valid_windows)`` and the other half from
    ``range(1000, 1000 + valid_windows)``.

    :param embedding: Embedding layer; ``embedding.weight`` holds one row per id.
    :param valid_size: Number of validation ids (``valid_size // 2`` per range).
    :param valid_windows: Width of each of the two id ranges sampled from.
    :param device: Device the sampled ids are moved to; it must match the
        device of ``embedding``.
    :return: ``(valid_examples, similarities)`` where ``valid_examples`` is a
        LongTensor of the sampled ids and ``similarities[i, j]`` is the cosine
        between validation word ``i`` and vocabulary row ``j``.

    NOTE(review): the sampled ids go up to ``1000 + valid_windows - 1``, so the
    embedding must have at least ``1000 + valid_windows`` rows.
    """
    embed_vectors = embedding.weight

    # L2 norm of every vocabulary row, laid out as a row vector so it
    # broadcasts across the columns of the product below.
    magnitude_vector = embed_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(0)

    # Same two random.sample calls, in the same order, as before; plain list
    # concatenation also works when valid_size // 2 == 0.
    half = valid_size // 2
    sampled_ids = random.sample(range(valid_windows), half)
    sampled_ids = sampled_ids + random.sample(range(1000, 1000 + valid_windows), half)
    valid_examples = torch.LongTensor(sampled_ids).to(device)

    valid_vectors = embedding(valid_examples)
    # Norm of each query vector as a column vector. Dividing by both norms
    # gives the actual cosine; dividing only by the vocabulary norms left each
    # row scaled by the length of its query vector.
    valid_magnitudes = valid_vectors.pow(2).sum(dim=1).sqrt().unsqueeze(1)

    # Guard against division by zero for all-zero embedding rows.
    denominator = (valid_magnitudes * magnitude_vector).clamp(min=1e-12)
    similarities = torch.mm(valid_vectors, embed_vectors.t()) / denominator
    return valid_examples, similarities
  

In [None]:
import torch
from torch import nn
import torch.optim as optim

In [None]:
class SkipGram(nn.Module):
    """Skip-gram network: embedding lookup, linear layer over the vocabulary, log-softmax."""

    def __init__(self, n_vocab, n_embeddings):
        """
        :param n_vocab: Number of rows of the embedding table and number of output scores.
        :param n_embeddings: Size of each embedding vector.
        """
        super().__init__()

        self.embed = nn.Embedding(num_embeddings=n_vocab, embedding_dim=n_embeddings)
        self.output = nn.Linear(in_features=n_embeddings, out_features=n_vocab)
        # Normalises along dim 1, the dimension holding the vocabulary scores
        # when the input is a 1-D batch of ids.
        self.log = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return the log-probabilities over the vocabulary for each id in ``x``."""
        return self.log(self.output(self.embed(x)))


In [None]:
#Training loop

device = 'cuda' if torch.cuda.is_available() else 'cpu'

embedding_dim = 300  # you can change, if you want

model = SkipGram(len(vocab_to_int), embedding_dim).to(device)
# The model already outputs log-probabilities (LogSoftmax), hence NLLLoss.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

print_every = 500  # report nearest neighbours every `print_every` steps
steps = 0
epochs = 5

for e in range(epochs):

    for inputs, targets in get_batches(train_words, 512):
        steps += 1
        inputs, targets = torch.LongTensor(inputs), torch.LongTensor(targets)
        inputs, targets = inputs.to(device), targets.to(device)

        log_ps = model(inputs)
        loss = criterion(log_ps, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if steps % print_every == 0:
            # Score a random sample of validation words against the vocabulary.
            valid_examples, valid_similarities = cosine_similarity(model.embed, device=device)
            _, close_ids = valid_similarities.topk(6)
            valid_examples, close_ids = valid_examples.to('cpu'), close_ids.to('cpu')
            for ii, valid_idx in enumerate(valid_examples):
                # [1:] skips the top hit, presumably the query word itself.
                closest = [int_to_vocab[idx.item()] for idx in close_ids[ii]][1:]
                print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest))
            # Separator printed once per report, after all validation words.
            print("...")


In [None]:
  # Standalone re-run of the nearest-neighbour report that the training loop
  # prints every `print_every` steps.
  # getting examples and similarities
  valid_examples, valid_similarities = cosine_similarity(model.embed, device=device)
  _, closest_idxs = valid_similarities.topk(6) # topk highest similarities
  # Move both tensors to the CPU before converting the ids to Python ints.
  valid_examples, closest_idxs = valid_examples.to('cpu'), closest_idxs.to('cpu')
  for ii, valid_idx in enumerate(valid_examples):
      # [1:] drops the first of the six hits, presumably the query word itself.
      closest_words = [int_to_vocab[idx.item()] for idx in closest_idxs[ii]][1:]
      print(int_to_vocab[valid_idx.item()] + " | " + ', '.join(closest_words))
  # Separator printed once, after every validation word has been listed.
  print("...")
      