<a href="https://colab.research.google.com/github/adithya-tp/PyTorch-Notebooks/blob/master/02_N_Grams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Imports and prepping up some stuff***

In [17]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f8d50147ed0>

In [18]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

test_passage_list = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# we just made a list of words split using " " as a delimiter.
print(test_passage_list[:3])

['When', 'forty', 'winters']


In [19]:
# making a list of tri-grams for training
trigrams = [([test_passage_list[i], test_passage_list[i+1]], test_passage_list[i+2]) for i in range(len(test_passage_list) - 2)]
print(trigrams[:3])

# need to define our vocabulary which needs all the unique words in the training corpus.
# This is just a matter of type casting test_passage_list into a set. We need to define 
# the vocab so that we can index the vocab to serve as a lookup table when retrieving 
# the embedding corresponding to a word.
vocab = set(test_passage_list)
vocab_index = {word:i for i, word in enumerate(vocab)}
# print(vocab_index)

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


# ***A slight detour***

  Checking if PyTorch returns multiple word embeddings if we pass a vector of indices. <br>
  Turns out, it DOES!

In [0]:
# just as a sanity check, let's see what happens when we pass in multiple 
# tensor wrapped indices into "embeds" for a basic vocab of say, 3 words.

basic_vocab = {"that's":0, "what":1, "she":2, "said":3}
basic_embeds = nn.Embedding(4, 5) # we'll make 5 dimensional embeddings
sample_context = ["that's", "she"]

In [21]:
# let us create an lookup_tensor for the words whose embeddings we wanna look up.
# the look up tensor is just the a vector of the indices of the words of interest 
# wrapped in a PyTorch Tensor.

lookup_tensor = torch.tensor([basic_vocab[word] for word in sample_context])
print(lookup_tensor)
sample_embeddings = basic_embeds(lookup_tensor)
print(sample_embeddings)

tensor([0, 2])
tensor([[-1.5256, -0.7502, -0.6540, -1.6095,  0.8657],
        [ 1.7674, -0.0954,  0.0612, -0.6177, -0.7981]],
       grad_fn=<EmbeddingBackward>)


# ***Back to making our NGramLanguageModeler***

In [0]:
# we can now make our NGramLanguageModeler class which extends the nn.Module Class
class NGramLanguageModeler(nn.Module):
  def __init__(self, len_vocab, len_dimension, len_context):
    super(NGramLanguageModeler, self).__init__()
    self.embeddings = nn.Embedding(len_vocab, len_dimension)
    self.linear1 = nn.Linear(len_context*len_dimension, 128)
    self.linear2 = nn.Linear(128, len_vocab)
   
  def forward(self, inputs):
    embeds = self.embeddings(inputs).view((1, -1))
    out = F.relu(self.linear1(embeds))
    out = self.linear2(out)
    log_probs = torch.log_softmax(out, dim=1)
    return log_probs

In [0]:
# ngram = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
# sample_context_indxs = torch.tensor([vocab_index[word] for word in trigrams[0][0]], dtype=torch.long)
# print(sample_context_indxs)
# ngram.zero_grad()
# sample_out, sample_log_prob = ngram(sample_context_indxs)
# print(sample_out, sample_out.shape)
# print(sample_log_prob, sample_log_prob.shape)

In [63]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.01)
for epoch in range(100):
  total_loss = 0
  for context, target in trigrams:
    lookup_tensor = torch.tensor([vocab_index[word] for word in context], dtype=torch.long)
    model.zero_grad()
    log_probs = model(lookup_tensor)
    loss = loss_function(log_probs, torch.tensor([vocab_index[target]],dtype=torch.long))
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  losses.append(total_loss)
print(losses)

[521.7153425216675, 495.34286093711853, 470.53120517730713, 446.51178669929504, 422.76281237602234, 398.6674540042877, 373.74221205711365, 347.8202769756317, 320.93014216423035, 293.3289177417755, 265.32080340385437, 237.4350209236145, 210.24008774757385, 184.3236062526703, 160.2390697002411, 138.34731554985046, 118.90771269798279, 102.00278830528259, 87.57696986198425, 75.43582272529602, 65.30448746681213, 56.89397835731506, 49.945801973342896, 44.19325542449951, 39.43976020812988, 35.49859809875488, 32.22602081298828, 29.481675148010254, 27.178778648376465, 25.228428840637207, 23.566755771636963, 22.13834238052368, 20.896832942962646, 19.81930637359619, 18.869718074798584, 18.03038454055786, 17.283921718597412, 16.613677501678467, 16.015938758850098, 15.470390319824219, 14.977132320404053, 14.528476238250732, 14.11613130569458, 13.738945484161377, 13.392257690429688, 13.067373752593994, 12.767804622650146, 12.492863655090332, 12.231105327606201, 11.989766120910645, 11.766092777252197

# ***Testing out the trained model with a manually input sample context***

In [92]:
sample_context_indxs = torch.tensor([vocab_index[word] for word in ["deep", "sunken"]], dtype=torch.long)
print(sample_context_indxs)
log_probs = model(sample_context_indxs)
print(log_probs)
print(log_probs.max().item())
next_word_index = (torch.abs(log_probs.max().item()-log_probs<0.0001).nonzero()[0][1].item())
print(next_word_index)
for key, value in vocab_index.items():
  if value == next_word_index:
    print("The third word in the trigram is probably: ", key)

tensor([87, 41])
tensor([[-12.9000, -11.0517, -11.9180, -10.6109,  -9.4918,  -9.0281,  -9.4701,
         -10.0088, -12.4749, -14.1159,  -8.1237, -11.8906, -10.4751, -12.7095,
         -11.8933, -10.1434, -15.3022, -10.6923,  -8.8487, -10.6151, -11.6201,
         -12.7400, -12.9318, -11.6189, -12.0235, -12.1041, -10.9124, -11.7159,
         -11.1213, -12.0028, -13.5300, -13.2427, -10.0730, -10.7331, -12.1254,
         -11.9699, -10.4403,  -9.4813,  -9.5169,  -8.2570, -12.5946,  -9.3512,
         -10.2632, -10.7122,  -9.5034, -11.6179,  -9.7012, -13.3439, -11.9361,
         -13.4708, -12.7937, -10.9641,  -9.1360, -10.8557, -11.3210, -11.7869,
          -0.0242, -10.3778, -11.2452, -13.0966, -13.1321,  -7.4787,  -9.3193,
          -9.0731,  -5.2724, -11.8432, -11.3245,  -8.3738, -10.7255,  -4.7463,
          -7.4246,  -7.2568, -10.2593,  -6.9622, -10.6271,  -8.0720, -11.3145,
         -13.7419, -10.9333, -10.6944,  -7.5264,  -6.6923, -12.4230, -13.5464,
         -10.5582, -13.3388,  -7.43