# Natural Language Processing Using PyTorch

## Word Embedding

predictive model for trigrams

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1234)
word_to_ix = {"data": 0, "science": 1}
word_to_ix

{'data': 0, 'science': 1}

In [2]:
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
embeds

Embedding(2, 5)

In [3]:
lookup_tensor = torch.tensor([word_to_ix["data"]], dtype=torch.long)
lookup_tensor

tensor([0])

In [5]:
hello_embed = embeds(lookup_tensor)
hello_embed 

tensor([[ 0.0461,  0.4024, -1.0115,  0.2167, -0.6123]],
       grad_fn=<EmbeddingBackward0>)

In [9]:
test_sentence = """The popularity of the term "data science" has exploded 
in business environments and academia, as indicated by a jump in job 
openings.[32] However, many critical academics and journalists see no 
distinction between data science and statistics. Writing in Forbes, Gil 
Press argues that data science is a buzzword without a clear definition 
and has simply replaced "business analytics" in contexts such as graduate 
degree programs.[7] In the question-and-answer section of his keynote 
address at the Joint Statistical Meetings of American Statistical 
Association, noted applied statistician Nate Silver said, "I think data
scientist is a sexed up term for a statistician....Statistics is a branch 
of science. Data scientist is slightly redundant in some way and people 
shouldn't berate the term statistician."[9] Similarly, in business sector, 
multiple researchers and analysts state that data scientists alone are 
far from being sufficient in granting companies a real competitive 
advantage[33] and consider data scientists as only one of the four 
greater job families companies require to leverage big data effectively, 
namely: data analysts, data scientists, big data developers and big data 
engineers.[34]
 On the other hand, responses to criticism are as numerous. In a 2014 Wall 
Street Journal article, Irving Wladawsky-Berger compares the data science 
enthusiasm with the dawn of computer science. He argues data science, like 
any other interdisciplinary field, employs methodologies and practices from 
across the academia and industry, but then it will morph them into a new 
discipline. He brings to attention the sharp criticisms computer science, 
now a well respected academic discipline, had to once face.[35] Likewise, 
NYU Stern's Vasant Dhar, as do many other academic proponents of data 
science,[35] argues more specifically in December 2013 that data science is different from the 
existing practice of data analysis across all disciplines, which focuses 
only on explaining data sets. Data science seeks actionable and consistent 
pattern for predictive uses.[1] This practical engineering goal takes data 
science beyond traditional analytics. Now the data in those disciplines and 
applied fields that lacked solid theories, like health science and social 
science, could be sought and utilized to generate powerful predictive 
models.[1]""".split()
 # we should tokenize the input, but we will ignore that for now
 # build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2]) for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

[(['The', 'popularity'], 'of'), (['popularity', 'of'], 'the'), (['of', 'the'], 'term')]


In [12]:
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [14]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [18]:
for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:
        #  Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        #  into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        #  Step 2. Recall that torch *accumulates* gradients. Before passing in 
         # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()
        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)
        #  Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
        #  Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!


[1871.6984968185425, 1857.3277735710144, 1843.2285251617432, 1829.3953928947449, 1815.8193395137787, 1802.4981145858765, 1789.4459414482117, 1776.6738185882568, 1764.1895637512207, 1752.0007112026215, 1740.1305613517761, 1728.5821821689606, 1717.3550736904144, 1706.4400453567505, 1695.8412848711014, 1685.5456243753433, 1675.538502573967, 1665.7822011709213, 1656.2583233118057, 1646.9528653621674, 1637.8404148817062, 1628.8908122777939, 1620.0785230398178, 1611.3936021327972, 1602.8187881708145, 1594.3363060355186, 1585.9359219670296, 1577.6017825603485, 1569.3233283162117, 1561.083087027073, 1552.871501326561, 1544.6799331903458, 1536.4954044222832, 1528.312023639679, 1520.1358532905579, 1511.9567221403122, 1503.7672375440598, 1495.5567085146904, 1487.3234621882439, 1479.06174659729]


## CBOW Model in PyTorch

In [27]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right 
raw_text = """For 
the future of data science, Donoho projects an ever-growing environment for 
open science where data sets used for academic publications are accessible 
to all researchers.[36] US National Institute of Health has already 
announced plans to enhance reproducibility and transparency of research 
data.[39] Other big journals are likewise following suit.[40][41] This way, 
the future of data science not only exceeds the boundary of statistical 
theories in scale and methodology, but data science will revolutionize 
current academia and research paradigms.[36] As Donoho concludes, "the 
scope and impact of data science will continue to expand enormously in 
coming decades as scientific data and data about science itself become 
ubiquitously available."[36]""".split()

In [28]:
# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
    raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['For', 'the', 'of', 'data'], 'future'), (['the', 'future', 'data', 'science,'], 'of'), (['future', 'of', 'science,', 'Donoho'], 'data'), (['of', 'data', 'Donoho', 'projects'], 'science,'), (['data', 'science,', 'projects', 'an'], 'Donoho')]


In [None]:
# class NGramLanguageModeler(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, context_size):
#         super(NGramLanguageModeler, self).__init__()
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim)
#         self.linear1 = nn.Linear(context_size * embedding_dim, 128)
#         self.linear2 = nn.Linear(128, vocab_size)
#     def forward(self, inputs):
#         embeds = self.embeddings(inputs).view((1, -1))
#         out = F.relu(self.linear1(embeds))
#         out = self.linear2(out)
#         log_probs = F.log_softmax(out, dim=1)
#         return log_probs
class CBOW(nn.Module):
    def __init__(self):
        pass
    def forward(self, inputs):
        pass
    # create your model and train.  here are some functions to help you make (above code is an example)

In [32]:
# the data ready for use by your module
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)
make_context_vector(data[0][0], word_to_ix)  # example

tensor([25,  9, 72, 14])

In [33]:
lin = nn.Linear(5, 3)  # maps from R^5 to R^3, parameters A, b
# data is 2x5.  A maps from 5 to 3... can we map "data" under A?
data = torch.randn(2, 5)
print(lin(data))  # yes

tensor([[ 0.6877, -0.8275, -0.1261],
        [ 0.3976, -0.4415,  0.6140]], grad_fn=<AddmmBackward0>)


In [34]:
data = torch.randn(2, 2)
print(data)
print(F.relu(data))
# Softmax is also in torch.nn.functional
data = torch.randn(5)
print(data)
print(F.softmax(data, dim=0))
print(F.softmax(data, dim=0).sum())  #  Sums to 1 because it is a distribution!
print(F.log_softmax(data, dim=0))  # theres also log_softmax

tensor([[ 0.0349, -0.6073],
        [ 0.6068,  0.2397]])
tensor([[0.0349, 0.0000],
        [0.6068, 0.2397]])
tensor([ 0.1256,  0.0864,  2.1068,  0.0520, -1.9631])
tensor([0.0974, 0.0937, 0.7064, 0.0905, 0.0121])
tensor(1.0000)
tensor([-2.3288, -2.3681, -0.3476, -2.4025, -4.4175])


## LSTM Model

In [40]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

In [41]:
 # initialize the hidden state.
hidden = (torch.randn(1, 1, 3),
    torch.randn(1, 1, 3))
for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  #  clean out hidden state

out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[ 0.0006, -0.2630,  0.1169]],

        [[ 0.3497, -0.2128, -0.1850]],

        [[ 0.0167, -0.1945, -0.0034]],

        [[-0.0260,  0.0778,  0.0303]],

        [[ 0.0114,  0.1984, -0.0474]]], grad_fn=<MkldnnRnnLayerBackward0>)
(tensor([[[ 0.0114,  0.1984, -0.0474]]], grad_fn=<StackBackward0>), tensor([[[ 0.0191,  0.5876, -0.1018]]], grad_fn=<StackBackward0>))


In [48]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

training_data = [
    ("Probability and random variable are integral part of computation ".split(),
    ["DET", "NN", "V", "DET", "NN"]),
    ("Understanding of the probability and associated concepts are essential".split(),
    ["NN", "V", "DET", "NN"])
]
training_data

[(['Probability',
   'and',
   'random',
   'variable',
   'are',
   'integral',
   'part',
   'of',
   'computation'],
  ['DET', 'NN', 'V', 'DET', 'NN']),
 (['Understanding',
   'of',
   'the',
   'probability',
   'and',
   'associated',
   'concepts',
   'are',
   'essential'],
  ['NN', 'V', 'DET', 'NN'])]

In [47]:
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

EMBEDDING_DIM = 6
HIDDEN_DIM = 6


{'Probability': 0, 'and': 1, 'random': 2, 'variable': 3, 'are': 4, 'integral': 5, 'part': 6, 'of': 7, 'computation': 8, 'Understanding': 9, 'the': 10, 'probability': 11, 'associated': 12, 'concepts': 13, 'essential': 14}


In [51]:

class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM takes word embeddings as inputs, and outputs hidden states with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


In [52]:

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

print(model)
print(loss_function)
print(optimizer)

LSTMTagger(
  (word_embeddings): Embedding(15, 6)
  (lstm): LSTM(6, 6)
  (hidden2tag): Linear(in_features=6, out_features=3, bias=True)
)
NLLLoss()
SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.1
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)


In [85]:
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-1.0341, -1.1599, -1.1058],
        [-0.9666, -1.2035, -1.1410],
        [-1.0629, -1.1646, -1.0715],
        [-1.0081, -1.2106, -1.0875],
        [-0.9322, -1.2693, -1.1230],
        [-0.9678, -1.2411, -1.1056],
        [-0.9196, -1.2446, -1.1607],
        [-0.9279, -1.2243, -1.1691],
        [-1.0630, -1.1596, -1.0759]])
