In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import pdb

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x1240545f0>

# Example

In [None]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
# Why a tuple of 2 x (1,1,3) tensors?? --> Specific to LSTM!!  (hidden state, cell state)
# GRU only takes one tensor as hidden


for i in inputs:
    # Step through the sequence one element at a time.
    # after each step, hidden contains the hidden state.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print(out.shape)
    print(hidden)
    # hidden returns a tuple of 2 x tensors

In [None]:
torch.randn(1, 1, 3).shape, inputs[0].view(1,1,-1).shape

In [None]:
gru = nn.GRU(3,3)
o,h = gru(inputs[0].view(1,1,-1), hidden[0])
o.shape, h.shape

In [None]:
# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
print(inputs.shape)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out.shape)
print(hidden)

In [None]:
inputs = torch.randn(5,1,3)
hidden = torch.randn(1,1,3)
o,h = gru(inputs,hidden)
o.shape, h.shape

# Part of Speech Tagging

## Prepare Data

In [4]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
ix_to_tag = {0: "DET", 1: "NN", 2: "V"}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


## Model

In [16]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):  #=> ([ 0,  1,  2,  3,  4])
        embeds = self.word_embeddings(sentence) #=> ([5, 6])
        lstm_out, self.hidden = self.lstm(embeds.unsqueeze(1), self.hidden)  #=> ([5, 1, 6])
        #=> ([5, 1, 6]), (([1, 1, 6]),([1, 1, 6]))
        tag_space = self.hidden2tag(lstm_out.squeeze()) #=> ([5, 6])
        #=> ([5, 3])
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

## Train

In [5]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [8]:
training_data[0][0], word_to_ix

(['The', 'dog', 'ate', 'the', 'apple'],
 {'Everybody': 5,
  'The': 0,
  'apple': 4,
  'ate': 2,
  'book': 8,
  'dog': 1,
  'read': 6,
  'that': 7,
  'the': 3})

In [7]:
inputs = prepare_sequence(training_data[0][0], word_to_ix)
inputs.shape, inputs

(torch.Size([5]), tensor([ 0,  1,  2,  3,  4]))

In [17]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(500):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-1.1153, -1.0948, -1.0859],
        [-1.1902, -1.0613, -1.0502],
        [-1.1256, -1.0805, -1.0903],
        [-1.2653, -1.0699, -0.9814],
        [-1.2593, -1.0399, -1.0144]])
tensor([[-0.0692, -4.4247, -2.9032],
        [-6.8791, -0.0068, -5.1656],
        [-3.8429, -4.3300, -0.0352],
        [-0.0318, -5.8369, -3.5614],
        [-6.4012, -0.0086, -4.9726]])


In [18]:
# def check(output):
#     return [ix_to_tag[i] for i in (torch.argmax(output, 1).numpy)]

# check(tag_scores)

idxs = torch.argmax(tag_scores, 1).numpy()
[ix_to_tag[i] for i in idxs]

['DET', 'NN', 'V', 'DET', 'NN']

# Exercise: Augmenting the LSTM part-of-speech tagger with character-level features

In the example above, each word had an embedding, which served as the inputs to our sequence model. Let’s augment the word embeddings with a representation derived from the characters of the word. We expect that this should help significantly, since character-level information like affixes have a large bearing on part-of-speech. For example, words with the affix -ly are almost always tagged as adverbs in English.

To do this, let cw be the character-level representation of word w. Let xw be the word embedding as before. Then the input to our sequence model is the concatenation of xw and cw. So if xw has dimension 5, and cw dimension 3, then our LSTM should accept an input of dimension 8.

To get the character level representation, do an LSTM over the characters of a word, and let cw be the final hidden state of this LSTM. Hints:

There are going to be two LSTM’s in your new model. The original one that outputs POS tag scores, and the new one that outputs a character-level representation of each word.
To do a sequence model over characters, you will have to embed characters. The character embeddings will be the input to the character LSTM.

In [4]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
char_to_ix = {0: '_pad_'}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
        for char in list(word):
            if char not in char_to_ix:
                char_to_ix[char] = len(char_to_ix)
                
print(word_to_ix)
print(char_to_ix)

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
ix_to_tag = {0: "DET", 1: "NN", 2: "V"}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
CHAR_DIM = 3
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
{0: '_pad_', 'T': 1, 'h': 2, 'e': 3, 'd': 4, 'o': 5, 'g': 6, 'a': 7, 't': 8, 'p': 9, 'l': 10, 'E': 11, 'v': 12, 'r': 13, 'y': 14, 'b': 15, 'k': 16}


In [6]:
def prepare_char_sequence(seq):
    '''pad to the longest word in sequence and return a 2d tensor'''
    h = len(seq)
    w = len(max(seq, key=len))
    tens = torch.zeros(h,w, dtype=torch.long)
    for i,w in enumerate(seq):
        idxs = [char_to_ix[c] for c in w]
        tens[i,:len(idxs)] = torch.tensor(idxs, dtype=torch.long)
    return tens

In [133]:
char_inputs = prepare_char_sequence(training_data[0][0])
char_inputs.shape

torch.Size([5, 5])

In [126]:
char_embeddings = nn.Embedding(len(char_to_ix), 3)
res = char_embeddings(char_inputs)
res.shape

torch.Size([5, 5, 3])

In [7]:
def prepare_inputs(seq):
    res = []
    w_idxs = [word_to_ix[w] for w in seq]
    res.append(torch.tensor(w_idxs, dtype=torch.long))
    for w in seq:
        idxs = [char_to_ix[c] for c in list(w)]
        res.append((torch.tensor([word_to_ix[w]], dtype=torch.long), torch.tensor(idxs, dtype=torch.long)))
    return res

In [112]:
prepare_inputs(training_data[0][0])

[(tensor([ 0]), tensor([ 0,  1,  2])),
 (tensor([ 1]), tensor([ 3,  4,  5])),
 (tensor([ 2]), tensor([ 6,  7,  2])),
 (tensor([ 3]), tensor([ 7,  1,  2])),
 (tensor([ 4]), tensor([ 6,  8,  8,  9,  2]))]

In [18]:
class CHAR_LSTM_Tagger(nn.Module):

    def __init__(self, vocab_size, char_size, tagset_size):
        super(CHAR_LSTM_Tagger, self).__init__()

        self.char_embeddings = nn.Embedding(char_size, CHAR_DIM)
        self.char_lstm = nn.LSTM(CHAR_DIM, CHAR_DIM)
        
        self.word_embeddings = nn.Embedding(vocab_size, EMBEDDING_DIM)
        self.lstm = nn.LSTM(EMBEDDING_DIM+CHAR_DIM, HIDDEN_DIM)
        
        self.hidden2tag = nn.Linear(HIDDEN_DIM, tagset_size)
        
    def init_hidden(self, dim, bs=1):
        return (torch.zeros(1, bs, dim), torch.zeros(1, bs, dim))

    def forward(self, sentence, characters):        
#         char_lstm_out = []
#         for w in characters:
#             char_embeds = self.char_embeddings(w) #([?, 3])
#             out, _ = self.char_lstm(char_embeds.unsqueeze(1), self.init_hidden(char_dim)) #=> ([?, 1, 3])
#             char_lstm_out.append(out[-1])
        
#         char_lstm_out = torch.stack(char_lstm_out) #([5, 1, 3])

        char_embeds = self.char_embeddings(characters) #([?, 5, 3])
        char_out, _ = self.char_lstm(char_embeds, self.init_hidden(CHAR_DIM, len(characters[0]))) #=> ([?, 1, 3])
        
        char_out = torch.mean(char_out, 1) #([5,3])
        
        word_embeds = self.word_embeddings(sentence) #=> ([5, 6])
        inp = torch.cat((word_embeds,char_out), 1)       #=> ([5, 9])
        out, _ = self.lstm(inp.unsqueeze(1), self.init_hidden(EMBEDDING_DIM))
        tag_space = self.hidden2tag(out.squeeze(1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [20]:
model = CHAR_LSTM_Tagger(len(word_to_ix), len(char_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    word_inputs = prepare_sequence(training_data[0][0], word_to_ix)
    char_inputs = prepare_char_sequence(training_data[0][0])
    tag_scores = model(word_inputs, char_inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
#         model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        chars_in = prepare_char_sequence(sentence)
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in, chars_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    word_inputs = prepare_sequence(training_data[0][0], word_to_ix)
    char_inputs = prepare_char_sequence(training_data[0][0])
    tag_scores = model(word_inputs, char_inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-1.1564, -0.9660, -1.1881],
        [-1.0685, -1.0727, -1.1571],
        [-1.1041, -0.9955, -1.2075],
        [-1.0790, -1.1032, -1.1139],
        [-1.1232, -1.0572, -1.1167]])
tensor([[-0.3203, -1.7639, -2.2762],
        [-4.1823, -0.0193, -5.5623],
        [-2.6876, -4.8802, -0.0787],
        [-0.0381, -3.6916, -4.3832],
        [-2.3719, -0.1010, -5.8905]])


## Attempt 1

In [27]:
char_training_data = ["The dog ate the apple", "Everybody read that book"]

chars = set()
for sentence in char_training_data:
    charrs = set(list(sentence.replace(' ', '')))
    chars = chars.union(charrs)
    
itos = sorted(list(chars))
itos.insert(0, '_pad_')
itos.insert(1, '_unk_')

stoi = {v:k for k,v in enumerate(itos)}
print(stoi)

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

{'_pad_': 0, '_unk_': 1, 'E': 2, 'T': 3, 'a': 4, 'b': 5, 'd': 6, 'e': 7, 'g': 8, 'h': 9, 'k': 10, 'l': 11, 'o': 12, 'p': 13, 'r': 14, 't': 15, 'v': 16, 'y': 17}


In [67]:
class LSTM_CHARTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, char_size, tagset_size):
        super(LSTM_CHARTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.char_embeddings = nn.Embedding(char_size, embedding_dim, padding_idx=0)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.char_lstm = nn.LSTM(embedding_dim, embedding_dim)
        self.pos_lstm = nn.LSTM(embedding_dim+embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
#         self.char_hidden = self.init_hidden()
        self.pos_hidden = self.init_hidden()

    def init_hidden(self, c=1):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, c, self.hidden_dim),
                torch.zeros(1, c, self.hidden_dim))

#     def forward(self, sentence):  #=> ([ 0,  1,  2,  3,  4])
#         embeds = self.word_embeddings(sentence) #=> ([5, 6])
#         lstm_out, self.hidden = self.lstm(embeds.unsqueeze(1), self.hidden)  #=> ([5, 1, 6])
#         #=> ([5, 1, 6]), (([1, 1, 6]),([1, 1, 6]))
#         tag_space = self.hidden2tag(lstm_out.squeeze()) #=> ([5, 6])
#         #=> ([5, 3])
#         tag_scores = F.log_softmax(tag_space, dim=1)
#         return tag_scores
    
    def forward(self, sentence, characters):
        # sentence   => ([5]) (words)
        # characters => ([5, 5]) (words, chars)
        pdb.set_trace()
        char_embeds = self.char_embeddings(characters)  #=> ([5, 5, 6]) (words, chars, embeddings)
        self.char_hidden = self.init_hidden(characters.size()[1]) #=> (([1, 5, 6]), ([1, 5, 6]))
        char_lstm_out, self.char_hidden = self.char_lstm(char_embeds, self.char_hidden) #=> ([5, 5, 6])
        
        word_embeds = self.word_embeddings(sentence).unsqueeze(1) #=> ([5, 1, 6])
        pos_inp = torch.cat((word_embeds,char_lstm_out), 1)       #=> ([5, 6, 6])
        pos_lstm_out, self.pos_hidden = self.pos_lstm(pos_inp, self.pos_hidden)
        tag_space = self.hidden2tag(pos_lstm_out.squeeze())
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [65]:
def prepare_char_sequence(seq, to_ix):
    '''pad to the longest word in sequence and return a 2d tensor'''
    arr = seq.split()
    h = len(arr)
    w = len(max(arr, key=len))
    tens = torch.zeros(h,w, dtype=torch.long)
    for i,w in enumerate(arr):
        idxs = [to_ix[c] for c in w]
        tens[i,:len(idxs)] = torch.tensor(idxs, dtype=torch.long)
    return tens

In [62]:
char_inputs = prepare_char_sequence(char_training_data[0], stoi)
char_inputs

tensor([[  3.,   9.,   7.,   0.,   0.],
        [  6.,  12.,   8.,   0.,   0.],
        [  4.,  15.,   7.,   0.,   0.],
        [ 15.,   9.,   7.,   0.,   0.],
        [  4.,  13.,  13.,  11.,   7.]])

In [43]:
word_inputs = prepare_sequence(training_data[0][0], word_to_ix)
word_inputs

tensor([ 0,  1,  2,  3,  4])

In [68]:
model = LSTM_CHARTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(stoi), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    word_inputs = prepare_sequence(training_data[0][0], word_to_ix)
    char_inputs = prepare_char_sequence(char_training_data[0], stoi)
    tag_scores = model(word_inputs, char_inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        chars_in = prepare_char_sequence(sentence, stoi)
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in, chars_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

> <ipython-input-67-e8691a7c46c2>(42)forward()
-> char_embeds = self.char_embeddings(characters)  #=> ([5, 5, 6])
(Pdb) n
> <ipython-input-67-e8691a7c46c2>(43)forward()
-> self.char_hidden = self.init_hidden(characters.size()[1])
(Pdb) n
> <ipython-input-67-e8691a7c46c2>(44)forward()
-> char_lstm_out, self.char_hidden = self.char_lstm(char_embeds, self.char_hidden)
(Pdb) self.char_hidden.shape()
*** AttributeError: 'tuple' object has no attribute 'shape'
(Pdb) self.char_hidden.shape
*** AttributeError: 'tuple' object has no attribute 'shape'
(Pdb) self.char_hidden[0].shape
torch.Size([1, 5, 6])
(Pdb) n
> <ipython-input-67-e8691a7c46c2>(46)forward()
-> word_embeds = self.word_embeddings(sentence)
(Pdb) l
 41  	        pdb.set_trace()
 42  	        char_embeds = self.char_embeddings(characters)  #=> ([5, 5, 6])
 43  	        self.char_hidden = self.init_hidden(characters.size()[1])
 44  	        char_lstm_out, self.char_hidden = self.char_lstm(char_embeds, self.char_hidden)
 45  	
 46  -

BdbQuit: 

In [None]:
chars

## https://github.com/FraLotito/partofspeech-tagger/blob/master/post.py

In [70]:
word_to_ix = {}
car_to_ix = {}

def get_index_of_max(input):
    index = 0
    for i in range(1, len(input)):
        if input[i] > input[index]:
            index = i 
    return index

def get_max_prob_result(input, ix_to_tag):
    return ix_to_tag[get_index_of_max(input)]

def prepare_car_sequence(word, to_ix):
    idxs = []
    for car in word:
        idxs.append(to_ix[car])
    return idxs

def prepare_sequence(seq, to_ix):
    res = []
    for w in seq:
        res.append((to_ix[w], prepare_car_sequence(w, car_to_ix)))
    return res

def prepare_target(seq, to_ix):
    idxs = []
    for w in seq:
        idxs.append(to_ix[w])
    return autograd.Variable(torch.LongTensor(idxs))

In [73]:
prepare_sequence(training_data[0][0], word_to_ix)

[(0, [0, 1, 2]),
 (1, [3, 4, 5]),
 (2, [6, 7, 2]),
 (3, [7, 1, 2]),
 (4, [6, 8, 8, 9, 2])]

In [72]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
        for car in word:
        	if car not in car_to_ix:
        		car_to_ix[car] = len(car_to_ix)


tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
ix_to_tag = {0: "DET", 1: "NN", 2: "V"}

CAR_EMBEDDING_DIM = 3
WORD_EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [83]:
import torch.autograd as autograd

class CustomTagger(nn.Module):

    def __init__(self, word_embedding_dim, car_embedding_dim, hidden_dim, vocab_size, alphabet_size, tagset_size):

        super(CustomTagger, self).__init__()

        self.hidden_dim = hidden_dim
        self.word_embedding_dim = word_embedding_dim
        self.car_embedding_dim = car_embedding_dim

        self.car_embeddings = nn.Embedding(alphabet_size, car_embedding_dim)
        self.lstm_car = nn.LSTM(car_embedding_dim, car_embedding_dim)

        self.word_embeddings = nn.Embedding(vocab_size, word_embedding_dim)
        self.lstm_word = nn.LSTM(word_embedding_dim+car_embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        self.hidden = self.init_hidden(hidden_dim)
        self.hidden_car = self.init_hidden(car_embedding_dim)

    def init_hidden(self, dim):
        return (autograd.Variable(torch.zeros(1, 1, dim)),
                autograd.Variable(torch.zeros(1, 1, dim)))

    def forward(self, sentence):
        word_idxs = []
        lstm_car_result = []
        for word in sentence:
            self.hidden_car = self.init_hidden(self.car_embedding_dim) #(([1, 1, 3]),([1, 1, 3]))
            word_idxs.append(word[0])
            char_idx = autograd.Variable(torch.LongTensor(word[1])) # ([3])
            car_embeds = self.car_embeddings(char_idx) #([3, 3])
            lstm_car_out, self.hidden_car = self.lstm_car(car_embeds.unsqueeze(1), self.hidden_car) #([3, 1, 3])
            lstm_car_result.append(lstm_car_out[-1])  #([1, 3]) ????  why save only last row??

        lstm_car_result = torch.stack(lstm_car_result) #([5, 1, 3])
        word_embeds = self.word_embeddings(autograd.Variable(torch.LongTensor(word_idxs))).unsqueeze(1) #([5, 1, 6])
        lstm_in = torch.cat((word_embeds, lstm_car_result), 2) #([5, 1, 9])
        lstm_out, self.hidden = self.lstm_word(lstm_in, self.hidden) #([5, 1, 6])

        tag_space = self.hidden2tag(lstm_out.squeeze(1)) #([5, 6])
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [84]:
model = CustomTagger(WORD_EMBEDDING_DIM, CAR_EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(car_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


for epoch in range(300):  
    for sentence, tags in training_data:
        model.zero_grad()

        model.hidden = model.init_hidden(HIDDEN_DIM)

        sentence_in = prepare_sequence(sentence, word_to_ix)
        
        targets = prepare_target(tags, tag_to_ix)

        tag_scores = model(sentence_in)

        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

In [85]:
# ======================= TEST

test_sentence = training_data[0][0]
inputs = prepare_sequence(test_sentence, word_to_ix)
tag_scores = model(inputs)
for i in range(len(test_sentence)):
	print('{}: {}'.format(test_sentence[i],get_max_prob_result(tag_scores[i].data.numpy(), ix_to_tag)))

The: DET
dog: NN
ate: V
the: DET
apple: NN
