In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from nltk.corpus import brown
from nltk.tag import untag

torch.manual_seed(1)

<torch._C.Generator at 0x7fc4627608d0>

In [2]:
brown_news_tagged = brown.tagged_sents(categories='news', tagset='universal')
brown_news_words = brown.tagged_words(categories='news', tagset='universal')

brown_train = brown_news_tagged[100:]
brown_test = brown_news_tagged[:100]
test_sent = untag(brown_test[0])
print("Tagged: ", brown_test[0])
print('___________________________\n')
print("Untagged: ", test_sent)

Tagged:  [('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')]
___________________________

Untagged:  ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [3]:
def reformat(brown_dataset):
    training_data = []
    for sent in brown_dataset:
        untaged_sent = untag(sent)
        tags = [tag for _,tag in sent]
        training_data.append((untaged_sent,tags))
    return training_data

Utils 

In [53]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


def get_to_ix(training_data):
    to_ix = {}
    for sent, tags in training_data:
        for word in sent:
            if word not in to_ix:
                to_ix[word] = len(to_ix)
    return to_ix

def categoryFromOutput(output):
    # A tuple of (values, indices) is returned,
    # where the indices are the indices of the elements in the original input tensor.
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i]

def accuracy(real,pred):
    correct = 0
    N = 0
    for i in range(len(real)):
        for real_label,pred_label in zip(real[i],pred[i]):
            if real_label == categoryFromOutput(pred_label):
                    correct += 1
        N += len(real[i])
    return correct * 100 / N


More parameters

In [5]:
training_data = reformat(brown_train)
test_data = reformat(brown_test)

word_to_ix = get_to_ix(training_data+test_data)

tag_to_ix = {'ADJ':0,'ADP':1,'ADV':2,'CONJ':3,'DET':4,'NOUN':5,
             'NUM':6,'PRT':7,'PRON':8,'VERB':9,'.':10,'X':11}

all_categories = ['ADJ','ADP','ADV','CONJ','DET','NOUN',
                 'NUM','PRT','PRON','VERB','.','X']

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 100
HIDDEN_DIM = 6

In [6]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim).to('cuda'),
                torch.zeros(1, 1, self.hidden_dim).to('cuda'))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [7]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
model = model.cuda()
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    inputs = inputs.to('cuda')
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(10): 
    print("ephoc {}".format(epoch))
    for sentence, tags in training_data:
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix).to('cuda')

        # Step 3. Run our forward pass.
        sentence_in = sentence_in.to('cuda')
        tag_scores = model(sentence_in).to('cuda')

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    inputs = inputs.to('cuda')
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-2.3959, -2.6854, -2.5724, -2.9977, -2.3539, -2.7690, -2.0622,
         -2.7923, -2.1698, -2.7879, -2.4779, -2.2199],
        [-2.6655, -2.4565, -2.4383, -2.9171, -2.4000, -2.5787, -2.3422,
         -2.9593, -2.2046, -2.5828, -2.1214, -2.4887],
        [-2.5505, -2.8002, -2.9879, -2.9769, -2.3474, -2.7428, -2.4584,
         -3.0217, -1.8872, -2.6414, -1.9625, -2.2804],
        [-2.4983, -2.6355, -2.4682, -2.8517, -2.4176, -2.6173, -2.0370,
         -2.7326, -2.3782, -2.6639, -2.5523, -2.2470],
        [-2.6966, -2.5131, -2.5092, -2.7616, -2.4182, -2.2062, -2.4613,
         -2.6251, -2.2857, -2.6084, -2.2523, -2.6657],
        [-2.6895, -2.8626, -2.8727, -2.7660, -2.7094, -2.2310, -2.1871,
         -2.4408, -2.0116, -2.6274, -2.2276, -2.6832],
        [-2.6462, -2.7684, -2.7255, -2.7060, -2.5941, -2.3330, -2.1486,
         -2.4944, -2.2237, -2.5893, -2.3577, -2.4632],
        [-2.6363, -2.9015, -2.9608, -2.6441, -2.5465, -2.1891, -2.3132,
         -2.4432, -2.1660, -2.5705, -2.

In [8]:
test_labels = []
for test in test_data:
    test_labels.append(test[1])

In [55]:
test_results = []
with torch.no_grad():
    for t in test_data:
        inputs = prepare_sequence(t[0], word_to_ix)
        inputs = inputs.to('cuda')
        test_results.append(model(inputs))

In [57]:
print("Test result {}%".format(round(accuracy(test_labels,test_results),3)))

Test result 86.287%
