# Example: part of speech tagging
We will create a language model for part-of-speech (POS) tagging.

In particular, we want to feed the network a sequence of words $w_1, \dots, w_n$, with $w_i \in V$, and estimate, for each word $w_i$, a probability distribution $\phi(T)$ over the POS tags $T$.

In [79]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

## Prepare data

In [80]:
from langmodels.corpora.moviedialog import MovieDialogCollection

In [81]:
# Database and collection names handed to MovieDialogCollection
db_name, collection = 'movie-dialogs', 'lines'

In [82]:
# Aggregation pipeline: unwind the genre list, keep only the documents whose
# genre is one of the wanted ones, and project just the `id` and `text` fields.
genre = ['western']
ug = {
    '$unwind': '$character.movie.genres',
}
mg = {
    '$match': {
        'character.movie.genres': {'$in': genre},
    },
}
pg = {
    '$project': {'_id': 0, 'id': 1, 'text': 1},
}
pipeline = [ug, mg, pg]

In [83]:
# Corpus reader restricted to the documents selected by `pipeline`
corpus = MovieDialogCollection(
    db_name,
    collection,
    use_pos=False,
    mix_pos=True,  # NOTE(review): presumably yields "word_TAG" tokens; confirm
    pipeline=pipeline,
)

## Training set preparation

In [91]:
# Build the (words, tags) training pairs and collect the word vocabulary V
# and the tag set T. Each token is expected to look like "<word>_<TAG>".
training, V, T = [], set(), set()
for doc, tokens in corpus.get_tokens():
    # Split on the LAST underscore only: a word that itself contains "_"
    # keeps its full form and the tag is still read from the final field.
    parts = [t.rsplit('_', 1) for t in tokens]
    words = [p[0] for p in parts]
    tags = [p[1] for p in parts]
    V.update(words)
    T.update(tags)
    training.append((words, tags))
# Sort so that word/tag indices are the same on every run; the iteration
# order of a set of strings changes between interpreter sessions, which
# would make the seeded model initialisation non-reproducible.
V = sorted(V)
T = sorted(T)

In [92]:
# Peek at the first (words, tags) training pair
training[0]

(['now', 'you', 'tell', 'us', '.'], ['ADV', 'PRON', 'VERB', 'PRON', 'PUNCT'])

In [93]:
# Lookup tables mapping each word / tag to its position in V / T
word2idx = {word: idx for idx, word in enumerate(V)}
tag2idx = {tag: idx for idx, tag in enumerate(T)}

## Model

In [94]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix PyTorch's RNG seed so parameter initialisation is repeatable
torch.manual_seed(42)

<torch._C.Generator at 0x1a27796330>

In [95]:
class LSTMlm(nn.Module):
    """LSTM tagger: embeds word indices, runs an LSTM and scores each tag."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        """Create the embedding, LSTM and output layers.

        Args:
            embedding_dim: size of each word embedding vector.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows of the embedding table.
            target_size: number of output scores produced per word.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Word index -> dense vector
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Embeddings in, hidden states of size hidden_dim out
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Hidden state -> one score per target
        self.hidden2tag = nn.Linear(hidden_dim, target_size)

    def forward(self, sentence):
        """Return per-word log-probabilities over the targets.

        Args:
            sentence: tensor of word indices for a single sentence.

        Returns:
            Tensor with one row per word, log-softmax normalised along dim 1.
        """
        n_words = len(sentence)
        embedded = self.word_embeddings(sentence)
        # The LSTM is fed one sentence at a time, as a batch of size 1
        hidden_states, _ = self.lstm(embedded.view(n_words, 1, -1))
        scores = self.hidden2tag(hidden_states.view(n_words, -1))
        return F.log_softmax(scores, dim=1)

## Train

In [96]:
def sequence(seq, to_ix):
    """Convert a sequence of symbols into a 1-D tensor of their indices.

    Symbols that are not keys of ``to_ix`` are skipped, so the result may be
    shorter than ``seq``.

    Args:
        seq: iterable of symbols (e.g. words or tags).
        to_ix: mapping from symbol to integer index.

    Returns:
        A ``torch.long`` tensor holding the indices of the known symbols,
        in their original order.
    """
    known = []
    for symbol in seq:
        if symbol in to_ix:
            known.append(to_ix[symbol])
    return torch.tensor(known, dtype=torch.long)

In [97]:
# Hyper-parameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

# Tagger, negative log-likelihood loss (the model outputs log-probabilities)
# and plain SGD optimizer
model = LSTMlm(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(V),
    target_size=len(T),
)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

### Example

In [98]:
# Walk through the forward pass by hand on the first training sentence
example_words = training[0][0]
example_ids = sequence(example_words, word2idx)
embedded = model.word_embeddings(example_ids)
x, _ = model.lstm(embedded.view(len(example_words), 1, -1))
t = model.hidden2tag(x.view(len(example_words), -1))
F.log_softmax(t, dim=1)

tensor([[-2.7242, -2.9714, -2.7973, -2.8605, -2.9286, -2.9000, -2.8711, -2.8523,
         -2.7466, -2.9541, -2.9392, -2.8738, -2.6489, -2.9573, -2.7392, -2.8284,
         -2.6587],
        [-2.7778, -2.9681, -2.8251, -2.9521, -2.8338, -2.8873, -2.8896, -3.0296,
         -2.7647, -2.8975, -2.9781, -2.8586, -2.6336, -2.7665, -2.7590, -2.9270,
         -2.5454],
        [-2.8889, -2.8260, -2.8010, -2.8261, -2.9033, -2.9624, -2.7400, -3.0105,
         -2.8012, -2.9921, -2.9324, -2.9976, -2.6822, -2.8226, -2.5681, -2.8189,
         -2.7114],
        [-2.9844, -2.9396, -2.8421, -2.7116, -2.8482, -2.9610, -2.6532, -3.1048,
         -2.8885, -2.9583, -2.8147, -3.0550, -2.6126, -2.8081, -2.6163, -2.9446,
         -2.6192],
        [-2.9343, -3.0359, -2.7008, -2.9913, -2.8830, -2.8439, -2.7358, -2.8355,
         -2.9294, -2.9245, -2.8909, -3.0296, -2.6302, -2.8414, -2.6242, -2.8425,
         -2.6372]], grad_fn=<LogSoftmaxBackward>)

In [99]:
# Number of full passes over the training set
N_EPOCHS = 10

epochs = tqdm_notebook(range(N_EPOCHS))
for epoch in epochs:
    for sentence, tags in training:
        # A document with no tokens yields a zero-length tensor, which the
        # model's `view(len(sentence), 1, -1)` cannot reshape; skip it.
        if not sentence:
            continue
        # Gradients accumulate across backward() calls; reset them per sentence
        model.zero_grad()
        sent = sequence(sentence, word2idx)
        target = sequence(tags, tag2idx)
        tag_scores = model(sent)
        loss = loss_function(tag_scores, target)
        loss.backward()
        optimizer.step()

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




## Usage

In [114]:
# Toy sentence to tag; sequence() drops any word that is not in word2idx
test = [
    'this', 'is', 'a', 'very', 'nice',
    'example', 'for', 'sure', '.',
]
s = sequence(test, word2idx)

In [115]:
# Inference only: disable autograd so no computation graph is built
with torch.no_grad():
    # Index of the highest-scoring tag for every word (one row per word)
    predicted = model(s).argmax(dim=1)
tags = [T[tag_index] for tag_index in predicted.tolist()]

In [116]:
# Print the words that were actually tagged, then the predicted tags,
# as two tab-separated rows
kept_words = [V[i] for i in s]
print("\t".join(kept_words))
print("\t".join(tags))

this	is	a	very	nice	example	for	sure	.
DET	VERB	DET	ADV	ADJ	NOUN	ADP	ADJ	PUNCT


## Exercise
Define a strategy for evaluating the performance of the network.