# Example: part of speech tagging
We will create a language model for part-of-speech (POS) tagging.

In particular, we feed the network a sequence of words $w_1, \dots, w_n$, with $w_i \in V$, and for each word $w_i$ we estimate a probability distribution $\phi(T)$ over the set of POS tags $T$.

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Prepare data

In [2]:
from langmodels.corpora.moviedialog import MovieDialogCollection

In [3]:
# Database and collection names handed to MovieDialogCollection.
db_name, collection = 'movie-dialogs', 'lines'

In [4]:
# Aggregation stages, applied in order:
#   ug - emit one record per entry of character.movie.genres
#   mg - keep the records whose genre is one of `genre`
#   pg - return only the `id` and `text` fields
genre = ['western']
ug = {
    '$unwind': '$character.movie.genres',
}
mg = {
    '$match': {
        'character.movie.genres': {'$in': genre},
    },
}
pg = {
    '$project': {'_id': 0, 'id': 1, 'text': 1},
}
pipeline = [ug, mg, pg]

In [5]:
# Corpus restricted by the aggregation pipeline defined above.
# NOTE(review): mix_pos=True presumably makes get_tokens() yield "word_TAG"
# strings (the next cell splits tokens on '_') - confirm against
# MovieDialogCollection.
corpus = MovieDialogCollection(
    db_name,
    collection,
    use_pos=False,
    mix_pos=True,
    pipeline=pipeline,
)

## Training set preparation

In [6]:
# Build the training set: one (words, tags) pair per document, together with
# the word vocabulary V and the tag inventory T.
training, V, T = [], set(), set()
for doc, tokens in corpus.get_tokens():
    # Split on the LAST underscore only: a word that itself contains '_'
    # would otherwise be cut short and its second fragment taken as the tag.
    parts = [t.rsplit('_', 1) for t in tokens]
    words = [p[0] for p in parts]
    tags = [p[1] for p in parts]
    V.update(words)
    T.update(tags)
    training.append((words, tags))
# Sort so that word/tag indices are identical across kernel restarts; the
# iteration order of a set of strings changes from one Python process to the
# next, which would make index-based results irreproducible.
V = sorted(V)
T = sorted(T)

In [7]:
training[0]

(['now', 'you', 'tell', 'us', '.'], ['ADV', 'PRON', 'VERB', 'PRON', 'PUNCT'])

In [8]:
# Lookup tables: each word / tag mapped to its position in V / T.
word2idx = {word: position for position, word in enumerate(V)}
tag2idx = {tag: position for position, tag in enumerate(T)}

## Model

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix the seed of PyTorch's random number generator before the model is built.
torch.manual_seed(42)

<torch._C.Generator at 0x7ff7f1a12710>

In [10]:
class LSTMlm(nn.Module):
    """Sequence tagger: word embeddings -> LSTM -> linear layer -> log-softmax.

    Args:
        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of entries in the embedding table.
        target_size: number of scores produced for every input position.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into dense vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Recurrent layer: reads the embeddings and emits one hidden state of
        # size hidden_dim per position.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projection from hidden-state space to target (tag) space.
        self.hidden2tag = nn.Linear(hidden_dim, target_size)

    def forward(self, sentence):
        """Return one row of log-probabilities per element of `sentence`."""
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Reshape to (n_tokens, 1, features): the middle axis of size 1 means
        # a single sentence is processed per call.
        hidden_states, _ = self.lstm(embedded.view(n_tokens, 1, -1))
        scores = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return F.log_softmax(scores, dim=1)

## Train

In [11]:
def sequence(seq, to_ix):
    """Encode `seq` as a 1-D long tensor of indices looked up in `to_ix`.

    Items without an entry in `to_ix` are left out, so the returned tensor
    can be shorter than `seq`.
    """
    idxs = []
    for item in seq:
        if item in to_ix:
            idxs.append(to_ix[item])
    return torch.tensor(idxs, dtype=torch.long)

In [12]:
# Hyper-parameters of the tagger.
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

# One embedding row per word in V, one output score per tag in T.
model = LSTMlm(EMBEDDING_DIM, HIDDEN_DIM, len(V), len(T))
# The model outputs log-probabilities (log_softmax), which is what NLLLoss expects.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

### Example

In [13]:
# Step-by-step version of model.forward() on the first training sentence,
# run before any training has taken place.
example_words = training[0][0]
example_ids = sequence(example_words, word2idx)
example_embeds = model.word_embeddings(example_ids)
x, _ = model.lstm(example_embeds.view(len(example_words), 1, -1))
t = model.hidden2tag(x.view(len(example_words), -1))
F.log_softmax(t, dim=1)

tensor([[-2.8374, -2.8072, -2.7428, -2.9349, -3.1515, -2.9466, -2.7810, -2.9349,
         -2.7547, -3.0652, -2.8674, -3.0949, -2.8811, -2.7860, -3.0822, -2.8745,
         -2.8268, -2.7895],
        [-2.8538, -2.8959, -2.7571, -2.9187, -3.0924, -2.9770, -2.9083, -2.9131,
         -2.7631, -3.0502, -2.9427, -3.0932, -2.8484, -2.7200, -3.0089, -2.7897,
         -2.7450, -2.8636],
        [-2.8142, -2.8713, -2.9109, -2.9240, -3.0856, -2.8989, -3.0851, -2.7651,
         -2.7207, -3.1799, -2.7715, -3.0318, -2.7937, -2.8357, -3.0370, -2.8234,
         -2.6940, -2.9427],
        [-2.9595, -3.0556, -2.8244, -2.9722, -3.1392, -2.9502, -2.8556, -2.8936,
         -2.7368, -3.0661, -2.7343, -2.9134, -2.6712, -2.8549, -3.0387, -2.8730,
         -2.7361, -2.8936],
        [-2.9606, -2.7847, -2.9484, -2.8296, -3.2152, -3.0139, -2.9636, -2.8995,
         -2.7725, -3.1719, -2.6807, -2.9501, -2.6844, -2.8338, -3.0941, -2.7803,
         -2.8170, -2.8196]], grad_fn=<LogSoftmaxBackward>)

In [15]:
# Stochastic gradient descent: one parameter update per training sentence,
# repeated for 10 passes over the training set.
epochs = tqdm(range(10))
for epoch in epochs:
    for words, gold_tags in training:
        # Gradients accumulate across backward() calls, so clear them first.
        model.zero_grad()
        inputs = sequence(words, word2idx)
        targets = sequence(gold_tags, tag2idx)
        predictions = model(inputs)
        loss = loss_function(predictions, targets)
        loss.backward()
        optimizer.step()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




## Usage

In [16]:
# Sentence to tag. Words missing from word2idx are dropped by sequence(),
# so `s` may hold fewer entries than `test`.
test = [
    'this', 'is', 'a', 'very', 'nice',
    'example', 'for', 'sure', '.',
]
s = sequence(test, word2idx)

In [17]:
# Predict the highest-scoring tag for every index in `s`.
# Inference needs no gradients, so disable autograd instead of building a
# graph that is never used.
with torch.no_grad():
    predicted = model(s).argmax(dim=1)
# Map each predicted index back to its tag name.
tags = [T[tag_index] for tag_index in predicted.tolist()]

In [18]:
# First row: the words actually fed to the model, recovered from their
# indices in `s`; second row: the predicted tags, tab-aligned underneath.
print("\t".join(V[word_index] for word_index in s.tolist()))
print("\t".join(tags))

this	is	a	very	nice	example	for	sure	.
DET	AUX	DET	ADV	ADJ	NOUN	ADP	ADJ	PUNCT


## Exercise
Define a strategy for evaluating the performance of the network.