# Example: generate dialogues
We can see dialogues as a mapping from a line of text to the line that follows it. The idea is then to use an LSTM to map one sequence of words into another sequence of words, where the two sequences may have different lengths.

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

## Prepare data

In [2]:
from langmodels.corpora.moviedialog import MovieDialogCollection

In [3]:
# Database and collection that hold the movie dialogue lines.
db_name, collection = 'movie-dialogs', 'lines'

In [4]:
# Aggregation pipeline that keeps only lines spoken in movies of the wanted
# genres and projects each line down to its id and text.
genre = ['sci-fi']

# One document per (line, genre) combination, so genres can be matched singly.
ug = {'$unwind': '$character.movie.genres'}
# Keep only the documents whose genre is one of the requested ones.
mg = {'$match': {'character.movie.genres': {'$in': genre}}}
# Drop everything except the line id and its text.
pg = {'$project': dict(_id=0, id=1, text=1)}

pipeline = [ug, mg, pg]

In [6]:
# Corpus object built over the configured database/collection and restricted
# by `pipeline`.
# NOTE(review): the meaning of use_pos / mix_pos is defined inside langmodels
# (presumably part-of-speech handling) — confirm against that package.
corpus = MovieDialogCollection(
    db_name,
    collection,
    use_pos=False,
    mix_pos=False,
    pipeline=pipeline,
)

## Training set preparation
We create (line, next line) pairs in which every line has a fixed size, using padding for shorter lines and truncation for longer ones.

In [11]:
# Build (line, next line) training pairs in which every line has exactly
# `padding` tokens: short lines are right-padded with the 'NONE' marker and
# long lines are cut off.
# NOTE(review): pairs follow whatever order get_tokens() yields, so presumably
# the last line of one conversation is paired with the first line of the
# next one — confirm whether that is intended.
padding = 5
V = corpus.vocabulary + ['NONE']  # vocabulary extended with the padding marker
training = []
current = None  # previous padded line; stays None until the first line is seen
for doc, tokens in corpus.get_tokens():
    missing = padding - len(tokens)
    if missing > 0:
        tokens_pad = tokens + ['NONE'] * missing
    else:
        # Exactly `padding` tokens or more: keep the first `padding` of them.
        tokens_pad = tokens[:padding]
    if current is not None:
        training.append((current, tokens_pad))
    current = tokens_pad

In [12]:
# Inspect the first (line, following line) pair.
training[0]

(['we', 'trying', 'to', 'get', 'there'],
 ['continued', 'NONE', 'NONE', 'NONE', 'NONE'])

In [13]:
# Lookup table from each vocabulary entry to its position in V.
word2idx = {word: position for position, word in enumerate(V)}

## Model

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so runs are repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x1a26e08190>

In [15]:
class LSTMlm(nn.Module):
    """Embedding -> LSTM -> linear model that scores one target per input token.

    Args:
        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of distinct input indices the embedding accepts.
        target_size: number of output classes scored at every position.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Layers are created in the order embedding, LSTM, linear.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Consumes the embeddings and emits one hidden_dim vector per step.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Projects each hidden state onto the target classes.
        self.hidden2tag = nn.Linear(hidden_dim, target_size)

    def forward(self, sentence):
        """Return log-probabilities over the targets for every token.

        Args:
            sentence: tensor of word indices for one sentence.

        Returns:
            Tensor with one row per token and one column per target class;
            each row is a log-softmax distribution.
        """
        n_tokens = len(sentence)
        # The LSTM is fed a (sequence, batch, feature) tensor with a batch of 1.
        lstm_input = self.word_embeddings(sentence).view(n_tokens, 1, -1)
        lstm_out, _ = self.lstm(lstm_input)
        # Drop the singleton batch dimension before the projection.
        tag_space = self.hidden2tag(lstm_out.view(n_tokens, -1))
        return F.log_softmax(tag_space, dim=1)

## Train

In [16]:
def sequence(seq, to_ix):
    """Convert a list of words into a 1-D tensor of vocabulary indices.

    Words that are missing from `to_ix` are silently skipped, so the result
    can be shorter than `seq`.

    Args:
        seq: iterable of words.
        to_ix: mapping from word to integer index.

    Returns:
        A `torch.long` tensor holding the indices of the known words, in order.
    """
    known = []
    for word in seq:
        if word in to_ix:
            known.append(to_ix[word])
    return torch.tensor(known, dtype=torch.long)

In [18]:
# Model hyper-parameters.
EMBEDDING_DIM = 100
HIDDEN_DIM = 64

# Inputs and outputs are drawn from the same vocabulary, so both sizes are len(V).
model = LSTMlm(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(V),
    target_size=len(V),
)
# Negative log-likelihood loss on the model's log-softmax scores.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [21]:
# Train on (line, next line) pairs: for every pair, score each target token
# from the LSTM output at the same position and take one SGD step.
epochs = tqdm_notebook(list(range(10)))
limit = 1000  # number of training pairs used per epoch; kept small to speed up the example
for epoch in epochs:
    # Honour `limit` instead of a hard-coded slice so changing it takes effect.
    for sentence, tags in training[:limit]:
        # Gradients accumulate across backward() calls, so clear them per example.
        model.zero_grad()
        sent = sequence(sentence, word2idx)
        target = sequence(tags, word2idx)
        tag_scores = model(sent)
        loss = loss_function(tag_scores, target)
        loss.backward()
        optimizer.step()

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




## Usage

In [25]:
# Probe sentence of five tokens, the last one being the 'NONE' padding marker.
test = ['my', 'name', 'is', 'john', 'NONE']
# Index form of the probe sentence, ready to be fed to the model.
s = sequence(seq=test, to_ix=word2idx)

In [26]:
# Greedy decoding: at every position pick the highest-scoring vocabulary entry.
# No gradients are needed for prediction, so autograd tracking is disabled.
with torch.no_grad():
    tags = [V[tag_index] for tag_index in model(s).argmax(dim=1).tolist()]

In [27]:
# Show the input words on one row and the predicted words on the row below,
# tab-separated.
input_words = [V[i] for i in s.tolist()]
for row in (input_words, tags):
    print("\t".join(row))

my	name	is	john	NONE
ted	you	you	be	NONE


## Exercise
Define a strategy for evaluating the performance of the network and try to improve it.