# Introduction to Neural Language Models using PyTorch
We want to estimate the probability of the next word $w_i$ given the words that precede it:

$$\hat{P}(w_i \mid w_1, \dots, w_{i-1})$$

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
import string
import nltk
import matplotlib.pyplot as plt

## Case study
We aim to train a network on the dialogue lines of a single movie from the Movie-Dialog dataset. Instead of using plain tokens, we create artificial tokens by combining each token with its part-of-speech tag (for example, `lucky_ADJ`).

**Example**

In [3]:
from langmodels.corpora.moviedialog import MovieDialogCollection

In [4]:
# Database and collection names handed to MovieDialogCollection below.
db_name = 'movie-dialogs'
collection = 'lines'

# Aggregation stages: first keep only the lines whose movie id is 'm42' ...
m = {
    '$match': {'character.movie.id': 'm42'},
}
# ... then project just `id` and `text` (`_id` is switched off).
p = {
    '$project': {'_id': 0, 'id': 1, 'text': 1},
}
pipeline = [m, p]

In [5]:
# Collection restricted by `pipeline` to a single movie.
# NOTE(review): mix_pos=True is presumably what produces the `token_POS`
# strings seen in the printed sample further down -- confirm in langmodels.
m42 = MovieDialogCollection(
    db_name,
    collection,
    use_pos=False,
    mix_pos=True,
    pipeline=pipeline,
)

In [6]:
# Retrieve the tokens of the selected movie. Judging by the sample printed
# below, each entry is a (line id, list of token_POS strings) pair.
tokens = m42.get_tokens()

In [7]:
# Show the first entry to illustrate the token_POS format.
print(tokens[0])

('L119167', ['--_PUNCT', 'he_PRON', "'s_VERB", 'just_ADV', 'a_DET', 'lucky_ADJ', 'guy_NOUN', '._PUNCT'])


## Word embeddings
Since our tokens are not plain words but words combined with their POS tags, we cannot use a pre-trained word embedding model. We therefore train a custom model on a larger corpus (see the [example](https://github.com/afflint/inforet/blob/master/thematic-studies/language-models/L04-wordembeddings.ipynb)).

In [8]:
# Stages for the larger embedding corpus: unwind the movie's genre list,
# keep the lines whose genre is 'drama', then project `id` and `text` only.
ug = {
    '$unwind': '$character.movie.genres',
}
mg = {
    '$match': {'character.movie.genres': {'$in': ['drama']}},
}
pg = {
    '$project': {'_id': 0, 'id': 1, 'text': 1},
}
# NOTE: this rebinds `pipeline`, which previously held the single-movie stages.
pipeline = [ug, mg, pg]

In [9]:
# Same collection settings as for the single movie, but fed with the
# drama-genre pipeline so that the embedding corpus is larger.
general = MovieDialogCollection(
    db_name,
    collection,
    use_pos=False,
    mix_pos=True,
    pipeline=pipeline,
)

# Build the Word2Vec training corpus: every triple yielded by
# get_skip_tokens becomes one three-token "sentence".
# A comprehension is used on purpose: its loop variables stay local, so the
# notebook-level `tokens` (loaded from m42 earlier) is no longer overwritten
# by the loop target, and the unused document id is discarded as `_`.
sequences = [
    [a, b, c]
    for _, skipgrams in general.get_skip_tokens(n=3, s=2)
    for a, b, c in skipgrams
]

print(len(sequences), sequences[0])

In [12]:
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

# Train embeddings on the token_POS sequences. Only min_count is set
# explicitly; every other hyper-parameter is left at gensim's default.
embedding_model = Word2Vec(sequences, min_count=30)

# Sanity check: the five nearest neighbours of 'can_VERB'.
# `topn=5` requests exactly five results instead of slicing the default
# result list, and the key is passed as a list rather than a bare string.
embedding_model.wv.most_similar(positive=['can_VERB'], topn=5)

# Persist the model so that the "Load model" step can reuse it without retraining.
embedding_model.save('langmodels/corpora/token_pos.word2vec')

### Load model

In [18]:
# Reload the embeddings from the same path the training step saved them to.
embedding_model = Word2Vec.load('langmodels/corpora/token_pos.word2vec')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so that runs are repeatable.
torch.manual_seed(42)