# Introduction to Neural Language Models

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

## Recap of probability chains in language

In [30]:
import json
from nltk import word_tokenize

In [18]:
# Absolute path to the recipe JSON dump on the author's machine; adjust locally.
raw_data = '/Users/flint/Data/recipe/recipe1M_layers/layer1.json'
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding (which is not UTF-8 everywhere).
with open(raw_data, 'r', encoding='utf-8') as raw_data_file:
    # Later cells index this as rdata[i]['instructions'][j]['text'].
    rdata = json.load(raw_data_file)

In [39]:
def tokenizer(x):
    """Lowercase and word-tokenize a piece of text, adding boundary markers.

    Args:
        x: The text to tokenize (a string).

    Returns:
        A list of tokens: ``'[sos]'``, then the ``word_tokenize`` tokens of
        the lowercased text, then ``'[eos]'``.
    """
    # Defined with ``def`` rather than a lambda bound to a name (PEP 8), so
    # tracebacks and reprs show ``tokenizer`` instead of ``<lambda>``.
    return ['[sos]'] + word_tokenize(x.lower()) + ['[eos]']
def recipe2text(x):
    """Render one recipe from ``rdata`` as a single space-separated token string.

    Every entry of the recipe's ``'instructions'`` list has its ``'text'``
    passed through ``tokenizer`` (which wraps it in ``[sos]`` / ``[eos]``);
    the whole recipe is then wrapped in ``[start]`` ... ``[stop]``.

    Args:
        x: Index into the module-level ``rdata`` selecting the recipe.

    Returns:
        The recipe as one string of space-separated tokens.
    """
    # Defined with ``def`` rather than a lambda bound to a name (PEP 8).
    steps = [" ".join(tokenizer(y['text'])) for y in rdata[x]['instructions']]
    return " ".join(['[start]'] + steps + ['[stop]'])

In [45]:
# Token strings for the first 10**4 recipes, in their original order.
corpus = list(map(recipe2text, range(10 ** 4)))

### A different overview of n-gram models

In [50]:
import networkx as nx
from nltk import ngrams

In [71]:
def text2graph(recipe, n=2):
    """Build a directed graph of overlapping token windows from a recipe string.

    The recipe is split on whitespace and left-padded with ``'[pad]'`` before
    n-grams are taken. Each n-gram contributes one edge: from the node named
    by its first ``n - 1`` tokens to the node named by its last ``n - 1``
    tokens (tokens joined with a single space).

    Args:
        recipe: Whitespace-separated token string.
        n: Size of the n-grams used to generate edges.

    Returns:
        An ``nx.DiGraph`` holding one edge per distinct n-gram.
    """
    tokens = recipe.split()
    windows = ngrams(tokens, n=n, pad_left=True, left_pad_symbol='[pad]')
    graph = nx.DiGraph()
    graph.add_edges_from(
        (" ".join(window[:-1]), " ".join(window[1:])) for window in windows
    )
    return graph

In [74]:
# Trigram graph of the first recipe: with n=3 every node is a pair of tokens.
g = text2graph(recipe=corpus[0], n=3)

[pad] [pad] [pad] [start]
[pad] [start] [start] [sos]
[start] [sos] [sos] preheat
[sos] preheat preheat the
preheat the the oven
the oven oven to
the oven oven .
oven to to 350
to 350 350 f.
350 f. f. butter
f. butter butter or
butter or or oil
or oil oil an
oil an an 8-inch
an 8-inch 8-inch baking
8-inch baking baking dish
baking dish dish .
dish . . [eos]
. [eos] [eos] [sos]
. [eos] [eos] [stop]
[eos] [sos] [sos] cook
[eos] [sos] [sos] (
[eos] [sos] [sos] rinse
[eos] [sos] [sos] combine
[eos] [sos] [sos] scrape
[eos] [sos] [sos] sprinkle
[eos] [sos] [sos] bake
[eos] [sos] [sos] let
[eos] [sos] [sos] melt
[eos] [sos] [sos] continue
[eos] [sos] [sos] slowly
[eos] [sos] [sos] remove
[eos] [sos] [sos] add
[eos] [sos] [sos] stir
[eos] [sos] [sos] use
[eos] [sos] [sos] this
[eos] [sos] [sos] one-half
[eos] [sos] [sos] the
[eos] [sos] [sos] it
[sos] cook cook the
[sos] cook cook until
cook the the penne
the penne penne 2
penne 2 2 minutes
2 minutes minutes less
2 minutes minutes .
minutes l