In [34]:
import numpy as np
import dill

In [35]:
from nltk.util import pad_sequence, ngrams, everygrams
from nltk.lm.preprocessing import pad_both_ends, flatten


In [36]:
# Toy corpus: two pre-tokenized "sentences" of single-letter tokens.
text = [list("ABC"), list("ABCDEF")]

In [37]:
# Plain bigrams of the first toy sentence, without any padding symbols.
list(ngrams(text[0], n=2))

[('A', 'B'), ('B', 'C')]

In [38]:
# Pad the sentence with start/end markers so the first and last words also
# appear inside a bigram.
padded_sentence = list(
    pad_sequence(
        text[0],
        n=2,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
)
list(ngrams(padded_sentence, n=2))

[('<s>', 'A'), ('A', 'B'), ('B', 'C'), ('C', '</s>')]

In [39]:
# Same padding as for bigrams, but sized for trigrams (n=3).
padded_sentence = list(
    pad_sequence(
        text[0],
        n=3,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
)
list(ngrams(padded_sentence, n=3))

[('<s>', '<s>', 'A'),
 ('<s>', 'A', 'B'),
 ('A', 'B', 'C'),
 ('B', 'C', '</s>'),
 ('C', '</s>', '</s>')]

In [40]:
# pad_both_ends adds the "<s>" / "</s>" markers for us; everygrams then
# yields every n-gram from length 1 up to max_len (here unigrams + bigrams).
padded_bigrams = [*pad_both_ends(text[0], n=2)]
list(everygrams(padded_bigrams, min_len=1, max_len=2))

[('<s>',),
 ('<s>', 'A'),
 ('A',),
 ('A', 'B'),
 ('B',),
 ('B', 'C'),
 ('C',),
 ('C', '</s>'),
 ('</s>',)]

Now, the function that does everything for us:

In [41]:
from nltk.lm.preprocessing import padded_everygram_pipeline

# One call pads every sentence and builds both the training everygrams and
# the token stream used for the vocabulary.
train, vocab = padded_everygram_pipeline(2, text)

To avoid re-creating the text in memory, both `train` and `vocab` are lazy iterators. They are evaluated on demand at training time.

To understand the output of `padded_everygram_pipeline`:

In [42]:
# Materialise both lazy iterators returned by the pipeline to inspect them:
# first the everygrams of each sentence, then the flat padded token stream.
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)
for sentence_everygrams in training_ngrams:
    print(list(sentence_everygrams), end="\n\n")
print("#############")
print(list(padded_sentences))

[('<s>',), ('<s>', 'A'), ('A',), ('A', 'B'), ('B',), ('B', 'C'), ('C',), ('C', '</s>'), ('</s>',)]

[('<s>',), ('<s>', 'A'), ('A',), ('A', 'B'), ('B',), ('B', 'C'), ('C',), ('C', 'D'), ('D',), ('D', 'E'), ('E',), ('E', 'F'), ('F',), ('F', '</s>'), ('</s>',)]

#############
['<s>', 'A', 'B', 'C', '</s>', '<s>', 'A', 'B', 'C', 'D', 'E', 'F', '</s>']


Using NLTK Tokenize

In [43]:
# Prefer NLTK's default tokenizers; when they cannot be imported or their
# tokenizer data is missing, fall back to a regex-based sentence splitter
# combined with ToktokTokenizer.
try:
    from nltk import word_tokenize, sent_tokenize
    # Smoke test: NLTK raises LookupError here if the tokenizer data is absent.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except (ImportError, LookupError):
    # Only the expected failures are caught, so unrelated errors and
    # KeyboardInterrupt are no longer silently swallowed.
    import re
    from nltk.tokenize import ToktokTokenizer

    def sent_tokenize(x):
        """Split a string into sentences on ., or ? followed by spaces and a capital letter."""
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)

    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

**SOME REAL DATA**

In [44]:
import os
import io

# Read the whole article into memory as a single UTF-8 string.
# Note: this rebinds `text`, which previously held the toy token lists.
corpus_path = os.path.join("data", "language-never-random.txt")
with io.open(corpus_path, encoding='utf8') as fin:
    text = fin.read()

In [45]:
# Split the text into sentences, tokenize each one, and lower-case every
# token; the result is one list of tokens per sentence.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]

In [46]:
# Peek at the first 500 characters of the raw text.
print(text[0:500])

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish 


**3-gram Model**

In [47]:
# Order of the language model: 3 means trigrams (plus the lower orders).
n = 3
train_data, padded_sents = padded_everygram_pipeline(order=n, text=tokenized_text)

Training an N-gram Model

In [48]:
# Start with a simple Maximum Likelihood Estimator.
from nltk.lm import MLE

model = MLE(n)
# The model has not been fitted yet, so its vocabulary is still empty.
print(len(model.vocab))

0


In [49]:
# Fit the model: the everygrams provide the counts, the padded token stream
# provides the vocabulary. Both inputs are lazy iterators, so re-running this
# cell alone would presumably see them already consumed -- re-create them
# with the pipeline cell first.
model.fit(train_data, padded_sents)
print(len(model.vocab))

1429


In [50]:
# Look tokens up in the fitted vocabulary; the output shows that a word the
# model does not know ("lah") is mapped to '<UNK>'.
probe_tokens = "language is never random lah .".split()
print(model.vocab.lookup(probe_tokens))

('language', 'is', 'never', 'random', '<UNK>', '.')


In [51]:
# Inspect raw n-gram counts: a plain string indexes unigram counts, while a
# list of context words indexes the counts of words following that context.
ngram_counts = model.counts
print(ngram_counts["language"])  # count('language')
print(ngram_counts[["language"]]["is"])  # count('is' | 'language')
print(ngram_counts[["language", "is"]]["never"])  # count('never' | 'language is')

25
11
7


In [52]:
# Relative-frequency scores for the same word/context pairs as the counts.
print(model.score('language'))  # P('language')
print(model.score('is', context='language'.split()))  # P('is'|'language')
print(model.score('never', context='language is'.split()))  # P('never'|'language is')

0.003916040100250626
0.44
0.6363636363636364


In [53]:
# Sample 20 tokens from the model; the fixed seed makes the output repeatable.
print(model.generate(num_words=20, random_seed=34))

['in', 'the', 'system', '…', 'the', 'performance', 'of', 'the', 'european', 'chapter', 'of', 'the', 'error', 'term', 'is', 'very', 'high', '.', '</s>', '</s>']


In [54]:
# Build a detokenizer that joins a list of tokens back into readable text.
from nltk.tokenize.treebank import TreebankWordDetokenizer

treebank_detokenizer = TreebankWordDetokenizer()
detokenize = treebank_detokenizer.detokenize

def generate_sent(model, num_words, random_seed=42, detokenizer=None):
    """Generate one human-readable sentence from a language model.

    Args:
        model: Object exposing ``generate(num_words, random_seed=...)``.
        num_words: Number of tokens to request from the model.
        random_seed: Seed forwarded to ``model.generate``.
        detokenizer: Optional callable that turns a list of tokens into a
            string. Defaults to the module-level ``detokenize``.

    Returns:
        The detokenized text built from the tokens generated before the
        first ``"</s>"``, with any ``"<s>"`` tokens dropped.
    """
    if detokenizer is None:
        detokenizer = detokenize

    generated = model.generate(num_words, random_seed=random_seed)
    # NLTK's generate() returns a bare string (not a list) when a single word
    # is requested; wrap it so the loop sees tokens rather than characters.
    if isinstance(generated, str):
        generated = [generated]

    content = []
    for token in generated:
        if token == "<s>":
            continue  # padding at the start of a sentence carries no text
        if token == "</s>":
            break  # end-of-sentence marker: stop at the first one
        content.append(token)
    return detokenizer(content)

# Generate one readable sentence of at most 20 tokens with a fixed seed.
generate_sent(model, num_words=20, random_seed=67)

', the sum is over the four cells of the conference of the cells in the empirical linguistics.'