##Set Up

In [76]:
# load libraries
import pandas as pd
import numpy as np
import re

import nltk

nltk.download('punkt')
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.util import bigrams
from nltk.lm.preprocessing import padded_everygram_pipeline

import requests

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Part I

In [79]:
# leverage requests package to load in a text from project gutenberg
# first do it with Prof's book

r = requests.get(r'https://www.gutenberg.org/cache/epub/64317/pg64317.txt')
great_gatsby = r.text

# first, remove unwanted new line and tab characters from the text
for char in ["\n", "\r", "\d", "\t"]:
    great_gatsby = great_gatsby.replace(char, " ")

# check
print(great_gatsby[:100])

﻿The Project Gutenberg eBook of The Great Gatsby        This ebook is for the use of anyone anywhere


In [125]:
# then trying it with my book selection [Sherlock Holmes] to see if results still work

sh = requests.get(r'https://www.gutenberg.org/files/48320/48320-0.txt')
sherlock_holmes = sh.text

# first, remove unwanted new line and tab characters from the text
for char in ["\n", "\r", "\d", "\t"]:
    sherlock_holmes = sherlock_holmes.replace(char, " ")

# check that it worked and make sure to start at the top
print(sherlock_holmes[3478:3750])

To Sherlock Holmes she is always _the_ woman. I have seldom heard  him mention her under any other name. In his eyes she eclipses and  predominates the whole of her sex. It was not that he felt any emotion  akin to love for Irene Adler. All emotions, and that one particul


In [126]:
# find the end
sherlock_holmes.index("THE END")

592705

In [127]:
# trim text to start of the bok
sherlock_holmes = sherlock_holmes[3478:592705]
print(sherlock_holmes[:100])

To Sherlock Holmes she is always _the_ woman. I have seldom heard  him mention her under any other n


In [128]:
# following steps for cleaning
def clean_text(text: str):
    # lowercase
    text = text.lower()

    # remove punctuation from text
    text = re.sub(r"[^\w\s]", "", text)

    # tokenize the text
    tokens = nltk.word_tokenize(text)

    # return your tokens
    return tokens

# call the function
sample_tokens = clean_text(text = sherlock_holmes)

# check
print(sample_tokens[:50])

['to', 'sherlock', 'holmes', 'she', 'is', 'always', '_the_', 'woman', 'i', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', 'in', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', 'it', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion', 'akin', 'to', 'love', 'for', 'irene', 'adler', 'all', 'emotions', 'and', 'that', 'one']


In [129]:
# create bigrams from the sample tokens
my_bigrams = bigrams(sample_tokens)

# check
list(my_bigrams)[:20]

[('to', 'sherlock'),
 ('sherlock', 'holmes'),
 ('holmes', 'she'),
 ('she', 'is'),
 ('is', 'always'),
 ('always', '_the_'),
 ('_the_', 'woman'),
 ('woman', 'i'),
 ('i', 'have'),
 ('have', 'seldom'),
 ('seldom', 'heard'),
 ('heard', 'him'),
 ('him', 'mention'),
 ('mention', 'her'),
 ('her', 'under'),
 ('under', 'any'),
 ('any', 'other'),
 ('other', 'name'),
 ('name', 'in'),
 ('in', 'his')]

Part II

In [130]:
# 2 is for bigrams
n = 2
#specify the text you want to use
text = sherlock_holmes

In [131]:
# step 1: tokenize the text into sentences
sentences = nltk.sent_tokenize(text)

# step 2: tokenize each sentence into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# step 3: convert each word to lowercase
tokenized_text = [[word.lower() for word in sent] for sent in tokenized_sentences]

#notice the sentence breaks and what the first 10 items of the tokenized text
print(tokenized_text[0])

['to', 'sherlock', 'holmes', 'she', 'is', 'always', '_the_', 'woman', '.']


In [132]:
# notice what the first 10 items are of the vocabulary
print(text[:100])

To Sherlock Holmes she is always _the_ woman. I have seldom heard  him mention her under any other n


In [133]:
# we imported this function from nltk
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

In [134]:
from nltk.lm import MLE
# we imported this function from nltk linear models (lm)
# it is for Maximum Likelihood Estimation

# MLE is the model we will use
lm = MLE(n)

In [135]:
# currently the vocab length is 0: it has no prior knowledge
len(lm.vocab)

0

In [136]:
# fit the model
# training data is the bigrams and unigrams
# the vocab is all the sentence tokens in the corpus

lm.fit(train_data, padded_sents)
len(lm.vocab)

9751

In [137]:
# inspect the model's vocabulary.
# be sure that a sentence you know exists (from tokenized_text) is in the
print(lm.vocab.lookup(tokenized_text[0]))

('to', 'sherlock', 'holmes', 'she', 'is', 'always', '_the_', 'woman', '.')


In [138]:
# see what happens when we include a word that is not in the vocab.
print(lm.vocab.lookup('to sherlock holmes she is baobab .'.split()))

('to', 'sherlock', 'holmes', 'she', 'is', '<UNK>', '.')


In [139]:
# how many times does bohemia appear in the model?
print(lm.counts['bohemia'])

# what is the probability of bohemia appearing?
# this is technically the relative frequency of bohemia appearing
lm.score('bohemia')

8


6.181186015066641e-05

In [142]:
# how often does (in, bohemia) occur and what is the relative frequency?
print(lm.counts[['in']]['bohemia'])
lm.score('in', 'bohemia'.split())

1


0.125

In [143]:
# what is the score of 'UNK'?

lm.score("<UNK>")

0.0

## Part III: Generate Text

In [144]:
# generate a 20 word sentence starting with the word, 'holmes'

print(lm.generate(20, text_seed= 'diamond', random_seed=15))

['your', 'address', 'me', ',', 'â\x80\x9d', 'asked', 'me', 'so', 'that', 'â\x80\x98for', 'mrs.', 'rucastle', 'suddenly', 'another', 'broad', ',', 'who', 'was', 'speaking', 'only']


In [145]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(lm, num_words, text_seed, random_seed=42):
    """
    :param model: An ngram language model from `nltk.lm.model`.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    """
    content = []
    for token in lm.generate(num_words, text_seed=text_seed, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [146]:
# Now generate sentences that look much nicer.
generate_sent(lm, 40, text_seed='diamond', random_seed=15)

'your address me, â\x80\x9d asked me so that â\x80\x98for mrs. rucastle suddenly another broad, who was speaking only trust that her husband already.'

In [147]:
generate_sent(lm, 20, text_seed='sherlock', random_seed = 93)

'which it may depend so far as either end, mr. fowler.â\x80\x9d â\x80\x9cthat is too lateâ\x80\x99â\x80\x9d] â\x80\x9cit must stop'

In [152]:
generate_sent(lm, 30, text_seed='card', random_seed = 17)

'is the wild and the push her again, fresh information as little fortune to waterloo bridge road, when we canâ\x80\x99t make myself with a minute or six hundred'

In [157]:
generate_sent(lm, 50, text_seed='violin', random_seed = 312)

'one of a couple of dubious and it would not finished.'