In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import string

In [30]:
#import gdown
#!gdown https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
#!gdown https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

In [31]:
# Open a read-only handle on the Robert Frost corpus.
# NOTE(review): this handle is never closed and is not read by any cell visible
# here -- the training loop opens the file again on its own. Confirm that no
# other cell relies on `robert_file`, then drop it or use a `with` block.
robert_file= open('robert_frost.txt', 'r')

In [32]:
# Pin NumPy's global random state so the sampled lines are repeatable when the
# notebook is run top to bottom on a fresh kernel.
np.random.seed(seed=1234)

In [33]:
# The three tables of the second-order Markov model:
#   initial      -- counts (later probabilities) of the first word of a line
#   first_order  -- second-word observations, keyed by the first word
#   second_order -- next-word observations, keyed by the (word, word) pair before it
initial, first_order, second_order = {}, {}, {}

In [34]:
def remove_punctuation(s):
    """Return ``s`` with every character listed in ``string.punctuation`` deleted."""
    # Mapping a character to None in a translation table removes it.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(strip_table)

In [35]:
def add2dict(d, k, v):
    """Append ``v`` to the list stored at ``d[k]``, creating that list on first use."""
    d.setdefault(k, []).append(v)

In [36]:
# Collect the raw transition observations, one corpus line at a time.
# The file is read inside a ``with`` block so the handle is closed as soon as
# the pass is finished instead of being left open for the life of the kernel.
with open('robert_frost.txt') as corpus:
    for line in corpus:
        tokens = remove_punctuation(line.rstrip().lower()).split()

        T = len(tokens)
        for i, t in enumerate(tokens):
            if i == 0:
                # measure the distribution of the first word
                initial[t] = initial.get(t, 0.) + 1
                continue

            t_1 = tokens[i - 1]
            if i == T - 1:
                # last word of the line: record that this pair can end a line
                add2dict(second_order, (t_1, t), 'END')
            if i == 1:
                # the second word is conditioned on the first word only
                add2dict(first_order, t_1, t)
            else:
                # every later word is conditioned on the two words before it
                t_2 = tokens[i - 2]
                add2dict(second_order, (t_2, t_1), t)
            
            

In [37]:
# Normalize the first-word counts into a probability distribution.
initial_total = sum(initial.values())
for word in initial:
    initial[word] = initial[word] / initial_total

In [38]:
# convert [cat, cat, cat, dog, dog, dog, mouse ,...]
# into {cat: 0.5, dog: 0.4, mouse: 0.1}
def list2pdicts(ts):
    """Convert a list of observed tokens into a ``{token: relative frequency}`` dict.

    Keys appear in order of first occurrence; an empty list gives an empty dict.
    """
    counts = {}
    for token in ts:
        counts[token] = counts.get(token, 0.) + 1

    total = len(ts)
    return {token: count / total for token, count in counts.items()}
    


In [39]:
for t_1, ts in first_order.items():
    # Replace each list of observed second words with a dictionary of
    # probabilities. Entries that are already dictionaries are skipped: feeding
    # one back into list2pdicts would count every key once and silently turn
    # the distribution uniform, which is what happened if this cell was re-run.
    if isinstance(ts, list):
        first_order[t_1] = list2pdicts(ts)

In [40]:
for k, ts in second_order.items():
    # Replace each list of observed next words (including the 'END' marker)
    # with a dictionary of probabilities. Entries that are already dictionaries
    # are skipped: feeding one back into list2pdicts would count every key once
    # and silently turn the distribution uniform if this cell was re-run.
    if isinstance(ts, list):
        second_order[k] = list2pdicts(ts)

In [42]:
def sample_word(d):
    """Draw one key from ``d`` with probability equal to its value.

    Parameters
    ----------
    d : dict
        Maps each candidate word to its probability. The values are expected
        to sum to 1 (up to floating-point rounding).

    Returns
    -------
    The sampled key.

    Raises
    ------
    ValueError
        If ``d`` is empty, or if its probabilities sum to clearly less than 1
        and the random draw falls beyond that total.
    """
    if not d:
        raise ValueError("cannot sample from an empty distribution")

    # Exactly one uniform draw in [0, 1) per call, so the random stream (and
    # therefore seeded output) is consumed the same way as before.
    p0 = np.random.random()
    cumulative = 0
    for t, p in d.items():
        cumulative += p
        if p0 < cumulative:
            return t

    # Falling out of the loop means p0 >= the accumulated total. For a properly
    # normalised distribution this only happens when rounding leaves the sum a
    # hair below 1 (ten values of 0.1 add up to 0.9999999999999999), so the
    # draw belongs to the final bucket. A larger shortfall is a genuine error.
    if cumulative < 1 - 1e-8:
        raise ValueError(
            "probabilities sum to %r, expected 1" % (cumulative,)
        )
    return t

In [44]:
def generate(n_lines=4):
    """Print ``n_lines`` lines of text sampled from the trained Markov model.

    Each line starts with a word drawn from ``initial``, takes its second word
    from ``first_order``, and then follows ``second_order`` transitions until
    the 'END' marker is drawn.

    Parameters
    ----------
    n_lines : int, optional
        Number of lines to print. Defaults to 4, the count that was previously
        hard-coded.
    """
    for _ in range(n_lines):
        # initial word
        w0 = sample_word(initial)
        sentence = [w0]

        if w0 not in first_order:
            # first_order only gets a key for words that began a line of two or
            # more tokens, so this word was only ever seen as a whole line by
            # itself. Emit it as a one-word line instead of raising KeyError.
            print(w0)
            continue

        # sample second word
        w1 = sample_word(first_order[w0])
        sentence.append(w1)

        # second-order transitions until END
        while True:
            w2 = sample_word(second_order[(w0, w1)])
            if w2 == 'END':
                break
            sentence.append(w2)
            # slide the two-word context window forward by one word
            w0, w1 = w1, w2
        print(' '.join(sentence))
            

In [45]:
# Print freshly sampled lines from the trained model.
generate()

hes nothing listen when i looked at nine the swarm was turned to rock
as often as he chose
and if youre lost enough to stock a village library
though not yet all gone out of beaten ways


In [46]:
# Sample again: the global NumPy RNG is not reseeded between calls, so this
# continues the same random stream rather than repeating the previous output.
generate()

no one on earth will ever live on it again
with whose vast wheels
its to say
i know


In [48]:
# Another sample drawn from the same, still-advancing random stream.
generate()

up to pass a winter eve
to make them out
and then someone
dyou know a person so related to herself


In [49]:
# One more sample drawn from the same, still-advancing random stream.
generate()

you take the polish off the ground
no i dont follow you
in leaves no step had trodden black
first theres the childrens house of makebelieve
