In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split

In [2]:
# Empty model tables; the training loop fills them from the corpus.
initial = dict()       # first word of a line -> how often it starts a line
first_order = dict()   # first word of a line -> second words seen after it
second_order = dict()  # (word two back, previous word) -> words that followed

In [3]:
def remove_punctuation(s):
    """Return `s` with every character listed in `string.punctuation` deleted."""
    # A translation table that maps a character to None removes that character.
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    return s.translate(drop_table)

In [7]:
# Download the Robert Frost corpus into the working directory as robert_frost.txt.
# wget's -nc (no-clobber) flag skips the download when the file is already present.
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

--2024-07-30 15:13:37--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving to: ‘robert_frost.txt’


2024-07-30 15:13:38 (4.48 MB/s) - ‘robert_frost.txt’ saved [56286/56286]



In [8]:
def add2dict(d, k, v):
  """Append `v` to the list stored at `d[k]`, creating the list on first use."""
  d.setdefault(k, []).append(v)

In [9]:
# Rebuild the tables from scratch so this cell can be re-run safely. Without
# this, a re-run would add raw counts on top of already-normalized values in
# `initial` and would call .append on values that later cells turn into dicts.
initial.clear()
first_order.clear()
second_order.clear()

# `with` closes the corpus file once it has been read; a bare open() in the
# loop header leaves the handle open.
with open('robert_frost.txt') as corpus:
  for line in corpus:
    # lowercase, strip punctuation, split on whitespace
    tokens = remove_punctuation(line.rstrip().lower()).split()

    T = len(tokens)
    for i in range(T):
      t = tokens[i]
      if i == 0:
        # measure the distribution of the first word
        # (a one-token line contributes only here: it gets no first_order
        # entry and no 'END' marker)
        initial[t] = initial.get(t, 0.) + 1
      else:
        t_1 = tokens[i-1]
        if i == T - 1:
          # measure probability of ending the line:
          # the last two words of the line are followed by 'END'
          add2dict(second_order, (t_1, t), 'END')
        if i == 1:
          # measure distribution of second word
          # given only first word
          add2dict(first_order, t_1, t)
        else:
          # third word onwards: condition on the two preceding words
          t_2 = tokens[i-2]
          add2dict(second_order, (t_2, t_1), t)

In [10]:
# Turn the raw start-word counts into probabilities by dividing each count
# by the total number of counted line starts.
initial_total = sum(initial.values())
for word in initial:
    initial[word] /= initial_total

In [16]:
# Display the start-word table: first word of a line -> its share of all line starts.
initial

{'two': 0.005571030640668524,
 'and': 0.08983286908077995,
 'to': 0.034818941504178275,
 'then': 0.008356545961002786,
 'because': 0.0006963788300835655,
 'though': 0.004874651810584958,
 'had': 0.002785515320334262,
 'in': 0.0201949860724234,
 'oh': 0.002785515320334262,
 'yet': 0.0020891364902506965,
 'i': 0.08217270194986072,
 'somewhere': 0.0006963788300835655,
 'whose': 0.001392757660167131,
 'his': 0.004874651810584958,
 'he': 0.023676880222841225,
 'my': 0.004874651810584958,
 'between': 0.0020891364902506965,
 'the': 0.057103064066852366,
 'of': 0.0201949860724234,
 'but': 0.035515320334261836,
 'some': 0.003481894150417827,
 'from': 0.006963788300835654,
 'is': 0.003481894150417827,
 'natures': 0.0006963788300835655,
 'her': 0.001392757660167131,
 'so': 0.009052924791086351,
 'nothing': 0.001392757660167131,
 'when': 0.006267409470752089,
 'came': 0.0006963788300835655,
 'one': 0.00766016713091922,
 'proclaimed': 0.0006963788300835655,
 'smoothlaid': 0.0006963788300835655,
 'h

In [17]:
def list2pdict(ts):
  """Convert a list of observed tokens into a {token: relative frequency} dict.

  Each distinct token maps to (number of occurrences) / len(ts). Keys appear
  in order of first occurrence. An empty list gives an empty dict.
  """
  total = len(ts)
  counts = {}
  for token in ts:
    counts[token] = counts.get(token, 0.) + 1
  return {token: count / total for token, count in counts.items()}

In [20]:
for t_1, ts in first_order.items():
  # Replace each list of observed second words with a dictionary of
  # probabilities. Values that are no longer lists are left alone: calling
  # list2pdict on an already-converted dict would iterate its keys and
  # silently flatten the probabilities to a uniform distribution if this
  # cell were run a second time.
  if isinstance(ts, list):
    first_order[t_1] = list2pdict(ts)

In [22]:
# Display the second-word table: first word of a line -> {second word: probability}.
first_order

{'two': {'roads': 0.14285714285714285,
  'miles': 0.14285714285714285,
  'oldbelievers': 0.14285714285714285,
  'winds': 0.14285714285714285,
  'weeks': 0.14285714285714285,
  'of': 0.14285714285714285,
  'at': 0.14285714285714285},
 'and': {'sorry': 0.009708737864077669,
  'be': 0.009708737864077669,
  'looked': 0.009708737864077669,
  'having': 0.009708737864077669,
  'both': 0.009708737864077669,
  'that': 0.009708737864077669,
  'miles': 0.009708737864077669,
  'would': 0.009708737864077669,
  'dropped': 0.009708737864077669,
  'further': 0.009708737864077669,
  'when': 0.009708737864077669,
  'tell': 0.009708737864077669,
  'the': 0.009708737864077669,
  'caught': 0.009708737864077669,
  'put': 0.009708737864077669,
  'threw': 0.009708737864077669,
  'birds': 0.009708737864077669,
  'suddenly': 0.009708737864077669,
  'scurf': 0.009708737864077669,
  'since': 0.009708737864077669,
  'whats': 0.009708737864077669,
  'many': 0.009708737864077669,
  'blew': 0.009708737864077669,
  's

In [23]:
for k, ts in second_order.items():
  # Replace each list of observed next tokens with a dictionary of
  # probabilities. Values that are no longer lists are left alone: calling
  # list2pdict on an already-converted dict would iterate its keys and
  # silently flatten the probabilities to a uniform distribution if this
  # cell were run a second time.
  if isinstance(ts, list):
    second_order[k] = list2pdict(ts)

In [24]:
def sample_word(d):
  """Draw one key from `d`, a dict mapping token -> probability.

  A single uniform draw from np.random.random() is compared against the
  running sum of the probabilities, taken in the dict's iteration order; the
  first token whose running sum exceeds the draw is returned.

  Raises:
    ValueError: if `d` is empty, since there is nothing to sample.
  """
  if not d:
    raise ValueError("cannot sample from an empty distribution")

  p0 = np.random.random()
  cumulative = 0
  for t, p in d.items():
    cumulative += p
    if p0 < cumulative:
      return t

  # Floating-point rounding can leave the running sum a hair below 1 (ten
  # 0.1s add up to 0.9999999999999999), so a draw right at the top of the
  # range can fall through the loop. Return the last token in that case
  # instead of relying on an assert, which is stripped under `python -O`.
  return t

In [25]:
def generate(n_lines=4):
  """Print `n_lines` lines of text sampled from the trained tables.

  Each line starts with a word drawn from `initial`, takes its second word
  from `first_order`, and then keeps drawing from `second_order`, keyed on
  the last two words, until the 'END' token is sampled.

  Args:
    n_lines: number of lines to print. Defaults to 4.
  """
  for _ in range(n_lines):
    # initial word
    w0 = sample_word(initial)
    sentence = [w0]

    # A start word that was only ever seen on one-word lines has no
    # first_order entry (the training loop adds one only when a second word
    # exists), so the line ends after this single word.
    if w0 not in first_order:
      print(' '.join(sentence))
      continue

    # sample second word
    w1 = sample_word(first_order[w0])
    sentence.append(w1)

    # second-order transitions until END
    while True:
      w2 = sample_word(second_order[(w0, w1)])
      if w2 == 'END':
        break
      sentence.append(w2)
      # slide the two-word context forward by one
      w0, w1 = w1, w2
    print(' '.join(sentence))

In [26]:
# Print generated lines. No random seed is set in the cells shown here,
# so the text is expected to differ from run to run.
generate()


and stamped and said things to happen in their town a noted witch
but why take time for what im like to take it what was his was always hers
you take the timber
unfortunately all of one kind though
