In [75]:
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

import os
import re

from ipynb.fs.full.preprocessing import parse_poems
from HMM_soln import unsupervised_HMM

## Get rhyming dictionary

In [76]:
# Read the raw sonnets. The context manager closes the file handle as soon as
# the contents are in memory instead of leaving it to the garbage collector.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as sonnet_file:
    text = sonnet_file.read()

# remove sonnet numbers and convert to lowercase
text = re.sub(r'[0-9]+', '', text)
text = text.lower()
# one list element per line of the file (blank lines become empty strings)
text = text.split('\n')

In [79]:
# tokenize the data and filter out punctuation
tokens2 = [wordpunct_tokenize(s) for s in text]
punct = set(['.', ',', '!', ':', ';'])
# Besides the marks listed in punct, drop every token that contains no letter
# or digit. Otherwise a line ending in '?' or ')' (or a multi-character
# punctuation token such as "?'") would report that mark as its "last word".
filtered2 = [
    [w for w in sentence if w not in punct and any(ch.isalnum() for ch in w)]
    for sentence in tokens2
]
# discard lines that contained nothing but punctuation / whitespace
filtered2 = [x for x in filtered2 if len(x) != 0]

# get last word in each line (every remaining sentence is non-empty)
last2 = [sentence[-1] for sentence in filtered2]

In [84]:
# get syllables and word pronunciations
# Index the pronouncing dictionary once, keeping only the words we need.
# The original re-read and scanned every dictionary entry for each word in
# last2; a single pass yields the same tuples in the same order.
wanted_words = set(last2)
pron_index = {}
for entry_word, entry_pron in cmudict.entries():
    if entry_word in wanted_words:
        pron_index.setdefault(entry_word, []).append(entry_pron)

# One list per line-ending word; each element is
# (word, length of pronunciation list, pronunciation). Words that are absent
# from the dictionary map to an empty list.
syllables = [
    [(word, len(p), p) for p in pron_index.get(word, [])]
    for word in last2
]

## Function for determining rhymes

In [104]:
def rhymes(s, entries=None):
    '''
    Function that determines rhyming words by comparing the ending sounds.

    Two words are treated as rhyming when their pronunciation lists have the
    same length and share the same last two elements.

    Input:
    s: a list of tuples (w, len(p), p) where w is a word, p is its
    pronunciation and len(p) is the length of p (the number of phonemes when
    p is a CMUdict pronunciation list). Only the first tuple, s[0], is used.
    entries: optional iterable of (word, pronunciation) pairs to search.
    Defaults to cmudict.entries(); pass a pre-loaded list to avoid re-reading
    the corpus on every call.

    Output:
    filtered: a list of words that rhyme with w (w itself is included when it
    appears in the searched entries). If the search itself fails, returns [w].
    If s is empty or malformed, returns [].
    '''

    # An empty / malformed candidate list has no word to rhyme with.
    try:
        (w, l, p) = s[0]
    except (IndexError, KeyError, TypeError, ValueError):
        return []

    try:
        source = cmudict.entries() if entries is None else entries
        return [wt for (wt, pt) in source if l == len(pt) and p[-2:] == pt[-2:]]
    except Exception:
        # Best-effort fallback kept from the original. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt propagate,
        # so a long scan can still be interrupted.
        return [w]

In [None]:
# look up the rhyming words for every line-ending word, in order
list(map(rhymes, syllables))

## Load data and train HMM

In [None]:
# load data and reverse all words
# The context manager closes the file handle once the contents are read.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as sonnet_file:
    text = sonnet_file.read()
# Reverse the order of the space-separated chunks; newlines are not split on,
# so they stay attached to their neighbouring words.
text = text.split(' ')
text.reverse()
text = ' '.join(text)
obs, obs_map = parse_poems(text)

In [None]:
# Train an HMM on the observations parsed from the word-reversed text.
# NOTE(review): the positional arguments 8 and 100 are presumably the number of
# hidden states and the number of training iterations — confirm against
# HMM_soln.unsupervised_HMM.
hmm8 = unsupervised_HMM(obs, 8, 100)

In [None]:
# NOTE(review): this cell repeats the training call of the preceding cell
# verbatim and rebinds hmm8, so a full run trains twice. Presumably a
# leftover — consider deleting one of the two cells.
hmm8 = unsupervised_HMM(obs, 8, 100)

## Generate sonnets

In [None]:
from ipynb.fs.full.unsupervised_hmm_poems import obs_map_reverser, sample_sentence

def generate_obs(hmm, obs_map):
    '''
    Naively generates and prints a 14-line sonnet with 8 words per line.

    All 112 words are requested from sample_sentence in a single call, the
    result is split on single spaces, and consecutive groups of 8 words
    become the lines of the sonnet.

    Inputs:
    hmm: trained hmm
    obs_map: maps word to observation index

    Outputs:
    None (the sonnet is printed line by line)
    '''
    n_lines = 14
    words_per_line = 8

    # generate all words in sonnet
    all_words = sample_sentence(hmm, obs_map, n_lines * words_per_line)
    all_words = all_words.split(' ')

    # split into lines and add capitalization/naive punctuation
    for i in range(n_lines):
        start = i * words_per_line
        line = ' '.join(all_words[start:start + words_per_line]).capitalize()
        # lines with index 11 and 13 end with a period, all others with a comma
        if i < 11 or i == 12:
            line += ','
        else:
            line += '.'
        print(line)

def generate_by_line(hmm, obs_map):
    '''
    Prints a 14-line sonnet, sampling every line with its own call to
    sample_sentence (8 words requested per call).

    Inputs:
    hmm: trained hmm
    obs_map: maps word to observation index

    Outputs:
    None (each line is printed as it is generated)
    '''
    # lines with these indices end with a period; every other line gets a comma
    period_lines = (11, 13)
    for line_no in range(14):
        ending = '.' if line_no in period_lines else ','
        print(sample_sentence(hmm, obs_map, 8) + ending)