In [1]:
import os
import re

import nltk
import numpy as np
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

from HMM_soln import unsupervised_HMM
from ipynb.fs.full.preprocessing import parse_poems, parse_observations

## Get rhyming dictionary

In [2]:
# Read the raw sonnets; the context manager closes the file handle as soon as
# the contents are in memory instead of leaving it open for the kernel's life.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as f:
    text = f.read()

# remove sonnet numbers (every run of digits) and convert to lowercase
text = re.sub(r'[0-9]+', '', text)
text = text.lower()

# build the observation sequences and the word -> index map from the cleaned text
obs, obs_map = parse_observations(text)

# from here on `text` is a list of lines, which the tokenizing cell iterates over
text = text.split('\n')

In [3]:
# Split every line into word/punctuation tokens, discard the punctuation
# marks listed in `punct`, and drop lines that end up with no tokens at all.
punct = set('.,!:;')
tokens2 = list(map(wordpunct_tokenize, text))
filtered2 = [
    kept
    for kept in ([tok for tok in line if tok not in punct] for line in tokens2)
    if kept
]

# The final token of each remaining line is the word that has to rhyme.
last2 = [line[-1] for line in filtered2]

In [4]:
# get word pronunciations for every line-ending word
# Index CMUdict once (word -> [(word, len(pronunciation), pronunciation), ...])
# rather than re-reading and scanning the whole dictionary for each word in last2.
pron_index = {}
for (w, p) in cmudict.entries():
    pron_index.setdefault(w, []).append((w, len(p), p))

# One list per entry of last2, in the same order; words missing from CMUdict
# get an empty list. Each element is copied so the lists stay independent.
syllables = [list(pron_index.get(word, [])) for word in last2]

### Function for finding rhyming words

In [5]:
def rhymes(s, obs_map, entries=None):
    '''
    Function that determines rhyming words by comparing the ending sounds.

    A dictionary word counts as a rhyme when its pronunciation has the same
    length as the reference pronunciation, ends in the same two symbols, and
    the word itself is a key of obs_map.

    Inputs:
    s: a list of tuples (w, len(p), p), one per pronunciation of the word w,
    where p is a pronunciation and len(p) is its length (the number of symbols
    in the transcription, which is not necessarily the number of syllables).
    Only the first tuple, s[0], is used.
    obs_map: maps all unique words in dataset to an integer
    entries: optional iterable of (word, pronunciation) pairs to search.
    Defaults to cmudict.entries(); pass a pre-built list to avoid re-reading
    the dictionary on every call.

    Output:
    filtered: a list of words that rhyme with w (a word can appear more than
    once if several of its pronunciations match). If none, returns [w]. If s
    is empty or its first element is not a 3-tuple, returns [].
    '''

    # Only the failures that mean "no usable pronunciation" are tolerated;
    # anything else (including KeyboardInterrupt) propagates to the caller.
    try:
        (w, l, p) = s[0]
    except (IndexError, TypeError, ValueError):
        return []

    if entries is None:
        entries = cmudict.entries()

    ending = p[-2:]
    filtered = [wt for (wt, pt) in entries
                if l == len(pt) and pt[-2:] == ending and wt in obs_map]

    # Fall back to the word itself so callers always get a non-empty list.
    return filtered if filtered else [w]

In [None]:
# get rhyming dictionary: line-ending word -> list of rhyming words
rhyme_dict = {}
for s in syllables:
    if not s:
        # this line ending has no CMUdict pronunciation, so there is nothing to look up
        continue
    w = s[0][0]
    if w in rhyme_dict:
        # same word ended an earlier line; its rhymes are already stored
        continue
    rhyme_dict[w] = rhymes(s, obs_map)

## Load data and train HMM

In [None]:
# load data and reverse all words
# The context manager closes the file handle once the text has been read.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as f:
    text = f.read()

# Reverse the order of the space-separated chunks so the HMM is trained on
# the poems read backwards (line endings come first).
text = ' '.join(reversed(text.split(' ')))

# NOTE: this rebinds obs / obs_map, which an earlier cell built with
# parse_observations; rhyme_dict was filtered against that earlier obs_map.
obs, obs_map = parse_poems(text)

In [None]:
# Fit the unsupervised HMM on the reversed-poem observations.
hmm8 = unsupervised_HMM(
    obs,
    8,    # presumably the number of hidden states -- confirm against HMM_soln
    100,  # presumably the number of training iterations -- confirm against HMM_soln
)

In [None]:
# hmm8 is already trained by the identical unsupervised_HMM(obs, 8, 100) call
# in the previous cell; training it a second time here only discarded that
# model and doubled the run time, so this cell no longer retrains.

## Generate sonnets

In [None]:
from ipynb.fs.full.unsupervised_hmm_poems import obs_map_reverser, sample_sentence

def _rhyme_seed(word, obs_map, rhyming_dict):
    '''Return the observation index of a randomly chosen rhyme for word, or of word itself if it has no usable rhyme.'''
    # Only rhymes that exist in obs_map can be turned into an HMM seed.
    candidates = [c for c in rhyming_dict.get(word, []) if c in obs_map]
    if not candidates:
        return obs_map[word]
    return obs_map[np.random.choice(candidates)]


def _following_seed(hmm, prev):
    '''Return one observation index generated by the HMM from the seed prev.'''
    step, _ = hmm.generate_emission(1, prev)
    # generate_emission is assumed to return a length-1 sequence here (TODO
    # confirm); ravel also copes with a bare scalar.
    return int(np.ravel(step)[-1])


def generate_rhyming_obs(hmm, obs_map, rhyming_dict):
    '''
    Seeds end of a line with a word and generates rest of line going backwards.
    Generates a full 14-line sonnet in this manner, using the rhyme scheme
    abab cdcd efef gg. Lines are produced from the last line of the sonnet to
    the first, so the closing couplet is generated first.

    Inputs:
    hmm: trained hmm exposing generate_emission(n, seed) -> (emission, states)
    obs_map: maps word to observation index
    rhyming_dict: maps word to list of rhyming words. Words that are missing,
    or whose rhymes are all absent from obs_map, are rhymed with themselves.

    Outputs:
    sonnet: the generated sonnet as a string of 14 lines separated by '\n',
    with the words of each line separated by single spaces
    '''

    obs_map_r = obs_map_reverser(obs_map)

    n_words = 8
    lines = []        # generated lines, last line of the sonnet first
    emission = None

    for i in range(14):
        if i == 0:
            # end of the poem - choose any word (g)
            seed = int(np.random.choice(range(len(obs_map))))
            word1 = obs_map_r[seed]
        elif i == 1:
            # other line of the couplet: rhyme with the first seed
            seed = _rhyme_seed(word1, obs_map, rhyming_dict)
        elif i in (2, 6, 10):
            # new stanza (b, d, f): continue from the last word generated so far
            seed = _following_seed(hmm, emission[-1])
            word1 = obs_map_r[seed]
        elif i in (3, 7, 11):
            # other rhyme of the stanza (a, c, e): again continue from the previous line
            seed = _following_seed(hmm, emission[-1])
            word3 = obs_map_r[seed]
        elif i in (4, 8, 12):
            # partner line for (b, d, f)
            seed = _rhyme_seed(word1, obs_map, rhyming_dict)
        else:  # i in (5, 9, 13)
            # partner line for (a, c, e)
            seed = _rhyme_seed(word3, obs_map, rhyming_dict)

        # Every line gets its own emission, generated backwards from its seed.
        emission, _ = hmm.generate_emission(n_words, seed)

        # The emission runs from the end of the line to its start, so flip it.
        lines.append(' '.join(obs_map_r[e] for e in reversed(list(emission))))

    # Lines were generated bottom-up; flip them and separate all 14 with newlines.
    return '\n'.join(reversed(lines))