In [2]:
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

import os
import re
import numpy as np
import json

from ipynb.fs.full.preprocessing import parse_poems, parse_observations
from HMM_soln import unsupervised_HMM

## Get rhyming dictionary

In [3]:
# Read the raw corpus; the context manager closes the file handle as soon as
# the contents are in memory instead of leaving it to the garbage collector.
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as corpus_file:
    text = corpus_file.read()

# remove sonnet numbers and convert to lowercase
text = re.sub(r'[0-9]+', '', text)
text = text.lower()
obs, obs_map = parse_observations(text)

# from here on `text` is a list holding one string per line of the corpus
text = text.split('\n')

In [4]:
# punctuation tokens that are discarded from every tokenized line
punct = {'.', ',', '!', ':', ';'}

# tokenize each line of the corpus into word / punctuation tokens
tokens2 = [wordpunct_tokenize(line) for line in text]

# strip the punctuation tokens and keep only the lines that still contain something
filtered2 = [
    kept
    for kept in ([tok for tok in toks if tok not in punct] for toks in tokens2)
    if kept
]

# the final token of every remaining line is that line's ending word
last2 = [kept[-1] for kept in filtered2]

In [5]:
# get syllables and word pronunciations
# `syllables` holds, for every word in last2, a list of (word, len(pron), pron)
# entries taken from cmudict (an empty list when the word has no entry).
# NOTE: when loaded from the JSON cache the inner tuples come back as lists.
try:
    # load from file
    with open('data/syllables', 'r') as file_object:
        syllables = json.load(file_object)
except (OSError, ValueError):
    # Cache missing or not valid JSON: rebuild it. cmudict is read once and
    # indexed by word so that each line-ending word is a dictionary lookup
    # rather than a fresh scan of the whole pronunciation list.
    wanted = set(last2)
    pron_by_word = {}
    for (w, p) in cmudict.entries():
        if w in wanted:
            pron_by_word.setdefault(w, []).append((w, len(p), p))
    # list(...) gives every position its own list, as the original
    # comprehension did, even when the same word ends several lines
    syllables = [list(pron_by_word.get(word, [])) for word in last2]
    # save to file
    with open('data/syllables', 'w') as file_object:
        json.dump(syllables, file_object)

## Function for determining rhymes

In [88]:
def rhymes(s, obs_map, entries=None):
    '''
    Function that determines rhyming words by comparing the ending sounds.

    Two words are treated as rhyming when their pronunciations have the same
    length and are identical after dropping the first phoneme.

    Inputs:
    s: a list of (w, len(p), p) entries for one word, where w is the word, p is
    its pronunciation (a list of phonemes) and len(p) is the number of phonemes.
    Only the first entry, s[0], is used.
    obs_map: maps all unique words in dataset to an integer; only words that
    are keys of obs_map are returned
    entries: optional iterable of (word, pronunciation) pairs to search. When
    omitted, cmudict.entries() is used. Passing a pre-built list avoids
    re-reading the pronouncing dictionary on every call.

    Output:
    filtered: a list of words that rhyme with w (w itself is included when it
    is a key of obs_map and appears in entries). Returns [] when there is no
    match or when s is empty. Returns [w] if the pronunciation lookup fails.
    '''

    # an empty (or malformed) pronunciation list gives nothing to rhyme with
    try:
        (w, l, p) = s[0]
    except (IndexError, KeyError, TypeError, ValueError):
        return []

    try:
        if entries is None:
            entries = cmudict.entries()
        ending = p[1:]
        filtered = [wt for (wt, pt) in entries
                    if l == len(pt) and wt in obs_map and ending == pt[1:]]
        return filtered
    except Exception:
        # deliberate best effort: if the lookup itself fails (for example the
        # corpus is unavailable), fall back to the word rhyming with itself
        return [w]

In [155]:
# get rhyming dictionary
# rhyme_dict2 maps each line-ending word to the list returned by rhymes()
try:
    # reuse the cached dictionary when the file exists and parses as JSON
    with open('data/rhyme_dict2', 'r') as file_object:
        rhyme_dict2 = json.load(file_object)
except (OSError, ValueError):
    # cache missing or unreadable: rebuild it from the syllable table
    rhyme_dict2 = {}
    for s in syllables:
        # words without any pronunciation entry have an empty list; skip them
        if not s:
            continue
        w = s[0][0]
        rhyme_dict2[w] = rhymes(s, obs_map)
    # save dictionary to file
    with open('data/rhyme_dict2', 'w') as file_object:
        json.dump(rhyme_dict2, file_object)

## Load data and train HMM

In [55]:
# load data and reverse all words
# (the context manager closes the file handle once the text has been read)
with open(os.path.join(os.getcwd(), 'data/shakespeare.txt')) as corpus_file:
    text = corpus_file.read()
text = re.sub(r'[0-9]+', '', text)
text = text.lower()

# Reverse the order of the space-separated chunks. The split is on ' ' only,
# so newline characters stay attached to the chunks around them.
text = text.split(' ')
text.reverse()
text = ' '.join(text)
obs, obs_map = parse_poems(text)

In [73]:
# Train an HMM on `obs`. The progress log printed by this call suggests the last
# argument is the number of training iterations; presumably the second argument
# (8) is the number of hidden states -- TODO confirm against HMM_soln.
hmm8 = unsupervised_HMM(obs, 8, 1000)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200
Iteration: 210
Iteration: 220
Iteration: 230
Iteration: 240
Iteration: 250
Iteration: 260
Iteration: 270
Iteration: 280
Iteration: 290
Iteration: 300
Iteration: 310
Iteration: 320
Iteration: 330
Iteration: 340
Iteration: 350
Iteration: 360
Iteration: 370
Iteration: 380
Iteration: 390
Iteration: 400
Iteration: 410
Iteration: 420
Iteration: 430
Iteration: 440
Iteration: 450
Iteration: 460
Iteration: 470
Iteration: 480
Iteration: 490
Iteration: 500
Iteration: 510
Iteration: 520
Iteration: 530
Iteration: 540
Iteration: 550
Iteration: 560
Iteration: 570
Iteration: 580
Iteration: 590
Iteration: 600
Iteration: 610
Iteration: 620
Iteration: 630
Iteration: 640
Iteration: 650
Iteration: 660
Iteration: 670
Iter

In [56]:
# Same training call as for hmm8 but with 12 as the second argument
# (presumably the number of hidden states -- TODO confirm against HMM_soln).
hmm12 = unsupervised_HMM(obs, 12, 1000)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200
Iteration: 210
Iteration: 220
Iteration: 230
Iteration: 240
Iteration: 250
Iteration: 260
Iteration: 270
Iteration: 280
Iteration: 290
Iteration: 300
Iteration: 310
Iteration: 320
Iteration: 330
Iteration: 340
Iteration: 350
Iteration: 360
Iteration: 370
Iteration: 380
Iteration: 390
Iteration: 400
Iteration: 410
Iteration: 420
Iteration: 430
Iteration: 440
Iteration: 450
Iteration: 460
Iteration: 470
Iteration: 480
Iteration: 490
Iteration: 500
Iteration: 510
Iteration: 520
Iteration: 530
Iteration: 540
Iteration: 550
Iteration: 560
Iteration: 570
Iteration: 580
Iteration: 590
Iteration: 600
Iteration: 610
Iteration: 620
Iteration: 630
Iteration: 640
Iteration: 650
Iteration: 660
Iteration: 670
Iter

In [117]:
# Shorter training run: the progress log for this call stops at iteration 100,
# matching the last argument. The second argument (12) is presumably the number
# of hidden states -- TODO confirm against HMM_soln.
hmm12_100 = unsupervised_HMM(obs, 12, 100)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100


## Generate sonnets

In [78]:
def obs_map_reverser(obs_map):
    '''
    Returns the inverse of obs_map: a dictionary that sends every value stored
    in obs_map back to the key it was stored under.

    This function taken from HMM_helper.py by Andrew Kang
    '''
    return {index: word for word, index in obs_map.items()}

def generate_rhyming_obs(hmm, obs_map, rhyming_dict, last, n_words=8):
    '''
    Seeds end of a line with a word and generates rest of line going backwards.
    Generates a full 14-line sonnet in this manner, using the rhyme scheme
    abab cdcd efef gg.

    Lines are produced from the last line of the sonnet to the first. A line
    either opens a rhyme (its ending word is drawn at random from `last`) or
    closes one (its ending word is drawn from the rhymes of the word that
    opened it). If the opening word has no usable entry in rhyming_dict, the
    closing line reuses the opening word itself.

    Inputs:
    hmm: trained hmm; must provide generate_emission(n_words, seed) returning
    (emission, states), where emission is a sequence of observation indices
    obs_map: maps word to observation index
    rhyming_dict: maps word to list of rhyming words
    last: list of words that end lines in Shakespearean sonnets
    n_words: number of words requested from the hmm for each line (default 8)

    Outputs:
    sonnet: the generated sonnet, a single string with lines separated by '\n';
    every word, including the last one on a line, is followed by a space
    '''

    # observation index -> word, needed to turn emissions back into text
    index_to_word = {index: word for word, index in obs_map.items()}

    # Generation step -> earlier step whose ending word it must rhyme with.
    # Steps count lines from the END of the sonnet: 0/1 are the closing couplet,
    # then each quatrain is generated as (open, open, close, close) so that the
    # finished sonnet reads abab cdcd efef gg. Steps missing from this table
    # open a new rhyme.
    rhyme_partner = {1: 0, 4: 2, 5: 3, 8: 6, 9: 7, 12: 10, 13: 11}

    end_words = {}   # generation step -> ending word chosen for that line
    lines = []       # finished lines, in generation order (last line first)

    for i in range(14):
        if i in rhyme_partner:
            anchor = end_words[rhyme_partner[i]]
            try:
                end_word = np.random.choice(rhyming_dict[anchor])
            except (KeyError, ValueError):
                # no entry for this word, or an empty rhyme list
                end_word = anchor
        else:
            end_word = np.random.choice(last)
        end_words[i] = end_word

        emission, _ = hmm.generate_emission(n_words, obs_map[end_word])

        # the emission starts at the line's ending word and runs backwards,
        # so reverse it to obtain reading order
        lines.append(''.join(index_to_word[e] + ' ' for e in reversed(emission)))

    # lines were generated last-to-first; flip them into reading order
    return '\n'.join(reversed(lines))

In [156]:
# Use rhyme_dict2, the rhyming dictionary this notebook actually builds; the name
# `rhyme_dict` is never defined here, so the call would fail on a fresh kernel.
sonnet = generate_rhyming_obs(hmm12, obs_map, rhyme_dict2, last2)

In [157]:
# Print the sonnet one line at a time with the first letter capitalised.
# Iterating over the split result leaves `sonnet` bound to the original string,
# so this cell can be run again without regenerating the sonnet first.
for line in sonnet.split('\n'):
    print(line.capitalize())

Heat not lying fire but sleeping that be 
By remedy hearts bath fire of water and 
Cool and warmed up his a water be 
A thrall this sleeping so came keep end 
Growing but lovegod water fire and tripping fond 
Lovegod love so water keep in by master 
Brand perpetual so came took warmed for burned 
Heartinflaming discased but i came and nymphs greater 
Heartinflaming cool keep his by a the rebuked 
Bath was many asleep chaste and disarmed might 
Lovegod heat lovegod bath that  little respect 
Took of votary tripping by maiden of right 
Keep took by she many vowed a bred 
Discased keep asleep heartinflaming hand and warmed fled 
