In [1]:
import os
import numpy as np
from IPython.display import HTML
import json
from HMM import unsupervised_HMM
from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    parse_observations,
    sample_sentence,
    visualize_sparsities,
    animate_emission
)
import re
import sys
import random

In [4]:
# Read the full sonnet corpus into memory. The context manager guarantees the
# file handle is closed as soon as the text has been read, instead of leaving
# it open until the kernel garbage-collects it.
with open(os.path.join(os.getcwd(), 'data_Shakespeare/shakespeare.txt')) as corpus_file:
    shakespeare_text = corpus_file.read()

In [5]:
# Parse the Shakespeare corpus with the project helper from HMM_helper.
# NOTE(review): presumably `obs` holds the encoded observation sequences and
# `obs_map` the word-to-index mapping -- confirm against parse_observations.
# Disabled alternative that would also include a Spenser corpus:
# full_text = shakespeare_text + spenser_text
obs, obs_map = parse_observations(shakespeare_text)

In [42]:
def split_by_sonnet_shakespeare(text): 
    """
    Split Shakespeare text by sonnet. 

    Sonnets are taken to be separated by two blank lines, i.e. three 
    consecutive newline characters. 

    Parameters
    ----------
    text : str
        The raw corpus text to split. 

    Returns
    -------
    list of str
        One string per chunk found between the separators, in order of 
        appearance. An input without any separator yields a single-element 
        list. 
    """
    # Split the argument itself rather than the notebook-level global, so the
    # function works on whatever text the caller passes in.
    return text.split("\n\n\n")

def build_rhyme_pairs(sonnets): 
    """
    Build a dictionary of rhyme pairs and their frequency of 
    occurrence in Shakespeare's sonnets. 

    Each sonnet string is split on newlines. The first resulting line is 
    skipped (presumably the sonnet-number header -- confirm against the data 
    file) and the following 14 lines are treated as the verse lines. 

    Parameters
    ----------
    sonnets : list of str
        Sonnet texts in order; the sonnet number is the 1-based list position. 

    Returns
    -------
    dict
        Maps a tuple (word_a, word_b) of lower-cased, punctuation-stripped 
        line-ending words to the number of times that pair rhymed. A pair is 
        order-insensitive: (a, b) and (b, a) share one entry, keyed by 
        whichever orientation was seen first. 

    Raises
    ------
    IndexError
        If a non-excluded sonnet has fewer than 15 newline-separated lines. 
    """
    # Exclude Sonnet 99, Sonnet 126, and Sonnet 145. 
    # Sonnet 99 has 15 lines, Sonnet 126 has 12 lines, and Sonnet 145 is 
    # written in iambic tetrameter. All other sonnets follow the same 
    # structure. 
    excluded_sonnets = {99, 126, 145}

    # 1-based verse line numbers that rhyme with each other. Shakespeare's 
    # sonnets have the following rhyming structure: abab, cdcd, efef, gg. 
    rhyme_scheme = ((1, 3), (2, 4), (5, 7), (6, 8), (9, 11), (10, 12), (13, 14))

    rhyme_pairs = dict()

    for sonnet_num, sonnet in enumerate(sonnets, start=1): 
        if sonnet_num in excluded_sonnets: 
            continue

        # Collect the last word of each verse line (punctuation removed). 
        # Index 0 is a placeholder so that end_words[k] is verse line k. 
        lines = sonnet.split("\n")
        end_words = [' ']
        for j in range(1, 15): 
            # Whitespace-agnostic split: trailing spaces or tabs must not 
            # produce an empty "last word". 
            tokens = lines[j].split()
            last_word = tokens[-1] if tokens else ''
            # Keep word characters, hyphens and apostrophes only. 
            last_word = re.sub(r'[^\w\-\']', '', last_word).lower()
            end_words.append(last_word)

        # Add rhyming pairs to the dictionary and keep track of their 
        # frequency of occurrence. 
        for first, second in rhyme_scheme: 
            pair = (end_words[first], end_words[second])
            reverse = pair[::-1]
            # Check the pair itself first so that a pair of identical words 
            # (which equals its own reverse) is incremented instead of being 
            # reset to 1 on every repeat. 
            if pair in rhyme_pairs: 
                rhyme_pairs[pair] += 1
            elif reverse in rhyme_pairs: 
                rhyme_pairs[reverse] += 1
            else: 
                rhyme_pairs[pair] = 1

    return rhyme_pairs

In [43]:
# Split the corpus into per-sonnet chunks, then tally how often each pair of
# line-ending words rhymes across those sonnets.
sonnets = split_by_sonnet_shakespeare(shakespeare_text)
rhyme_pairs = build_rhyme_pairs(sonnets)