In [9]:
import os
import numpy as np
from IPython.display import HTML
import json
from HMM import unsupervised_HMM
from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    parse_observations,
    sample_sentence,
    visualize_sparsities,
    animate_emission, 
)
import re
import sys
import random

### Helper Functions (Cleanup Data)

In [37]:
def split_by_sonnet_shakespeare(text):
    """
    Split Shakespeare text by sonnet.

    Sonnets are taken to be separated by two blank lines, i.e. the
    three-character sequence "\\n\\n\\n".

    Arguments:
        text:   The full text as a single string.

    Returns:
        A list of strings, one per chunk found between separators.
    """
    # Split the argument, not the notebook-level `shakespeare_text` global,
    # so the function works on any input and on a fresh kernel.
    return text.split("\n\n\n")

def clean_shakespeare(text):
    """
    Strip the first line of every sonnet and return the remaining lines.

    The text is split with split_by_sonnet_shakespeare; for each resulting
    chunk everything after its first line is kept (the first line is
    presumably the sonnet-number heading). All kept lines are re-joined
    into one newline-separated string.
    """
    kept_lines = []
    for sonnet in split_by_sonnet_shakespeare(text):
        # Index 0 is dropped; the rest of the chunk is kept verbatim.
        kept_lines.extend(sonnet.split("\n")[1:])
    return '\n'.join(kept_lines)

### Helper Function (Build Syllables Dictionary)

In [33]:
def build_syllables_dict(syllables):
    """
    Build a dictionary mapping each word to a single syllable count.

    Every non-blank line of `syllables` is whitespace-split into a word
    followed by one or more count tokens. A token containing "E" marks an
    end-of-line-only count and is skipped in favour of the other token.

    Selection rule (only the first two count tokens are inspected):
        * one count token              -> that token
        * first token contains "E"     -> second token
        * only second contains "E"     -> first token
        * neither contains "E"         -> second token (the file is
          presumably sorted ascending, making this the larger count)

    Returns:
        dict of word -> int syllable count.
    """
    syllables_dict = {}
    for raw_line in syllables.split('\n'):
        tokens = raw_line.split()
        if not tokens:
            # Blank / whitespace-only line.
            continue

        word = tokens[0]
        if len(tokens) <= 2:
            chosen = tokens[1]
        elif "E" in tokens[1] or "E" not in tokens[2]:
            chosen = tokens[2]
        else:
            # Second token is the end-of-line variant; keep the first.
            chosen = tokens[1]

        syllables_dict[word] = int(chosen)
    return syllables_dict

### Helper Function (Build Rhyme Dictionary)

In [11]:
def _line_end_word(line):
    """
    Return the last space-separated token of `line`, lower-cased, keeping
    only word characters, hyphens and apostrophes.
    """
    last_token = line.split(' ')[-1].strip()
    return re.sub(r'[^\w\-\']', '', last_token).lower()


def build_rhyme_pairs_shakespeare(sonnets):
    """
    Build a dictionary of rhyme pairs and their frequency of
    occurrence in Shakespeare's sonnets.

    Arguments:
        sonnets:    Sequence of sonnet strings in order (position i holds
                    sonnet number i + 1). Line 0 of each string is skipped
                    (presumably the sonnet-number heading); verse lines
                    start at index 1.

    Returns:
        dict mapping (end_word, end_word) tuples to the number of times
        that pair of line endings rhymed, in first-seen order.

    Raises:
        IndexError: if a sonnet has fewer lines than its scheme requires.
    """
    # Each scheme is (number of verse lines, rhyming line pairs), with
    # 1-based verse line numbers.
    # Regular sonnets: abab cdcd efef gg.
    regular_scheme = (
        14,
        ((1, 3), (2, 4), (5, 7), (6, 8), (9, 11), (10, 12), (13, 14)),
    )
    # Only Sonnets 99 and 126 are handled specially; every other sonnet
    # (including 145) goes through the regular scheme.
    irregular_schemes = {
        # Sonnet 99 has 15 lines: ababa cdcd efef gg. The "a" rhyme is a
        # triple (lines 1, 3, 5) and is thrown out.
        99: (15, ((2, 4), (6, 8), (7, 9), (10, 12), (11, 13), (14, 15))),
        # Sonnet 126 has 12 lines: aa bb cc dd ee ff.
        126: (12, ((1, 2), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12))),
    }

    rhyme_pairs = dict()

    for sonnet_num, sonnet in enumerate(sonnets, start=1):
        n_lines, scheme = irregular_schemes.get(sonnet_num, regular_scheme)
        lines = sonnet.split("\n")

        # Placeholder at index 0 so end_words[k] is the end word of verse
        # line k. Explicit indexing keeps the IndexError on short sonnets.
        end_words = [' '] + [_line_end_word(lines[k])
                             for k in range(1, n_lines + 1)]

        # Add rhyming pairs to the dictionary and keep track of their
        # frequency of occurrence.
        for first, second in scheme:
            pair = (end_words[first], end_words[second])
            rhyme_pairs[pair] = rhyme_pairs.get(pair, 0) + 1

    return rhyme_pairs

### Helper Functions (Generate Emissions)

In [18]:
def _sample_index(probabilities):
    """
    Draw an index from a discrete distribution given as a sequence of
    probabilities, consuming exactly one random.uniform(0, 1) draw.
    """
    rand_var = random.uniform(0, 1)
    # Fallback for rows whose sum is slightly below 1 due to rounding:
    # return the last index that carries probability mass.
    fallback = len(probabilities) - 1
    for index, prob in enumerate(probabilities):
        # Strict comparison so a draw of exactly 0.0 skips zero-probability
        # entries and never produces a negative index.
        if rand_var < prob:
            return index
        if prob > 0:
            fallback = index
        rand_var -= prob
    return fallback


def generate_emission1(self, M, seed):
    '''
    Generates an emission of length M that contains a given seed
    observation. The initial state is the one most likely to emit the
    seed; it is NOT chosen at random.

    Arguments:
        self:       HMM-like object exposing `O` (O[i][j] is the
                    probability of emitting observation j given state i)
                    and `A` (A[i][j] is used as the probability of moving
                    from state i to state j).

        M:          Length of the emission to generate, seed included.

        seed:       Integer index of the seed observation (a column of O).

    Returns:
        emission:   List of M observation indices. Each newly sampled
                    observation is inserted at the FRONT, so the seed is
                    the last element.

        states:     List of the M - 1 states used to sample the non-seed
                    observations, in generation order.
    '''
    # O: The (i, j)^th element is the probability of
    # emitting observation j given state i.
    O = np.array(self.O)
    emission = [seed]

    # Choose the state having the highest probability of generating
    # the seed observation.
    state = int(np.argmax(O[:, seed]))
    states = []

    # NOTE(review): the first sampled observation is drawn from the same
    # state that was picked for the seed (no transition happens before
    # it). Kept as in the original; confirm this is intended.
    for _ in range(M - 1):
        states.append(state)

        # Sample next observation and prepend it to the emission.
        emission.insert(0, _sample_index(self.O[state]))

        # Sample next state.
        state = _sample_index(self.A[state])

    return emission, states
    
def obs_map_reverser1(obs_map):
    """
    Invert an observation map.

    Given a dict of word -> index, return a dict of index -> word. If
    several words share an index, the one iterated last wins.
    """
    return {index: word for word, index in obs_map.items()}

def parse_observations_reverse(text):
    """
    Encode text as integer sequences, reading it backwards.

    Non-blank lines are visited last-to-first and the words of each line
    last-to-first. Each word is lower-cased and stripped of every
    character other than word characters, hyphens and apostrophes, then
    replaced by an integer id assigned in order of first appearance.

    Returns:
        obs:        List of encoded lines (lists of ints), in reversed
                    line order with reversed word order.
        obs_map:    dict of cleaned word -> integer id.
    """
    tokenized = [raw.split() for raw in text.split('\n') if raw.split()]

    obs_map = {}
    obs = []

    for tokens in tokenized[::-1]:
        encoded = []
        for token in tokens[::-1]:
            cleaned = re.sub(r'[^\w\-\']', '', token).lower()
            # A new word receives the next free id, which always equals
            # the current size of the map.
            encoded.append(obs_map.setdefault(cleaned, len(obs_map)))
        obs.append(encoded)

    return obs, obs_map

In [45]:
def sample_sentence1(hmm, obs_map, seed, n_syllables=10, syllable_counts=None):
    """
    Generate a sentence with the given number of syllables that ends with
    the seed word.

    Emissions are sampled with generate_emission1 and resampled until a
    prefix (counted from the seed) sums to exactly `n_syllables`. Intended
    for an HMM trained on reversed lines, e.g. the output of
    parse_observations_reverse.

    Arguments:
        hmm:             Trained HMM passed through to generate_emission1.
        obs_map:         dict of word -> observation index.
        seed:            Word that the sentence must end with.
        n_syllables:     Exact syllable total required (default 10).
        syllable_counts: Optional dict of word -> syllable count. Defaults
                         to the notebook-level `syllables_dict`.

    Returns:
        The sentence as a space-joined string passed through
        str.capitalize().

    Raises:
        KeyError:   if `seed` is not in `obs_map`, or a sampled word has
                    no syllable count.
        ValueError: if the seed alone already exceeds `n_syllables`
                    (the sampling loop could never terminate).
    """
    if syllable_counts is None:
        # Backward-compatible fallback to the notebook global.
        syllable_counts = syllables_dict

    seed_idx = obs_map[seed]
    # Get reverse map.
    obs_map_r = obs_map_reverser1(obs_map)

    # The seed is always the first counted word, so a seed that is too
    # long would make the loop below spin forever.
    if syllable_counts[seed] > n_syllables:
        raise ValueError(
            "seed %r has more than %d syllables" % (seed, n_syllables))

    # Assuming every word has at least one syllable, n_syllables words are
    # always enough to reach the target (this equals the original fixed
    # length of 10 for the default).
    n_words = n_syllables

    # Keep generating until we are able to obtain a line
    # that can be truncated to exactly n_syllables number of
    # syllables.
    while True:
        count = 0
        sentence = []
        emission, _states = generate_emission1(hmm, n_words, seed_idx)
        # generate_emission1 returns the seed last; walk from the seed.
        for obs_idx in emission[::-1]:
            word = obs_map_r[obs_idx]
            count += syllable_counts[word]
            sentence.append(word)
            if count >= n_syllables:
                break
        if count == n_syllables:
            break

    # Flip back so that the seed is the final word of the line.
    return ' '.join(sentence[::-1]).capitalize()

In [46]:
def sample_shakespeare_sonnet(hmm, obs_map, rhyme_pairs):
    """
    Generate a 14-line poem following the rhyme scheme of a typical
    Shakespeare sonnet (abab cdcd efef gg).

    `rhyme_pairs` must hold at least seven 2-tuples of words; pairs 0-5
    supply the three quatrains and pair 6 the closing couplet. Every line
    is produced by sample_sentence1 seeded with the corresponding word.

    Returns:
        The poem as a single string with lines separated by newlines.
    """
    seed_words = []

    # Quatrains: two pairs (x, y) interleaved as x1, y1, x2, y2.
    for start in (0, 2, 4):
        x_first, x_second = rhyme_pairs[start]
        y_first, y_second = rhyme_pairs[start + 1]
        seed_words.extend([x_first, y_first, x_second, y_second])

    # Couplet: the two words of the seventh pair on consecutive lines.
    g_first, g_second = rhyme_pairs[6]
    seed_words.extend([g_first, g_second])

    verse = [sample_sentence1(hmm, obs_map, seed_word)
             for seed_word in seed_words]
    return '\n'.join(verse)

### Data Preprocessing

In [21]:
# Import text. The path is resolved against the current working directory;
# the context manager closes the file handle as soon as it has been read.
with open(os.path.join(os.getcwd(), 'data_Shakespeare/shakespeare.txt')) as shakespeare_file:
    shakespeare_text = shakespeare_file.read()

In [22]:
# Split the raw text into a list with one string per sonnet. The list order
# matters: build_rhyme_pairs_shakespeare derives the sonnet number from
# each entry's position (index + 1).
shakespeare_sonnets = split_by_sonnet_shakespeare(shakespeare_text)

In [23]:
# Process text: drop the first line of every sonnet (the sonnet number) and
# join the remaining lines into one newline-separated string, which is the
# input used to train the HMM.
shakespeare_clean = clean_shakespeare(shakespeare_text)

### Create Rhyme Dictionaries

In [25]:
# Map each (end word, end word) rhyme pair found in the sonnets to the
# number of times it occurs; the keys are sampled later as rhyme seeds.
shakespeare_rhyme_pairs = build_rhyme_pairs_shakespeare(shakespeare_sonnets)

### Create Syllables Dictionary

In [34]:
# Read the syllable dictionary file (closing the handle right after the
# read) and build the word -> syllable-count lookup. `syllables_dict` is
# read as a global by sample_sentence1.
with open(os.path.join(os.getcwd(), 'data_Shakespeare/Syllable_dictionary.txt')) as syllables_file:
    syllables = syllables_file.read()
syllables_dict = build_syllables_dict(syllables)

### HMM Models

In [26]:
# Shakespeare text only. Lines and words are encoded in reverse order so
# that generation can start from a chosen line-ending (rhyme) word.
# NOTE(review): 16 and 100 are presumably the number of hidden states and
# the number of training iterations (the log stops at 100) -- confirm
# against HMM.unsupervised_HMM.
obs_reversed_shakespeare, obs_map_reversed_shakespeare = parse_observations_reverse(shakespeare_clean)
hmm = unsupervised_HMM(obs_reversed_shakespeare, 16, 100)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100


In [50]:
# Draw 7 distinct rhyme pairs (one per rhyme a-g of the sonnet scheme) and
# generate a sonnet whose lines end with those words.
# NOTE(review): no random seed is set anywhere in view, so the poem differs
# on every run.
shakespeare_rhyme = random.sample(list(shakespeare_rhyme_pairs.keys()), 7)
print(sample_shakespeare_sonnet(hmm, obs_map_reversed_shakespeare, shakespeare_rhyme))

Thou thou where be leave not am would spend find
Highmost made withering willing know thy ranged
Lines loves i the self once straight with tell wind
One upon thee love's doth esteemed exchanged
Else them in than niggard or bad uphold
Prognosticate ornament were will black
As make to pays need art me eyes bring cold
Sweets did together will first all pry lack
Live advantage see need expiate case
Him doth in doth leave am their brave plight sight
I not that do faint give so right dear place
Looks away leaped swear my defeated light
Say could that hath say the hours hours
May evermore my had my fair flowers
