# 3. Pre-Processing

In [41]:
import numpy as np
from HMM import unsupervised_HMM
from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    sample_sentence,
    visualize_sparsities,
    animate_emission,
    obs_map_reverser
)

In [72]:
# Load the data
filename = 'data/shakespeare.txt'

# Lines whose length (measured after the comma strip below, trailing newline
# still included) is at most this many characters are skipped -- presumably
# blank lines and sonnet-number headings; confirm against the data file.
MIN_LINE_LENGTH = 23
# Characters trimmed from both ends of every token.
STRIP_CHARS = ' ,.:;?!\'()'

with open(filename, 'r') as f:
    content = f.readlines()

lines = []
for line in content:
    # Only removes commas at the very ends of the raw line (the newline is
    # still attached here); kept so the length filter sees the same text as
    # before.  Per-token punctuation is removed further down.
    line = line.strip(',')
    if len(line) <= MIN_LINE_LENGTH:
        continue
    raw_tokens = line.split('\n')[0].split(' ')
    cleaned = [tok.strip(STRIP_CHARS).lower() for tok in raw_tokens if tok]
    # A token made only of punctuation collapses to '' and must not become a
    # vocabulary entry.
    cleaned = [tok for tok in cleaned if tok]
    # Skip lines that end up with no words instead of storing an empty sequence.
    if cleaned:
        lines.append(cleaned)

# NOTE: parse_observations is defined in a later cell of this notebook, so that
# cell has to be executed before this one on a fresh kernel.
obs, obs_map = parse_observations(lines)

# Syllable data
with open('data/Syllable_dictionary.txt', 'r') as syl_f:
    syl_content = syl_f.readlines()

# word_to_syl: dictionary word -> syllable count (kept as the string read from
#              the file).
# syl_to_word: syllable count (int) -> list of encoded word ids (obs_map
#              values) having that count.
word_to_syl = {}
syl_to_word = {}
for entry in syl_content:
    spl = entry.split()
    if not spl:
        # Blank line in the dictionary file.
        continue
    dict_word = spl[0]
    # emphasis handling: when the third field is an 'E3'/'E4' marker the
    # count is the second field, otherwise it is the last field.
    if len(spl) == 3 and spl[2] in ('E3', 'E4'):
        count = spl[1]
    else:
        count = spl[-1]
    word_to_syl[dict_word] = count

    # Sonnet tokens had surrounding apostrophes stripped, so strip them here
    # too before looking the word up.  Dictionary words that never occur in
    # the corpus have no id and are skipped rather than raising KeyError.
    word_id = obs_map.get(dict_word.strip('\''))
    if word_id is not None:
        syl_to_word.setdefault(int(count), []).append(word_id)

# Per-line syllable counts, aligned with `lines`; 0 marks a word that could
# not be found in the syllable dictionary.
syllables = []
for sonnet_line in lines:
    syl_line = []
    for w in sonnet_line:
        # Tokenization removed apostrophes from both ends, so retry the lookup
        # with a leading and then a trailing apostrophe restored.
        for candidate in (w, "'" + w, w + "'"):
            if candidate in word_to_syl:
                syl_line.append(int(word_to_syl[candidate]))
                break
        else:
            syl_line.append(0)
    syllables.append(syl_line)

In [44]:
def parse_observations(lines):
    """Encode sequences of words as sequences of integer ids.

    Ids are handed out consecutively, starting at 0, in order of first
    appearance of each distinct word.

    Args:
        lines: iterable of word sequences (one sequence per line).

    Returns:
        (obs, obs_map): ``obs`` is a list holding one list of ids per input
        line; ``obs_map`` maps every distinct word to its id.
    """
    obs_map = {}
    # len(obs_map) is evaluated before setdefault inserts, so an unseen word
    # receives the next unused id while a known word keeps its existing one.
    obs = [
        [obs_map.setdefault(word, len(obs_map)) for word in line]
        for line in lines
    ]
    return obs, obs_map

In [45]:
# Encode the tokenized sonnet lines as integer sequences (obs) together with
# the word -> id mapping (obs_map).  The data-loading cell above makes the
# same call on the same `lines`.
obs, obs_map = parse_observations(lines)

In [46]:
# Train the word-level HMM on the integer-encoded sonnet lines.
# Presumably 5 is the number of hidden states and 50 the number of training
# iterations (the progress log stops at "Iteration: 50") -- confirm against
# HMM.unsupervised_HMM.
hmm_words = unsupervised_HMM(obs, 5, 50)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50


In [47]:
# Train a second HMM whose observations are the per-word syllable counts of
# each sonnet line (0 stands for a word missing from the syllable dictionary).
# Same positional arguments as the word-level model -- presumably 5 hidden
# states and 50 iterations; confirm against HMM.unsupervised_HMM.
hmm_syls = unsupervised_HMM(syllables, 5, 50)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50


In [109]:
"""
Combine the results from an HMM trained on the words and an HMM trained on the syllables of the sonnets to
create a sentence of desired length

Input:
    - word_hmm: HMM trained on input of the words of the sonnets
    - syllable_hmm: HMM trained on input of the syllables of words of the sonnets
    - syllables_to_words: dictionary with keys being number of syllables and values being lists of words
        with the corresponding number of syllables
    - obs_map: dictionary mapping words to their respective integer encoding
    - n_words: number of words for the sentence to have
    
Output: A sentence (string) that is n_words words long
"""
def create_sentence(word_hmm, syllable_hmm, syllables_to_words, obs_map, n_words):
    """Generate a sentence by combining a word HMM with a syllable HMM.

    A sequence of syllable counts is drawn from ``syllable_hmm`` and a
    sequence of hidden states from ``word_hmm``.  For each position one word
    is sampled among the words that have the drawn syllable count, weighted
    by the emission row ``word_hmm.O[state]`` of the drawn state and
    renormalised over those candidate words.

    Args:
        word_hmm: HMM trained on the encoded words.  Must expose an emission
            matrix ``O`` (indexed ``O[state][word_id]``) and
            ``generate_emission(n)`` returning ``(emission, states)``.
        syllable_hmm: HMM trained on per-word syllable counts; only the
            emission part of ``generate_emission(n)`` is used.
        syllables_to_words: dict mapping a syllable count to the list of
            word ids having that count.
        obs_map: dict mapping words to their integer ids.
        n_words: number of words in the generated sentence.

    Returns:
        The generated words joined by single spaces, passed through
        ``str.capitalize``; an empty string when ``n_words`` is 0.
    """
    syllable_seq = syllable_hmm.generate_emission(n_words)[0]
    states = word_hmm.generate_emission(n_words)[1]
    # Show the sampled syllable counts alongside the generated sentence.
    print(syllable_seq)

    emission = []
    # Every position, including the first, uses the emission row of its own
    # sampled state.
    for syllable_count, state in zip(syllable_seq, states):
        state_row = word_hmm.O[state]

        candidates = syllables_to_words.get(syllable_count)
        if not candidates:
            # No word is known for this count (e.g. 0, the placeholder used
            # for words missing from the syllable dictionary): fall back to
            # the whole vocabulary of the word HMM.
            candidates = list(range(len(state_row)))

        weights = [state_row[i] for i in candidates]
        total = sum(weights)
        # With zero mass on every candidate the weights cannot be normalised;
        # p=None makes np.random.choice sample uniformly instead.
        probs = [wt / total for wt in weights] if total > 0 else None
        emission.append(np.random.choice(candidates, p=probs))

    obs_map_rev = obs_map_reverser(obs_map)
    sentence_array = [obs_map_rev[i] for i in emission]

    return ' '.join(sentence_array).capitalize()

In [111]:
# Generate and print one 10-word sentence.  create_sentence itself first
# prints the sampled syllable-count sequence, which is the list shown in the
# output before the sentence.
print(create_sentence(hmm_words, hmm_syls, syl_to_word, obs_map, 10))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Then tongues frown strong which those that thee place we
