# 3. Pre-Processing

In [158]:
import numpy as np
from HMM import unsupervised_HMM
from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    sample_sentence,
    visualize_sparsities,
    animate_emission,
    obs_map_reverser
)

import nltk
# Fetch the NLTK resources this notebook relies on: the tagger model
# (presumably the one behind nltk.pos_tag -- TODO confirm) and the tag-set
# descriptions displayed by nltk.help.upenn_tagset.
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alisonnoyes/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/alisonnoyes/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [144]:
# Load the sonnets and turn every kept line into a list of lowercase words.
# NOTE(review): parse_observations is defined in a later cell of this notebook,
# so that cell has to be executed before this one on a fresh kernel.
filename = 'data/shakespeare.txt'

with open(filename, 'r') as f:
    content = f.readlines()

lines = []
for line in content:
    line = line.strip(',')
    # Keep only lines longer than 23 characters (presumably this drops the
    # blank and sonnet-number lines -- TODO confirm against the data file).
    if len(line) <= 23:
        continue
    # Cut at the newline, split on single spaces and discard empty tokens.
    words = [tok for tok in line.split('\n')[0].split(' ') if tok != '']
    # Trim surrounding punctuation from each token and lowercase it.
    newWords = [w.strip(' ,.:;?!\'()').lower() for w in words]
    lines.append(newWords)

obs, obs_map = parse_observations(lines)

# Syllable data: map each dictionary word to a syllable count, then encode
# every line of the corpus as a list of per-word syllable counts.
with open('data/Syllable_dictionary.txt', 'r') as syl_f:
    syl_content = syl_f.readlines()

word_to_syl = {}
for l in syl_content:
    spl = l.split()
    # A three-field entry whose last field is 'E3' or 'E4' keeps its count in
    # the second field; every other entry keeps it in the last field.
    count_in_second_field = len(spl) == 3 and spl[2] in ('E3', 'E4')
    word_to_syl[spl[0]] = spl[1] if count_in_second_field else spl[-1]

syllables = []
for l in lines:
    syl_line = []
    for w in l:
        # Try the word as-is, then with a leading apostrophe; words found
        # under neither spelling are encoded as 0 syllables.
        count = word_to_syl.get(w, word_to_syl.get("'" + w, 0))
        syl_line.append(int(count))
    syllables.append(syl_line)

# Part of speech data: encode every line of the corpus as a list of integer
# part-of-speech ids. Ids are handed out in order of first appearance of each
# tag, and pos_map records the tag -> id assignment.
parts_of_speech = []
pos_map = {}
pos_counter = 0
# Each word is tagged on its own (no sentence context is passed), so its tag
# depends only on the word itself. Remember it per distinct word instead of
# invoking the tagger again for every repetition; the resulting encoding is
# unchanged.
word_to_tag = {}
for l in lines:
    pos_line = []
    for w in l:
        if w not in word_to_tag:
            word_to_tag[w] = nltk.pos_tag([w])[0][1]
        pos = word_to_tag[w]

        if pos not in pos_map:
            pos_map[pos] = pos_counter
            pos_counter += 1

        pos_line.append(pos_map[pos])

    parts_of_speech.append(pos_line)

# Bucket every dictionary word by (syllable count, part-of-speech id):
# syl_pos_dict[n_syllables][pos_id] is the list of obs_map indices of the
# words that have that syllable count and that tag.
syl_pos_dict = dict()
for d in range(6):
    syl_pos_dict[d] = dict()

for l in syl_content:
    spl = l.split()
    v = spl[0]
    # The corpus words were stripped of apostrophes before they were tagged and
    # indexed, so tag the dictionary word in that same stripped form. This
    # keeps the bucket consistent with the encoding the HMMs were trained on
    # and yields a tag that is already present in pos_map.
    bare_v = v.strip('\'')
    pos = pos_map[nltk.pos_tag([bare_v])[0][1]]

    # emphasis handling
    if len(spl) == 3 and (spl[2] == 'E3' or spl[2] == 'E4'):
        k = int(spl[1])
    else:
        k = int(spl[len(spl)-1])

    # setdefault on both levels: a syllable count outside the pre-created 0-5
    # range gets its own bucket instead of raising KeyError.
    syl_pos_dict.setdefault(k, dict()).setdefault(pos, []).append(obs_map[bare_v])

In [150]:
# Show the tag -> integer id mapping built while encoding the parts of speech.
print(pos_map)

{'IN': 0, 'NN': 1, 'NNS': 2, 'PRP': 3, 'RB': 4, 'VBD': 5, 'MD': 6, 'CC': 7, 'DT': 8, 'PRP$': 9, 'VBN': 10, 'TO': 11, 'JJ': 12, 'VBG': 13, 'WRB': 14, 'VB': 15, 'RBR': 16, 'VBZ': 17, 'WP$': 18, 'WP': 19, 'VBP': 20, 'WDT': 21, 'CD': 22, 'JJS': 23, 'JJR': 24}


In [145]:
def parse_observations(lines):
    """
    Encode tokenized lines as sequences of integer word ids.

    Ids are assigned in order of first appearance, starting at 0.

    Input:
        - lines: iterable of lines, each line being an iterable of hashable words

    Output: a tuple (obs, obs_map) where obs is a list holding one list of ids
        per line and obs_map is the dictionary mapping each distinct word to
        its id
    """
    obs_map = {}
    # len(obs_map) is evaluated before setdefault inserts anything, so an
    # unseen word receives the next free id while a known word keeps its own.
    obs = [
        [obs_map.setdefault(word, len(obs_map)) for word in line]
        for line in lines
    ]
    return obs, obs_map

In [146]:
# Encode the tokenized lines as integer sequences (obs) and keep the
# word -> id dictionary (obs_map).
obs, obs_map = parse_observations(lines)

In [147]:
# Fit an HMM to the word-id sequences.
# NOTE(review): 5 and 50 are presumably the number of hidden states and the
# number of training iterations (the log stops at iteration 50) -- confirm
# against HMM.unsupervised_HMM.
hmm_words = unsupervised_HMM(obs, 5, 50)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50


In [148]:
# Fit an HMM to the part-of-speech id sequences.
# NOTE(review): 5 and 50 are presumably the number of hidden states and the
# number of training iterations (the log stops at iteration 50) -- confirm
# against HMM.unsupervised_HMM.
hmm_pos = unsupervised_HMM(parts_of_speech, 5, 50)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50


In [149]:
# Fit an HMM to the per-word syllable-count sequences.
# NOTE(review): 5 and 50 are presumably the number of hidden states and the
# number of training iterations (the log stops at iteration 50) -- confirm
# against HMM.unsupervised_HMM.
hmm_syls = unsupervised_HMM(syllables, 5, 50)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50


In [175]:
"""
Combine the results from an HMM trained on the words and an HMM trained on the syllables of the sonnets to
create a sentence of desired length

Input:
    - word_hmm: HMM trained on input of the words of the sonnets
    - syllable_hmm: HMM trained on input of the syllables of words of the sonnets
    - syllables_to_words: dictionary with keys being number of syllables and values being lists of words
        with the corresponding number of syllables
    - obs_map: dictionary mapping words to their respective integer encoding
    - n_words: number of words for the sentence to have
    
Output: A string sentence of length n_words words
"""
def create_sentence(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map, n_words):
    emission = []
    syl_seq = syllable_hmm.generate_emission(n_words)[0]
    pos_seq = pos_hmm.generate_emission(n_words)[0]
    states = word_hmm.generate_emission(n_words)[1]
    print(syl_seq)
    print(pos_seq)
    
    try:
        indices = syl_pos_dict[syl_seq[0]][pos_seq[0]]
        probs_sum = sum([word_hmm.O[0][i] for i in indices])
        probs = [word_hmm.O[0][i] / probs_sum for i in indices]
        word = np.random.choice(indices, p=probs)
        emission.append(word)

        for idx in range(1, n_words):
            indices = syl_pos_dict[syl_seq[idx]][pos_seq[idx]]
            probs_sum = sum([word_hmm.O[states[idx]][i] for i in indices])
            probs = [word_hmm.O[states[idx]][i] / probs_sum for i in indices]
            word = np.random.choice(indices, p=probs)
            emission.append(word)

        obs_map_rev = obs_map_reverser(obs_map)
        sentence_array = [obs_map_rev[i] for i in emission]

        sentence = ' '.join(sentence_array).capitalize()
        return sentence
    except KeyError as e:
        print("Could not find valid combination for generated sequences.  Retrying with new sequences...")
        return create_sentence(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map, n_words)

In [177]:
# Generate and print one 10-word sentence from the three trained HMMs.
print(create_sentence(hmm_words, hmm_syls, hmm_pos, syl_pos_dict, obs_map, 10))

[1, 2, 2, 1, 1, 1, 1, 3, 1, 1]
[1, 18, 3, 1, 4, 3, 1, 2, 4, 8]
Could not find valid combination for generated sequences.  Retrying with new sequences...
[1, 2, 1, 2, 1, 1, 1, 1, 3, 2]
[0, 1, 1, 1, 1, 1, 1, 7, 1, 1]
For actor doth widowed eye seem youth and to-morrow thousand


In [163]:
# Show the description of every tag next to its integer id.
# nltk.help.upenn_tagset prints the description itself and returns None, so
# its result is not passed to print() (that only added a stray "None" line
# after every entry).
for pos, pos_id in pos_map.items():
    print(pos_id)
    nltk.help.upenn_tagset(pos)

0
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
None
1
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
None
2
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...
None
3
PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us
None
4
RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly 