In [1]:
import numpy as np
from HMM import unsupervised_HMM
from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    sample_sentence,
    visualize_sparsities,
    animate_emission,
    obs_map_reverser
)
import random

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hannahfan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/hannahfan/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [2]:
def parse_observations(lines):
    obs_counter = 0
    obs = []
    obs_map = {}

    for line in lines:
        obs_elem = []
        
        for word in line:
            if word not in obs_map:
                # Add unique words to the observations map.
                obs_map[word] = obs_counter
                obs_counter += 1
            
            # Add the encoded word.
            obs_elem.append(obs_map[word])
        
        # Add the encoded sequence.
        obs.append(obs_elem)

    return obs, obs_map

In [3]:
# Load the data
filename = 'data/shakespeare.txt'
filename_2 = 'data/spenser.txt'

with open(filename, 'r') as f:
    with open(filename_2, 'r') as f2:
        content = f.readlines() + f2.readlines()
        lines = []
        for line in content:
            line = line.strip(',')
            if len(line) > 23:
                words = (line.split('\n')[0].split(' '))
                while '' in words : words.remove('')
                newWords = []
                for w in words:
                    w = w.strip(' ,.:;?!\'()').lower()
                    newWords.append(w)
                lines.append(newWords)

obs, obs_map = parse_observations(lines)

# Syllable data
with open('data/Syllable_dictionary.txt', 'r') as syl_f:
    syl_content = syl_f.readlines()
word_to_syl = {}
for l in syl_content:
    spl = l.split()
    if len(spl) == 3 and (spl[2] == 'E3' or spl[2] == 'E4'):
        word_to_syl[spl[0]] = spl[1]
    else:
        word_to_syl[spl[0]] = spl[len(spl)-1]
syllables = []
for l in lines:
    syl_line = []
    for w in l:
        if w in word_to_syl.keys():
            syl_line.append(int(word_to_syl.get(w)))
        else:
            new_w = "\'" + w
            if new_w in word_to_syl.keys():
                syl_line.append(int(word_to_syl.get(new_w)))
            else:
                syl_line.append(0)
    syllables.append(syl_line)

# Part of speech data
parts_of_speech = []
pos_map = {}
pos_counter = 0
for l in lines:
    pos_line = []
    for w in l:
        pos = nltk.pos_tag([w])[0][1]
        if pos not in pos_map:
            pos_map[pos] = pos_counter
            pos_counter += 1
        
        pos_line.append(pos_map[pos])
    
    parts_of_speech.append(pos_line)

# Initialize a 2D dictionary for syllables and parts of speech
syl_pos_dict = dict()
for d in range(6): 
    syl_pos_dict[d] = dict()
    
for l in syl_content:
    spl = l.split()
    v = spl[0]
    pos = pos_map[nltk.pos_tag([v])[0][1]]
    
    # emphasis handling
    if len(spl) == 3 and (spl[2] == 'E3' or spl[2] == 'E4'):
        k = int(spl[1])
    else:
        k = int(spl[len(spl)-1])
    
    if pos in syl_pos_dict[k].keys():
        syl_pos_dict[k][pos].append(obs_map[v.strip('\'')])
    else:
        syl_pos_dict[k][pos] = [obs_map[v.strip('\'')]]

In [4]:
print(pos_map)
print(len(lines))

{'IN': 0, 'NN': 1, 'NNS': 2, 'PRP': 3, 'RB': 4, 'VBD': 5, 'MD': 6, 'CC': 7, 'DT': 8, 'PRP$': 9, 'VBN': 10, 'TO': 11, 'JJ': 12, 'VBG': 13, 'WRB': 14, 'VB': 15, 'RBR': 16, 'VBZ': 17, 'WP$': 18, 'WP': 19, 'VBP': 20, 'WDT': 21, 'CD': 22, 'JJS': 23, 'JJR': 24}
3401


In [5]:
obs, obs_map = parse_observations(lines)

In [6]:
hmm_words = unsupervised_HMM(obs, 4, 80)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80


In [7]:
hmm_pos = unsupervised_HMM(parts_of_speech, 4, 80)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80


In [8]:
hmm_syls = unsupervised_HMM(syllables, 4, 80)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80


In [9]:
"""
Combine the results from an HMM trained on the words and an HMM trained on the syllables of the sonnets to
create a sentence of desired length

Input:
    - word_hmm: HMM trained on input of the words of the sonnets
    - syllable_hmm: HMM trained on input of the syllables of words of the sonnets
    - syllables_to_words: dictionary with keys being number of syllables and values being lists of words
        with the corresponding number of syllables
    - obs_map: dictionary mapping words to their respective integer encoding
    - n_syllables: number of syllables for the sentence to have
    
Output: A string sentence of length n_syllables syllables
"""
def create_sentence(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map, n_syllables):
    emission = []
    
    # Generate sequences to construct our sentence from
    syl_seq = syllable_hmm.generate_emission(n_syllables)[0]
    pos_seq = pos_hmm.generate_emission(n_syllables)[0]
    states = word_hmm.generate_emission(n_syllables)[1]
    
    try:
        # Initial word
        indices = syl_pos_dict[syl_seq[0]][pos_seq[0]]
        probs_sum = sum([word_hmm.O[0][i] for i in indices])
        probs = [word_hmm.O[0][i] / probs_sum for i in indices]
        word = np.random.choice(indices, p=probs)
        emission.append(word)

        syl_counter = syl_seq[0]
        idx = 1
        # Get new words until we reach the given number of syllables
        while syl_counter < n_syllables:
            syls = syl_seq[idx]
            if syls + syl_counter > n_syllables: syls = n_syllables - syl_counter
            syl_counter += syls
                
            indices = syl_pos_dict[syls][pos_seq[idx]]
            probs_sum = sum([word_hmm.O[states[idx]][i] for i in indices])
            probs = [word_hmm.O[states[idx]][i] / probs_sum for i in indices]
            word = np.random.choice(indices, p=probs)
            emission.append(word)
            
            idx += 1
        
        # Convert the encoded words back to English words
        obs_map_rev = obs_map_reverser(obs_map)
        sentence_array = [obs_map_rev[i] for i in emission]

        sentence = ' '.join(sentence_array).capitalize()
        return sentence
    
    # If we generate sequences which give a combination of number of syllables and part of speech
    # that is not found in the dictionary, try again
    except KeyError as e:
        create_sentence(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map, n_syllables)

In [10]:
"""
Combine the results from an HMM trained on the words and an HMM trained on the syllables of the sonnets to
create a sentence of desired length starting with a given word as the last word in the sentence

Input:
    - word_hmm: HMM trained on input of the words of the sonnets
    - syllable_hmm: HMM trained on input of the syllables of words of the sonnets
    - syllables_to_words: dictionary with keys being number of syllables and values being lists of words
        with the corresponding number of syllables
    - obs_map: dictionary mapping words to their respective integer encoding
    - n_syllables: number of syllables for the sentence to have
    - init_word: encoding of the last word in the sentence
    
Output: A string sentence of length n_syllables syllables
"""
def create_sentence_reverse(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map, n_syllables, init_word):
    emission = [init_word]
    
    # Generate sequences to build the sentence from
    syl_seq = syllable_hmm.generate_emission(n_syllables)[0]
    pos_seq = pos_hmm.generate_emission(n_syllables)[0]
    
    # Find the initial state based on the state with the highest probability of getting the given word
    init_probs_sum = sum([word_hmm.O[i][init_word] for i in range(word_hmm.L)])
    init_probs = [word_hmm.O[i][init_word] / init_probs_sum for i in range(word_hmm.L)]
    state = np.random.choice(range(word_hmm.L), p=init_probs)
    
    # Initialize syllable counter
    obs_map_rev = obs_map_reverser(obs_map)
    syl_counter = int(word_to_syl[obs_map_rev[init_word]])
    
    try:
        # Start with the second-to-last values in the sequence
        indices = syl_pos_dict[syl_seq[n_syllables - 2]][pos_seq[n_syllables - 2]]
        probs_sum = sum([word_hmm.O[state][i] for i in indices])
        probs = [word_hmm.O[state][i] / probs_sum for i in indices]
        word = np.random.choice(indices, p=probs)
        emission.append(word)
        
        syl_counter += syl_seq[n_syllables - 2]
        idx = n_syllables - 3
        while syl_counter < n_syllables:
            syls = syl_seq[idx]
            if syls + syl_counter > n_syllables: syls = n_syllables - syl_counter
            syl_counter += syls
                
            indices = syl_pos_dict[syls][pos_seq[idx]]
            
            # Backtrack to choose a previous state based on probabilities in the transition matrix
            state_probs_sum = sum([word_hmm.A[i][state] for i in range(word_hmm.L)])
            state_probs = [word_hmm.A[i][state] / state_probs_sum for i in range(word_hmm.L)]
            state = np.random.choice(range(word_hmm.L), p=state_probs)
            
            probs_sum = sum([word_hmm.O[state][i] for i in indices])
            probs = [word_hmm.O[state][i] / probs_sum for i in indices]
            word = np.random.choice(indices, p=probs)
            emission.append(word)
            
            idx -= 1
        
        # Reverse the emission sequence and convert back to English words
        emission = emission[::-1]
        
        sentence_array = [obs_map_rev[i] for i in emission]
        sentence = ' '.join(sentence_array).capitalize()
        return sentence
    
    # If we generate sequences which give a combination of number of syllables and part of speech
    # that is not found in the dictionary, try again
    except KeyError as e:
        return create_sentence_reverse(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map, n_syllables, init_word)

In [11]:
def generate_poem(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map, n_syllables):
    """
    In the generate emission poem function we want to generate emissions in the form of a the poem:
    3 quatrain, 1 couplet. 
    """
    poem_emissions = []
    poem_states = []

    sonnet_structure = [4,4,4,2] #number of lines per structure in sonnet
    for num in sonnet_structure:
        structure_emission = []
        
        for n in range(num): 
            line_emission = create_sentence(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map,n_syllables) 
            while type(line_emission) == type(None): 
                line_emission = create_sentence(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map,n_syllables) 
            if n != num-1:
                structure_emission.append(line_emission + ',\n')
            else: 
                structure_emission.append(line_emission + '.\n')
        poem_emissions.append(''.join(structure_emission))


    return ''.join(poem_emissions)

words_that_rhyme = [
    ["increase", "decease"], ["brow", "now"], ["mother", "another"], ["womb", "tomb"], ["mine", "thine"],
    ["eyes", "lies"], ["excuse", "use"], ["fair", "heir"], ["art", "depart"], ["day", "way"], ["perish", "cherish"],
    ["cold", "old"], ["decay", "away"], ["thereby", "die"], ["time", "prime"], ["night", "white"], ["go", "grow"],
    ["be", "thee"], ["live", "give"], ["time", "rhyme"], ["fair", "repair"], ["men", "pen"], ["still", "skill"],
    ["cruel", "fuel"], ["look", "book"], ["due", "true"], ["power", "hour"], ["strong", "wrong"], ["fly", "eye"]
]
def generate_rhyming_poem(word_hmm, syllable_hmm, pos_hmm, syl_pos_dict, obs_map, n_syllables):
    """
    In the generate emission poem function we want to generate emissions in the form of a the poem:
    3 quatrain, 1 couplet. 
    """
    poem_emissions = []
    poem_states = []

    sonnet_structure = [4,4,4,2] #number of lines per structure in sonnet
    for num in sonnet_structure:
        structure_emission = []
        rhymes = [random.choice(words_that_rhyme) for _ in range(num)]
        r = 0
        
        for l in range(int(num / 2)):
            if num > 2:
                # Alternating rhymes
                rhyme_word_a = obs_map[rhymes[int(r / 2)][r % 2]]
                rhyme_word_b = obs_map[rhymes[int(r / 2 + 1)][r % 2]]
                r += 1

                # Consecutive rhymes
                """rhyme_word_a = obs_map[rhymes[int(r / 2)][r % 2]]
                rhyme_word_b = obs_map[rhymes[int(r / 2)][(r + 1) % 2]]
                r += 2"""
            else:
                rhyme_word_a = obs_map[rhymes[int(r / 2)][r % 2]]
                rhyme_word_b = obs_map[rhymes[int(r / 2)][(r + 1) % 2]]
                r += 2
            
            line_emission = create_sentence_reverse(hmm_words, hmm_syls, hmm_pos, syl_pos_dict, obs_map, 10, rhyme_word_a)
            
            if l * 2 != num-1:
                structure_emission.append(line_emission + ',\n')
            else: 
                structure_emission.append(line_emission + '.\n')
            
            line_emission = create_sentence_reverse(hmm_words, hmm_syls, hmm_pos, syl_pos_dict, obs_map, 10, rhyme_word_b)
            
            if l * 2 + 1 != num-1:
                structure_emission.append(line_emission + ',\n')
            else: 
                structure_emission.append(line_emission + '.\n')
                
        poem_emissions.append(''.join(structure_emission))


    return ''.join(poem_emissions)

def sample_sonnet(hmm, obs_map, n_words):
    obs_map_r = obs_map_reverser(obs_map)
    
    emission, states = generate_emission_poem(n_words)

    poem = []
    for e in emission:
        for i, s in enumerate(e):
            sentence = []
            for l in s:
                sentence.append(obs_map_r[l])
            if i != len(e)-1 : 
                poem.append(' '.join(sentence).capitalize() + ',\n')
            else: 
                poem.append(' '.join(sentence).capitalize() + '.\n')
            
    return ''.join(poem)

In [12]:
print(generate_rhyming_poem(hmm_words, hmm_syls, hmm_pos, syl_pos_dict, obs_map, 10))

Good thus ye thee doth beshrew dear love womb,
World of love's interim say my cannot womb,
Was vow reigned should her bareness where as tomb,
Qualify bearer wound of eyes or tomb.
Sacred stopped intelligence or life live,
Thee that all once a the thought thrusts excuse,
Huge they night pitch every doth thee thou give,
Justify let art raised thing in art use.
Budding in advantage more or cruel,
Himself darkness none i do thou will live,
Wilt the burn difference a weeds with fuel,
Foes sweets by quite which thee is love griefs give.
Sing proof i if write are deny cruel,
I true still too against be light thou fuel.



In [18]:
print(generate_poem(hmm_words, hmm_syls, hmm_pos, syl_pos_dict, obs_map, 10))

More time thine that her again to unto,
Above penance his thee made be spirit,
Such both reason himself may thou other,
Hast in fly worms within let mother's shame.
Any sing i woos end i be heart on,
Crow so that endured proved upon thou men,
Neither much frailties the me i th may the,
Her strong our guides proud level doth too.
Th i a love i themselves to such long joy,
Death loves awake old and subject to part,
That corrupt bends every upon themselves,
Another allege suffered and as her.
Then more my day to doth than you settled,
Since from mind loves as wind of another.



In [15]:
print(create_sentence_reverse(hmm_words, hmm_syls, hmm_pos, syl_pos_dict, obs_map, 10, obs_map["old"]))
print(create_sentence_reverse(hmm_words, hmm_syls, hmm_pos, syl_pos_dict, obs_map, 10, obs_map["cold"]))

Course every our shine but flowers old
Tear i age be in it worse hell fair cold


In [16]:
for pos in pos_map.keys():
    print(pos_map[pos])
    print(nltk.help.upenn_tagset(pos))

0
IN: preposition or conjunction, subordinating
    astride among uppon whether out inside pro despite on by throughout
    below within for towards near behind atop around if like until below
    next into if beside ...
None
1
NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...
None
2
NNS: noun, common, plural
    undergraduates scotches bric-a-brac products bodyguards facets coasts
    divestitures storehouses designs clubs fragrances averages
    subjectivists apprehensions muses factory-jobs ...
None
3
PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us
None
4
RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly 