In [1]:
import copy
import itertools
import pickle
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.corpus import cmudict

nltk.download('cmudict')

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/kushaltirumala/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [2]:
# Read the whole sonnet corpus into a single string.
# NOTE(review): no later cell shown in this notebook reads `data`;
# get_sonnets() below re-opens the same file itself.
with open("../data/shakespeare.txt", "r") as f:
    data = f.read()

In [3]:
def get_sonnets(filename):
    """ Returns list of separate sonnets in the file.

    Each sonnet is a list of lines and each line is a list of lower-cased
    words (tokens) with the punctuation ``:;,.?!()`` removed.

    Lines that consist only of digits (the sonnet numbers) are ignored.
    A blank line (or a line that is empty after stripping punctuation)
    closes the sonnet currently being collected.

    Parameters
    ----------
    filename : str
        Path of the text file to parse.

    Returns
    -------
    list of list of list of str
        The sonnets in file order. Empty sonnets are never returned, so an
        empty file or trailing blank lines do not add a spurious ``[]``.
    """
    # Build the punctuation-stripping table once instead of once per line.
    strip_punct = str.maketrans('', '', ':;,.?!()')
    sons, son = [], []
    with open(filename) as file:
        for line in file:
            line = line.strip()
            if line.isdigit():
                # Sonnet number heading: carries no words.
                continue
            lst = line.lower().translate(strip_punct).split()
            if lst:
                son.append(lst)
            elif son:
                # Blank line after at least one collected line ends the sonnet.
                sons.append(son)
                son = []
    # Add the final sonnet only if the file did not already close it.
    if son:
        sons.append(son)
    return sons
file = '../data/shakespeare.txt'
sonnets = get_sonnets(file)

# The sonnets 99 and 126 (at indices 98 and 125)
# do not count as sonnets, since they are not 14 lines long.
# Report the index of every poem with the wrong length, then drop those
# poems by length rather than by hard-coded position, so the filter stays
# correct if the input file changes.
SONNET_LINES = 14
for i, sonnet in enumerate(sonnets):
    if len(sonnet) != SONNET_LINES:
        print(i)

sonnets = [sonnet for sonnet in sonnets if len(sonnet) == SONNET_LINES]

# CMU Pronouncing Dictionary loaded as a dict. count_syl() below reads
# d[word][0] and counts the entries whose last character is a digit.
d = cmudict.dict()
def count_syl(word, pron_dict=None):
    """ Returns number of syllables in a word. Taken
    from StackOverflow.

    If the word is in the pronouncing dictionary, the count is the number
    of entries in its first pronunciation that end in a digit. Otherwise a
    spelling heuristic is used: count the starts of vowel groups, subtract
    one for a trailing 'e', add one back for a trailing 'le', and never
    return less than one for a non-empty word.

    Parameters
    ----------
    word : str
        Word to measure. An empty string yields 0, so callers that split
        on "-" do not crash on leading, trailing or doubled hyphens.
    pron_dict : dict, optional
        Mapping word -> list of pronunciations. Defaults to the
        notebook-level CMU dictionary ``d``.

    Returns
    -------
    int
        Number of syllables.
    """
    if not word:
        # Nothing to count; also avoids indexing word[0] below.
        return 0
    if pron_dict is None:
        pron_dict = d
    if word in pron_dict:
        return sum(1 for y in pron_dict[word][0] if y[-1].isdigit())

    vowels = 'aeiouy'
    count = 1 if word[0] in vowels else 0
    # Each vowel that follows a non-vowel starts a new vowel group.
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count

def get_syl(dic_to_ids):
    """ Builds the ID -> syllable-count dictionary.

    dic_to_ids maps each word to its ID. A word containing an apostrophe
    is assigned exactly 1 syllable; any other word is split on "-" and
    the count_syl() results of its parts are added together. """
    return {
        word_id: 1 if "'" in word else sum(count_syl(part) for part in word.split("-"))
        for word, word_id in dic_to_ids.items()
    }

def get_id(lines):
    """ Assigns consecutive integer IDs (starting at 0) to words in order
    of first appearance.

    lines is an iterable of token lists. Returns the pair
    (word -> ID dictionary, ID -> word dictionary). """
    word_to_id, id_to_word = {}, {}
    for tokens in lines:
        for token in tokens:
            if token in word_to_id:
                continue
            # The next free ID always equals the number of words seen so far.
            new_id = len(word_to_id)
            word_to_id[token] = new_id
            id_to_word[new_id] = token
    return word_to_id, id_to_word

def get_mappings(sonnets, name):
    """ Builds the three vocabulary lookups for a list of sonnets and
    pickles them to the file `name`.

    Returns the tuple (words -> IDs, IDs -> words, IDs -> syllables);
    the same tuple is what gets written to disk. """
    # Flatten the sonnets into one list of lines.
    all_lines = [line for sonnet in sonnets for line in sonnet]
    word_ids, id_words = get_id(all_lines)
    id_syllables = get_syl(word_ids)
    mappings = (word_ids, id_words, id_syllables)

    with open(name, 'wb') as out:
        pickle.dump(mappings, out)

    return mappings

# Build the word/ID/syllable lookups for the filtered sonnets. get_mappings
# also pickles the three dictionaries to 'shakespeare_dics.pkl'.
dic_to_ids, dic_to_words, dic_syl = get_mappings(sonnets, 'shakespeare_dics.pkl')

98
125


In [4]:
# Inverse lookup (ID -> word), rebuilt from dic_to_ids. get_id() already
# produced the same mapping as dic_to_words; later cells use this name.
ids_to_dic = {v: k for k,v in dic_to_ids.items()}

In [5]:
# Encode every sonnet as word IDs: hehe[i][j][k] is the ID of word k in
# line j of sonnet i.
# Lines have different numbers of words, so the nested lists are ragged.
# dtype=object is required: without it NumPy >= 1.24 raises ValueError for
# ragged input (older versions only warned). The result is an object array
# whose elements are ordinary Python lists of IDs.
hehe = np.array(
    [[[dic_to_ids[word] for word in line] for line in sonnet] for sonnet in sonnets],
    dtype=object,
)

In [6]:
# Flatten each sonnet into one sequence of word IDs (line breaks dropped).
# Sonnets differ in total word count, so the result is ragged and needs
# dtype=object; NumPy >= 1.24 raises ValueError for ragged input otherwise.
# NOTE(review): no later cell shown in this notebook reads `fin`.
fin = np.array(
    [[word_id for line in sonnet for word_id in line] for sonnet in hehe],
    dtype=object,
)

In [79]:
from HMM_sol import *
# We want to seed each line with its final word and generate the sonnet in reverse from that seed word,
# so first we build the training data from the end-of-line words (which hopefully rhyme).
def words_to_ids(data):
    """ Returns a deep copy of `data` (a sequence of word sequences) in
    which every word is replaced by its ID from dic_to_ids. The input
    itself is left unchanged. """
    encoded = copy.deepcopy(data)
    for row_idx, row in enumerate(data):
        for col_idx, token in enumerate(row):
            encoded[row_idx][col_idx] = dic_to_ids[token]
    return encoded

In [85]:
# One training sequence per sonnet: the ID of the last word of each line.
r = [[line[-1] for line in sonnet] for sonnet in hehe]
HMM_rhyming = unsupervised_HMM(r, 30, 200, D_len=len(ids_to_dic))

3202
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200


In [99]:
# Sample a length-14 sequence from the line-ending model, one item per line
# of a sonnet. The next cell decodes `emission` as word IDs.
emission, states = HMM_rhyming.generate_emission(14)

In [100]:
# Decode the sampled word IDs back into words, separated by spaces.
x = ' '.join([str(ids_to_dic[i]) for i in emission])

In [101]:
# Show the sampled sequence of line-ending words.
x

'state acquainted done depart pride lies friend amazeth ordering thence dignity offence dead flatter'

In [31]:
# `pickle` is already imported in the first cell; repeating the import here
# is redundant but harmless.
import pickle
# Load the rhyme data from the pickle file one directory above the notebook.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file if it comes from a trusted source.
with open('../shakespeare_rhymes.pkl', 'rb') as f:
    rhyme_data_raw = pickle.load(f)
    

In [33]:
# Display the loaded rhyme data. The output below shows a list of word
# lists, each of which appears to hold words that rhyme with one another.
rhyme_data_raw

[['cease', 'increase', 'lease', 'decease', 'decrease', 'excess'],
 ['i',
  'fee',
  'die',
  'memory',
  'eye',
  'legacy',
  'deny',
  'qualify',
  'thee',
  'fly',
  'by',
  'majesty',
  'usury',
  'husbandry',
  'dignity',
  'sky',
  'eternity',
  'me',
  'canopy',
  'be',
  'ye',
  'see',
  'gravity',
  'thereby',
  'lie',
  'flattery',
  'defy',
  'melancholy',
  'alchemy',
  'history',
  'enmity',
  'constancy',
  'why',
  'posterity',
  'decree',
  'free',
  'fortify',
  'remedy',
  'idolatry',
  'masonry'],
 ['devise',
  'prophecies',
  'cries',
  'spies',
  'despise',
  'eyes',
  'subtleties',
  'lies',
  'arise'],
 ['cruel', 'fuel', 'jewel'],
 ['content',
  'excellent',
  'spent',
  'invent',
  'argument',
  'rent',
  'ornament',
  'monument'],
 ['bring',
  'ordering',
  'thing',
  'niggarding',
  'spring',
  'prefiguring',
  'wing',
  'king',
  'sing'],
 ['allow', 'bow', 'mow', 'brow', 'now', 'bough', 'how'],
 ['stelled', 'held', 'field'],
 ['days', 'decays', 'praise', 'lays

# No: don't try to learn rhyme with an HMM! (The cells below pick the line-ending words from `rhyme_data_raw` instead.)

In [112]:
# Display the rhyme data again (same expression as the earlier inspection cell).
rhyme_data_raw

[['cease', 'increase', 'lease', 'decease', 'decrease', 'excess'],
 ['i',
  'fee',
  'die',
  'memory',
  'eye',
  'legacy',
  'deny',
  'qualify',
  'thee',
  'fly',
  'by',
  'majesty',
  'usury',
  'husbandry',
  'dignity',
  'sky',
  'eternity',
  'me',
  'canopy',
  'be',
  'ye',
  'see',
  'gravity',
  'thereby',
  'lie',
  'flattery',
  'defy',
  'melancholy',
  'alchemy',
  'history',
  'enmity',
  'constancy',
  'why',
  'posterity',
  'decree',
  'free',
  'fortify',
  'remedy',
  'idolatry',
  'masonry'],
 ['devise',
  'prophecies',
  'cries',
  'spies',
  'despise',
  'eyes',
  'subtleties',
  'lies',
  'arise'],
 ['cruel', 'fuel', 'jewel'],
 ['content',
  'excellent',
  'spent',
  'invent',
  'argument',
  'rent',
  'ornament',
  'monument'],
 ['bring',
  'ordering',
  'thing',
  'niggarding',
  'spring',
  'prefiguring',
  'wing',
  'king',
  'sing'],
 ['allow', 'bow', 'mow', 'brow', 'now', 'bough', 'how'],
 ['stelled', 'held', 'field'],
 ['days', 'decays', 'praise', 'lays

In [118]:
# we first train an HMM on the backward sequence
# Every line of every sonnet, reversed, as one flat list of sequences.
# Each reversed line is a NEW list, so the lines stored in `hehe` are left
# untouched. Reversing them in place would flip the data back every time
# this cell is re-run and would silently change what other cells that read
# `hehe` see.
reversed_data = [list(reversed(line)) for sonnet in hehe for line in sonnet]

[[5, 4, 3, 2, 1, 0],
 [12, 11, 10, 9, 8, 7, 6],
 [20, 19, 18, 17, 16, 15, 14, 13],
 [25, 21, 24, 10, 23, 22, 21],
 [32, 31, 30, 29, 28, 27, 26, 13],
 [39, 38, 37, 36, 35, 34, 33],
 [45, 44, 43, 42, 41, 40],
 [50, 49, 46, 48, 34, 28, 47, 34, 46, 34],
 [55, 54, 53, 15, 52, 51, 6, 26],
 [60, 59, 15, 28, 58, 57, 56],
 [64, 34, 63, 62, 30, 29, 61],
 [69, 68, 67, 66, 65, 22, 56],
 [76, 75, 74, 73, 72, 71, 15, 70],
 [80, 56, 79, 15, 18, 78, 53, 15, 77, 28],
 [86, 34, 85, 84, 83, 82, 81],
 [90, 8, 34, 68, 89, 88, 87, 56],
 [52, 96, 95, 94, 93, 92, 91, 34],
 [103, 102, 101, 100, 99, 98, 41, 76, 97],
 [45, 108, 34, 107, 43, 106, 105, 104],
 [111, 110, 34, 100, 109, 15, 107, 43],
 [32, 113, 88, 30, 29, 61, 112, 28],
 [119, 118, 56, 117, 116, 115, 114],
 [124, 8, 34, 123, 119, 122, 121, 120],
 [131, 100, 130, 129, 128, 127, 126, 26, 125],
 [137, 136, 133, 135, 56, 134, 133, 132, 84],
 [29, 139, 18, 108, 21, 138],
 [136, 51, 26, 81, 141, 140, 76, 28, 114, 74],
 [147, 146, 145, 26, 81, 144, 143, 34,

In [121]:
# Deep copy, so the training input shares no list objects with reversed_data.
rdata = copy.deepcopy(reversed_data)


In [123]:
# Train the backward (reversed-line) model. The recorded output shows the
# run stopping at iteration 10; 20 is presumably the number of hidden
# states -- confirm against unsupervised_HMM in HMM_sol.
HMM_reversed_20 = unsupervised_HMM(rdata, 20, 10)

3202
Iteration: 10


In [156]:
def get_rhyming_words(word, rhyme_data):
    """ Returns the first list in rhyme_data that contains `word`
    (the list object itself, not a copy), or an empty list when no
    list contains it. """
    return next((group for group in rhyme_data if word in group), [])

def choose_ending_words(rhyme_data, n_pairs=7):
    """ Picks line-ending words as consecutive rhyming pairs.

    For each pair a rhyme group is chosen uniformly at random among the
    groups that still have at least two unused words, and two different
    entries are drawn from it without replacement. The result therefore
    has the layout aabb ccdd ... (words 2k and 2k+1 rhyme). A group may
    be chosen for more than one pair as long as it has words left.

    Only groups with two or more remaining words are eligible. Picking a
    group with fewer (a one-word group, or a small group drawn a second
    time) would leave nothing to choose the second word from.

    Parameters
    ----------
    rhyme_data : list of list of str
        Groups of mutually rhyming words. Not modified.
    n_pairs : int, optional
        Number of rhyming pairs to draw (default 7, i.e. 14 words).

    Returns
    -------
    list of str
        2 * n_pairs words.

    Raises
    ------
    ValueError
        If the groups do not hold enough words for n_pairs pairs.
    """
    # Work on copies so the caller's rhyme groups are never modified.
    pools = [list(group) for group in rhyme_data]
    words = []
    for _ in range(n_pairs):
        eligible = [pool for pool in pools if len(pool) >= 2]
        if not eligible:
            raise ValueError(
                "rhyme_data does not contain enough rhyming words for %d pairs" % n_pairs
            )
        pool = random.choice(eligible)
        pair = random.sample(pool, 2)
        # Remove both words so that no word is used twice.
        for word in pair:
            pool.remove(word)
        words.extend(pair)
    return words

In [176]:
# Build word -> syllable count from the syllable dictionary file.
# Each line is split on single spaces: the first token is the word and the
# first later token made up only of digits is taken as its count. Lines
# with no such token add nothing.
meter_data = {}
with open("../data/Syllable_dictionary.txt", "r") as f:
    for line in f:
        word, *count = line.strip().split(" ")
        first_numeric = next((tok for tok in count if tok.isdigit()), None)
        if first_numeric is not None:
            meter_data[word] = int(first_numeric)

In [157]:
# Draw the 14 line-ending words as 7 consecutive rhyming pairs,
# i.e. in the form aabb ccdd eeff gg
end_words = choose_ending_words(rhyme_data_raw)

In [172]:
# start at the state most likely to have that end_word
# i.e. the row of O with the largest entry in the column of the word's ID.
# (generate_emission_sequential below repeats this lookup internally.)
start_word = end_words[1]
temp = np.array(HMM_reversed_20.O)
most_probable_start_state = np.argmax(temp[:, dic_to_ids[start_word]])

12

In [190]:
def generate_emission_sequential(syl_count, A, O, start_word,
                                 word_to_id=None, id_to_word=None,
                                 syllables=None, max_state_retries=100):
    """ Generates one line, starting from `start_word`, whose words add up
    to exactly `syl_count` syllables.

    The chain starts in the state whose row of O has the largest entry
    for `start_word`. For every further word the next hidden state is
    sampled from the current state's row of A, and the word is sampled
    from that state's row of O restricted to the words that still fit in
    the remaining syllable budget (probabilities renormalised). This is
    the same distribution as re-sampling until a word fits, but it cannot
    loop forever. If `start_word` alone already reaches or exceeds
    `syl_count`, only that word is returned.

    Parameters
    ----------
    syl_count : int
        Target number of syllables for the line.
    A : sequence of sequences of float
        State transition matrix; A[i] must sum to 1.
    O : sequence of sequences of float
        Emission matrix, indexed O[state][word_id].
    start_word : str
        Word that seeds the line (first element of the emission).
    word_to_id, id_to_word, syllables : dict, optional
        Word -> ID, ID -> word and word -> syllable-count lookups. They
        default to the notebook-level dic_to_ids, ids_to_dic and
        meter_data. Words without a syllable count are never emitted.
    max_state_retries : int, optional
        How many successor states to try when the sampled state cannot
        emit any word that fits.

    Returns
    -------
    (list of int, list of int)
        The emitted word IDs and the hidden state used for each of them
        (both lists have the same length).

    Raises
    ------
    KeyError
        If `start_word` is missing from word_to_id or syllables.
    RuntimeError
        If no fitting word could be found after max_state_retries tries.
    """
    # Fall back to the notebook-level lookups when none are supplied.
    if word_to_id is None:
        word_to_id = dic_to_ids
    if id_to_word is None:
        id_to_word = ids_to_dic
    if syllables is None:
        syllables = meter_data

    emission_probs = np.asarray(O, dtype=float)
    n_states, n_words = emission_probs.shape

    # Syllable count per word ID. Unknown words get +inf so they never fit.
    syl_per_id = np.array(
        [syllables.get(id_to_word.get(word_id), np.inf) for word_id in range(n_words)],
        dtype=float,
    )

    start_id = word_to_id[start_word]
    # Start in the state that emits the seed word with the highest probability.
    y_i = int(np.argmax(emission_probs[:, start_id]))
    emission = [start_id]
    states = [y_i]
    counter = syllables[start_word]

    while counter < syl_count:
        remaining = syl_count - counter
        fits = syl_per_id <= remaining
        for _ in range(max_state_retries):
            # Advance the hidden state using the transition matrix.
            candidate = int(np.random.choice(n_states, p=A[y_i]))
            weights = emission_probs[candidate] * fits
            total = weights.sum()
            if total > 0:
                break
        else:
            raise RuntimeError(
                "no word fits the remaining %d syllable(s) from state %d"
                % (remaining, y_i)
            )
        y_i = candidate
        observation_index = int(np.random.choice(n_words, p=weights / total))
        emission.append(observation_index)
        states.append(y_i)
        counter += int(syl_per_id[observation_index])

    return emission, states

In [191]:
# Trial run: one 10-syllable line seeded with the first chosen ending word.
# The word order is still reversed here (the seed word comes first).
emission, states = generate_emission_sequential(10, HMM_reversed_20.A, HMM_reversed_20.O, end_words[0])

In [201]:
# Generate the 14 lines. Each line is seeded with its ending word, built
# backwards to 10 syllables, then reversed into reading order and decoded
# from word IDs into text.
poem = []
for line_no in range(14):
    seed_word = end_words[line_no]
    emission, states = generate_emission_sequential(
        10, HMM_reversed_20.A, HMM_reversed_20.O, seed_word
    )
    emission.reverse()
    x = ' '.join(str(ids_to_dic[word_id]) for word_id in emission)
    poem.append(x)

In [206]:
# switch order
# The lines are generated in the order aabb ccdd eeff gg. Exchanging the
# two middle lines of each quatrain gives abab cdcd efef gg.
# The reordered poem is built as a new list and `poem` is left unchanged.
# Swapping inside `poem` itself would undo the reordering every second
# time this cell is run.
ABAB_ORDER = [0, 2, 1, 3, 4, 6, 5, 7, 8, 10, 9, 11, 12, 13]
poem_abab = [poem[k] for k in ABAB_ORDER]

for line in poem_abab:
    print(line)

shaken accidents rehearse bar admire
alone after-loss kill reap trim require
die of him now of peace that my praised bail
indeed estimate recite sit same gaol
sit denied so pay how gone knowing drink
beauteous when selling thing remembered forth
at knife robs abused o from by for think
so murd'rous be marvel unlettered worth
deem over-plus suppose night dye begin
thine in alone boast wide despised sin
hour divine fee muse bold crowned possessing
scorn rents reign accidents need releasing
awake delight new tied which bow you fits
slide effect cherish memory free hits
