In [1]:
import numpy as np
import matplotlib.pyplot as plt
import itertools
import nltk
from nltk.corpus import cmudict
nltk.download('cmudict')
import pickle

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/kushaltirumala/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [2]:
with open("../data/shakespeare.txt", "r") as f:
    data = f.read()

In [3]:
def get_sonnets(filename):
    """ Returns list of separate sonnets in the file. 
    Each line of a sonnet is a list of words (tokens). """
    with open(filename) as file:
        sons, son = [], []
        for line in file:
            line = line.strip()
            if line.isdigit() == False:
                lst = line.lower().translate(str.maketrans('', '', ':;,.?!()')).split()
                if len(lst) > 0: # count words in line
                    son.append(lst)
                elif len(son) > 0:
                    sons.append(son)
                    son = []
    sons.append(son) # add the final sonnet
    return sons
file = '../data/shakespeare.txt'
sonnets = get_sonnets(file)

# The sonnets 99 and 126 (at indices 98 and 125)
# do not count as sonnets, since they are not 14 lines long
for i in range(len(sonnets)):
    if len(sonnets[i]) != 14:
        print(i)
        
sonnets = sonnets[:98] + sonnets[99:125] + sonnets[126:]

d = cmudict.dict()
def count_syl(word):
    """ Returns number of syllables in a word. Taken
    from StackOverflow. """
    if word in d:
        return len(list(y for y in d[word][0] if y[-1].isdigit()))
    count = 0
    vowels = 'aeiouy'
    
    if word[0] in vowels:
        count +=1
    for index in range(1,len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count+=1
    if count == 0:
        count +=1
    return count

def get_syl(dic_to_ids):
    """ Returns dictionary of IDs to syllables given 
    a dictionary mapping words to ids. """
    dic = {}
    for word in dic_to_ids.keys():
        id_ = dic_to_ids[word]
        if "'" in word:
            dic[id_] = 1
        else:
            dic[id_] = sum(count_syl(w) for w in word.split("-"))
        
    return dic

def get_id(lines):
    """ Returns dictionary of words to IDs and 
    dictionary of IDs to words. """
    dic_to_ids, dic_to_words = {}, {}
    id_ = 0
    for line in lines:
        for word in line:
            if word not in dic_to_ids:
                dic_to_ids[word] = id_
                dic_to_words[id_] = word
                id_ += 1
    return dic_to_ids, dic_to_words

def get_mappings(sonnets, name):
    """ Returns dictionary of words to IDs, dictionary of IDs to words, 
    and dictionary of IDs to syllables. """    
    lines = list(itertools.chain.from_iterable(sonnets)) # list of lines
    dic_to_ids, dic_to_words = get_id(lines)
    dic_syl = get_syl(dic_to_ids)
    
    with open(name, 'wb') as f:
        pickle.dump((dic_to_ids, dic_to_words, dic_syl), f)
    
    return dic_to_ids, dic_to_words, dic_syl

dic_to_ids, dic_to_words, dic_syl = get_mappings(sonnets, 'shakespeare_dics.pkl')

98
125


In [4]:
ids_to_dic = {v: k for k,v in dic_to_ids.items()}

In [5]:
import copy

hehe = copy.deepcopy(sonnets)
for i in range(len(sonnets)):
    for j in range(len(sonnets[i])):
        for k in range(len(sonnets[i][j])):
            hehe[i][j][k] = dic_to_ids[sonnets[i][j][k]]
hehe = np.array(hehe)

In [6]:
fin = []
for i in range(len(hehe)):
    temp = []
    for j in range(len(hehe[i])):
        for k in range(len(hehe[i][j])):
            temp.append(hehe[i][j][k])
    fin.append(temp)
fin = np.array(fin)

In [79]:
from HMM_sol import *
# want to seed words, and generate the sonnet in reverse based on that seeded word
# so first we generate the training data for the end of line stuff that (hopefully rhymes)
def words_to_ids(data):
    ans = copy.deepcopy(data)
    for i in range(len(data)):
        for j in range(len(data[i])):
            ans[i][j] = dic_to_ids[data[i][j]]
    return ans

In [85]:
r = []
for i in range(len(hehe)):
    temp = []
    for j in range(len(hehe[i])):
        temp.append(hehe[i][j][len(hehe[i][j]) - 1])
    r.append(temp)
HMM_rhyming = unsupervised_HMM(r, 30, 200, D_len=len(ids_to_dic))

3202
Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200


In [99]:
emission, states = HMM_rhyming.generate_emission(14)

In [100]:
x = ' '.join([str(ids_to_dic[i]) for i in emission])

In [101]:
x

'state acquainted done depart pride lies friend amazeth ordering thence dignity offence dead flatter'

In [31]:
import pickle
with open('../shakespeare_rhymes.pkl', 'rb') as f:
    rhyme_data_raw = pickle.load(f)
    

In [33]:
rhyme_data_raw

[['cease', 'increase', 'lease', 'decease', 'decrease', 'excess'],
 ['i',
  'fee',
  'die',
  'memory',
  'eye',
  'legacy',
  'deny',
  'qualify',
  'thee',
  'fly',
  'by',
  'majesty',
  'usury',
  'husbandry',
  'dignity',
  'sky',
  'eternity',
  'me',
  'canopy',
  'be',
  'ye',
  'see',
  'gravity',
  'thereby',
  'lie',
  'flattery',
  'defy',
  'melancholy',
  'alchemy',
  'history',
  'enmity',
  'constancy',
  'why',
  'posterity',
  'decree',
  'free',
  'fortify',
  'remedy',
  'idolatry',
  'masonry'],
 ['devise',
  'prophecies',
  'cries',
  'spies',
  'despise',
  'eyes',
  'subtleties',
  'lies',
  'arise'],
 ['cruel', 'fuel', 'jewel'],
 ['content',
  'excellent',
  'spent',
  'invent',
  'argument',
  'rent',
  'ornament',
  'monument'],
 ['bring',
  'ordering',
  'thing',
  'niggarding',
  'spring',
  'prefiguring',
  'wing',
  'king',
  'sing'],
 ['allow', 'bow', 'mow', 'brow', 'now', 'bough', 'how'],
 ['stelled', 'held', 'field'],
 ['days', 'decays', 'praise', 'lays

# fuck learning rhyme

In [107]:
rhyme_data_raw

[['cease', 'increase', 'lease', 'decease', 'decrease', 'excess'],
 ['i',
  'fee',
  'die',
  'memory',
  'eye',
  'legacy',
  'deny',
  'qualify',
  'thee',
  'fly',
  'by',
  'majesty',
  'usury',
  'husbandry',
  'dignity',
  'sky',
  'eternity',
  'me',
  'canopy',
  'be',
  'ye',
  'see',
  'gravity',
  'thereby',
  'lie',
  'flattery',
  'defy',
  'melancholy',
  'alchemy',
  'history',
  'enmity',
  'constancy',
  'why',
  'posterity',
  'decree',
  'free',
  'fortify',
  'remedy',
  'idolatry',
  'masonry'],
 ['devise',
  'prophecies',
  'cries',
  'spies',
  'despise',
  'eyes',
  'subtleties',
  'lies',
  'arise'],
 ['cruel', 'fuel', 'jewel'],
 ['content',
  'excellent',
  'spent',
  'invent',
  'argument',
  'rent',
  'ornament',
  'monument'],
 ['bring',
  'ordering',
  'thing',
  'niggarding',
  'spring',
  'prefiguring',
  'wing',
  'king',
  'sing'],
 ['allow', 'bow', 'mow', 'brow', 'now', 'bough', 'how'],
 ['stelled', 'held', 'field'],
 ['days', 'decays', 'praise', 'lays

In [111]:
# we first train an HMM on the backward sequence
reversed_data = []
for sonnet in hehe:
    for line in sonnet:
        reversed_data.append(reverse(line))
reversed_data

NameError: name 'reverse' is not defined