In [1]:
import numpy as np
import matplotlib.pyplot as plt
import itertools
import nltk
from nltk.corpus import cmudict
nltk.download('cmudict')
import pickle

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/kushaltirumala/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [2]:
# Read the whole sonnet corpus into a single string.
# NOTE(review): `data` is not referenced again in the visible cells;
# get_sonnets() below opens the same path itself.
with open("../data/shakespeare.txt", "r") as f:
    data = f.read()

In [3]:
def get_sonnets(filename):
    """Parse a sonnet file into a list of tokenized sonnets.

    Lines consisting only of digits (sonnet numbers) are skipped. Every other
    non-empty line is lower-cased, stripped of the characters ``:;,.?!()`` and
    split on whitespace into a list of word tokens. A blank line closes the
    sonnet that is currently being collected.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of sonnets; each sonnet is a list of lines and each line is a
        list of word tokens. Empty sonnets are never returned, even when the
        file is empty or ends with blank lines.
    """
    strip_punct = str.maketrans('', '', ':;,.?!()')
    sons, son = [], []
    with open(filename) as file:
        for line in file:
            line = line.strip()
            if line.isdigit():
                # Sonnet number heading: neither content nor a separator.
                continue
            lst = line.lower().translate(strip_punct).split()
            if lst:
                son.append(lst)
            elif son:
                # Blank (or punctuation-only) line ends the current sonnet.
                sons.append(son)
                son = []
    # Add the final sonnet only if one is still open; appending unconditionally
    # would produce a trailing [] when the file ends with a blank line.
    if son:
        sons.append(son)
    return sons
file = '../data/shakespeare.txt'
sonnets = get_sonnets(file)

# Sonnets 99 and 126 (at indices 98 and 125) do not count as sonnets here,
# since they are not 14 lines long. Print the index of every such entry so
# the exclusions stay visible in the cell output.
for i, sonnet in enumerate(sonnets):
    if len(sonnet) != 14:
        print(i)

# Keep only the 14-line sonnets. Filtering by length (instead of slicing
# around hard-coded indices) drops exactly the entries printed above and
# keeps working if the input file changes.
sonnets = [sonnet for sonnet in sonnets if len(sonnet) == 14]

# Pronunciation lookup built from NLTK's cmudict corpus; count_syl() below
# reads d[word][0] for words that are present in it.
d = cmudict.dict()
def count_syl(word):
    """Return the number of syllables in a word.

    If the word is in the module-level pronunciation dictionary ``d``, the
    count is the number of phonemes in its first listed pronunciation that end
    in a digit (CMUdict marks vowel phonemes with a stress digit). Otherwise a
    spelling heuristic (taken from StackOverflow) is used: count the starts of
    vowel groups, subtract one for a trailing 'e', add one back for a trailing
    'le', and never return less than 1 for a non-empty word.

    Args:
        word: A single lower-case token without hyphens.

    Returns:
        The syllable count; 0 for the empty string (e.g. the empty fragment
        produced by splitting a word with a leading/trailing hyphen).
    """
    if not word:
        # Avoid IndexError on word[0]; an empty fragment has no syllables.
        return 0
    if word in d:
        return sum(1 for phoneme in d[word][0] if phoneme[-1].isdigit())

    count = 0
    vowels = 'aeiouy'

    if word[0] in vowels:
        count += 1
    # Count every vowel that follows a non-vowel, i.e. each new vowel group.
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count

def get_syl(dic_to_ids):
    """ Builds the ID -> syllable-count dictionary for a word -> ID dictionary.
    Words containing an apostrophe are counted as one syllable; all other
    words are split on "-" and the count_syl values of the parts are summed. """
    syllables = {}
    for word, word_id in dic_to_ids.items():
        if "'" not in word:
            syllables[word_id] = sum(map(count_syl, word.split("-")))
        else:
            syllables[word_id] = 1
    return syllables

def get_id(lines):
    """ Assigns consecutive integer IDs (starting at 0) to words in order of
    first appearance. Returns the word -> ID dictionary and the matching
    ID -> word dictionary. """
    word_to_id, id_to_word = {}, {}
    for tokens in lines:
        for token in tokens:
            if token in word_to_id:
                continue
            # The next free ID is always the number of words seen so far.
            next_id = len(word_to_id)
            word_to_id[token] = next_id
            id_to_word[next_id] = token
    return word_to_id, id_to_word

def get_mappings(sonnets, name):
    """ Builds the three vocabulary dictionaries for a list of sonnets
    (words -> IDs, IDs -> words, IDs -> syllables), pickles them as one
    tuple to the file `name`, and returns them in that order. """
    # Flatten the sonnets into one list of lines.
    all_lines = [line for sonnet in sonnets for line in sonnet]
    dic_to_ids, dic_to_words = get_id(all_lines)
    mappings = (dic_to_ids, dic_to_words, get_syl(dic_to_ids))

    with open(name, 'wb') as f:
        pickle.dump(mappings, f)

    return mappings

# Build the vocabulary mappings for the 14-line sonnets; get_mappings also
# pickles the three dictionaries to 'shakespeare_dics.pkl'.
dic_to_ids, dic_to_words, dic_syl = get_mappings(sonnets, 'shakespeare_dics.pkl')

98
125


In [4]:
# Inverse lookup (ID -> word), used to turn generated IDs back into text.
# get_id() already builds this same mapping and returns it as dic_to_words.
ids_to_dic = {v: k for k,v in dic_to_ids.items()}

In [5]:
import copy

# Encode every word of every sonnet as its integer ID, keeping the
# sonnet -> line -> word nesting.
hehe = [
    [[dic_to_ids[word] for word in line] for line in sonnet]
    for sonnet in sonnets
]
# Lines have different word counts, so the nesting is ragged. NumPy >= 1.24
# raises ValueError for ragged input unless dtype=object is given explicitly;
# older versions produced the same object array with a deprecation warning.
hehe = np.array(hehe, dtype=object)

In [6]:
# Flatten each sonnet into one sequence of word IDs (line breaks dropped),
# giving one observation sequence per sonnet.
fin = [[word_id for line in sonnet for word_id in line] for sonnet in hehe]
# Sonnets differ in total word count, so this is ragged as well; dtype=object
# is required on NumPy >= 1.24 and matches what older versions produced.
fin = np.array(fin, dtype=object)

In [7]:
from HMM_sol import *

In [9]:
# Train an HMM on the ID-encoded sonnets (unsupervised_HMM presumably comes
# from the `from HMM_sol import *` star import).
# NOTE(review): presumably 10 = number of hidden states and 200 = training
# iterations (the log below counts up to 200) — confirm against HMM_sol.
HMM10 = unsupervised_HMM(fin, 10, 200)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200


In [41]:
# Sample from the trained model.
# NOTE(review): 20 is presumably the number of emitted words (the sample
# shown below has 20 words) — confirm against HMM_sol.
emission, states = HMM10.generate_emission(20)

In [42]:
# Map the sampled word IDs back to words and join them into one string.
x = ' '.join([str(ids_to_dic[i]) for i in emission])

In [43]:
# Display the generated text.
x

'me brightness tongue to like as weigh thou long impregnable my all this graces how thee of but have as'

In [90]:
# Build meter_data: word -> syllable count. Each line of the syllable file is
# split on single spaces; the first field is the word, and the first of the
# remaining fields that consists only of digits is stored as its count.
# Lines without such a field add no entry.
meter_data = {}
with open("../data/Syllable_dictionary.txt", "r") as f:
    for entry in f:
        word, *candidates = entry.strip().split(" ")
        first_count = next((tok for tok in candidates if tok.isdigit()), None)
        if first_count is not None:
            meter_data[word] = int(first_count)

In [91]:
# Spot-check one entry of the syllable table.
meter_data["niggarding"]

3

In [None]:
# Unfinished draft cell: the loop had no body, which is a SyntaxError and
# stops "Run All". Kept as a valid placeholder; the working 14-line
# generation loop is in a later cell.
generated_sonnet = []
for _ in range(14):
    pass  # TODO: append one generated line per iteration

In [143]:
def generate_emission_sequential(syl_count, A, O):
    """Generate one line of word IDs with exactly `syl_count` syllables.

    Starting from a uniformly random state, repeatedly (1) samples a word ID
    from the current state's emission row `O[y_i]`, re-sampling while the word
    would push the syllable total past `syl_count`, and (2) samples the next
    state from the current state's transition row `A[y_i]`. The original code
    never performed step (2), so every word came from the start state and `A`
    was only used for its length.

    Syllable counts are read from the module-level `meter_data` (word ->
    count) via `ids_to_dic` (word ID -> word).

    Args:
        syl_count: Total number of syllables the line must have.
        A: Transition probabilities; `A[i]` is the distribution over next
            states given state `i`.
        O: Emission probabilities; `O[i]` is the distribution over word IDs
            given state `i`.

    Returns:
        (emission, states): the emitted word IDs and, aligned with them, the
        state that emitted each word. Both are empty when `syl_count <= 0`.

    Raises:
        KeyError: if a sampled word is missing from `meter_data`.
    """
    emission = []
    states = []

    # Choose the starting state uniformly at random.
    y_i = int(np.random.randint(0, len(A)))

    counter = 0
    while counter < syl_count:
        n_words = len(O[y_i])
        observation_index = int(np.random.choice(n_words, p=O[y_i]))
        # Rejection-sample until the word fits in the remaining syllables.
        # NOTE: this does not terminate if the current state gives zero
        # probability to every word that still fits.
        while counter + meter_data[ids_to_dic[observation_index]] > syl_count:
            observation_index = int(np.random.choice(n_words, p=O[y_i]))

        states.append(y_i)
        emission.append(observation_index)
        counter += meter_data[ids_to_dic[observation_index]]

        # Move to the next hidden state according to the transition row.
        y_i = int(np.random.choice(len(A), p=A[y_i]))

    return emission, states

    

In [144]:
# Generate one line with a 10-syllable budget from HMM10.A and HMM10.O.
emission, states = generate_emission_sequential(10, HMM10.A, HMM10.O)

In [145]:
# Map the generated word IDs back to words and join them into one string.
x = ' '.join([str(ids_to_dic[i]) for i in emission])

In [146]:
# Display the generated line.
x

'elsewhere nights by was but grace to die of'

In [147]:
# Print 14 lines; each line is generated by its own call with a
# 10-syllable budget, then converted from word IDs to text.
for _ in range(14):
    emission, states = generate_emission_sequential(10, HMM10.A, HMM10.O)
    x = ' '.join([str(ids_to_dic[i]) for i in emission])
    print(x)

of numbers thine return idly lives fame
will nothing yet not there brief live praise thee
soul had was seek saw know strangely if live
him you thee me a true if shun so keeps
then chance be better faint am no deem toil
raised hide admitted i not vacant be
spend farther stay not predict think shows have
gain face blush sight thee the beauty debate
breast other will summer part view slight child
best unseeing love this miles hate the those
to and with past give nor of of was with
most buried all other which that through and
closet die might spurring yet i dull it
and weak buried though forsake therein life


In [148]:
def save_hmm(hmm, filename):
    """Pickle `hmm` to the file "models/<filename>".

    The target directory is created first if it does not exist, so the call
    no longer fails with FileNotFoundError in a fresh checkout.

    Args:
        hmm: The object to pickle.
        filename: File name (relative to the "models/" directory).
    """
    import os  # local import: keeps this cell self-contained

    path = "models/" + filename
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(hmm, f)
# Persist the trained HMM10 model under models/.
save_hmm(HMM10, "hmm_naive_10_sonnets.pkl")