In [1]:
import numpy as np
import matplotlib.pyplot as plt
import itertools
import nltk
from nltk.corpus import cmudict
nltk.download('cmudict')
import pickle

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/kushaltirumala/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [3]:
with open("../data/shakespeare.txt", "r") as f:
    data = f.read()

In [4]:
def get_sonnets(filename):
    """ Returns list of separate sonnets in the file. 
    Each line of a sonnet is a list of words (tokens). """
    with open(filename) as file:
        sons, son = [], []
        for line in file:
            line = line.strip()
            if line.isdigit() == False:
                lst = line.lower().translate(str.maketrans('', '', ':;,.?!()')).split()
                if len(lst) > 0: # count words in line
                    son.append(lst)
                elif len(son) > 0:
                    sons.append(son)
                    son = []
    sons.append(son) # add the final sonnet
    return sons
file = '../data/shakespeare.txt'
sonnets = get_sonnets(file)

# The sonnets 99 and 126 (at indices 98 and 125)
# do not count as sonnets, since they are not 14 lines long
for i in range(len(sonnets)):
    if len(sonnets[i]) != 14:
        print(i)
        
sonnets = sonnets[:98] + sonnets[99:125] + sonnets[126:]

d = cmudict.dict()
def count_syl(word):
    """ Returns number of syllables in a word. Taken
    from StackOverflow. """
    if word in d:
        return len(list(y for y in d[word][0] if y[-1].isdigit()))
    count = 0
    vowels = 'aeiouy'
    
    if word[0] in vowels:
        count +=1
    for index in range(1,len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count+=1
    if count == 0:
        count +=1
    return count

def get_syl(dic_to_ids):
    """ Returns dictionary of IDs to syllables given 
    a dictionary mapping words to ids. """
    dic = {}
    for word in dic_to_ids.keys():
        id_ = dic_to_ids[word]
        if "'" in word:
            dic[id_] = 1
        else:
            dic[id_] = sum(count_syl(w) for w in word.split("-"))
        
    return dic

def get_id(lines):
    """ Returns dictionary of words to IDs and 
    dictionary of IDs to words. """
    dic_to_ids, dic_to_words = {}, {}
    id_ = 0
    for line in lines:
        for word in line:
            if word not in dic_to_ids:
                dic_to_ids[word] = id_
                dic_to_words[id_] = word
                id_ += 1
    return dic_to_ids, dic_to_words

def get_mappings(sonnets, name):
    """ Returns dictionary of words to IDs, dictionary of IDs to words, 
    and dictionary of IDs to syllables. """    
    lines = list(itertools.chain.from_iterable(sonnets)) # list of lines
    dic_to_ids, dic_to_words = get_id(lines)
    dic_syl = get_syl(dic_to_ids)
    
    with open(name, 'wb') as f:
        pickle.dump((dic_to_ids, dic_to_words, dic_syl), f)
    
    return dic_to_ids, dic_to_words, dic_syl

dic_to_ids, dic_to_words, dic_syl = get_mappings(sonnets, 'shakespeare_dics.pkl')

98
125


In [5]:
ids_to_dic = {v: k for k,v in dic_to_ids.items()}

In [6]:
import copy

hehe = copy.deepcopy(sonnets)
for i in range(len(sonnets)):
    for j in range(len(sonnets[i])):
        for k in range(len(sonnets[i][j])):
            hehe[i][j][k] = dic_to_ids[sonnets[i][j][k]]
hehe = np.array(hehe)

In [11]:
fin = []
for sonnet in hehe:
    for line in sonnet:
        fin.append(line)
fin = np.array(fin)
fin.shape

(2128,)

In [12]:
from HMM_sol import *
HMM10 = unsupervised_HMM(fin, 10, 200)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200


In [61]:
emission, states = HMM10.generate_emission(10)

In [62]:
x = ' '.join([str(ids_to_dic[i]) for i in emission])

In [63]:
x

'bloody an stay increase purpose dear most as the receipt'

In [65]:
HMM50 = unsupervised_HMM(fin, 20, 200)

Iteration: 10
Iteration: 20
Iteration: 30
Iteration: 40
Iteration: 50
Iteration: 60
Iteration: 70
Iteration: 80
Iteration: 90
Iteration: 100
Iteration: 110
Iteration: 120
Iteration: 130
Iteration: 140
Iteration: 150
Iteration: 160
Iteration: 170
Iteration: 180
Iteration: 190
Iteration: 200


In [78]:
emission, states = HMM50.generate_emission(10)

In [79]:
x = ' '.join([str(ids_to_dic[i]) for i in emission])

In [80]:
x

'active to him white and vow unstained thine that then'

In [82]:
def generate_emission_sequential(syl_count, A, O):
    emission = []
    states = []

    # choose starting state
    y_i = np.random.randint(0, len(A))
    states.append(y_i)

    counter = 0
    while counter < syl_count:
        y_i = int(y_i)
        # print(len(self.O[y_i]))
        array = range(len(O[y_i]))
        observation_index = np.random.choice(array, p=O[y_i])
        while counter + meter_data[ids_to_dic[observation_index]] > syl_count:
#             print("uhoh")
            observation_index = np.random.choice(array, p=O[y_i])
            
        emission.append(observation_index)
        counter += meter_data[ids_to_dic[observation_index]]
        
    return emission, states

In [83]:
for _ in range(14):
    emission, states = generate_emission_sequential(10, HMM50.A, HMM50.O)
    x = ' '.join([str(ids_to_dic[i]) for i in emission])
    print(x)

NameError: name 'meter_data' is not defined