# Data preprocessing

How will you tokenize the data set? What will consist of a singular sequence, a poem, a stanza, or a line? Do you keep some words tokenized as bigrams? Do you split hyphenated words? How will you handle punctuation? 

Tokenize words in sonnet. Give HMM each line to train on.
Keep apostrophes and hyphens in a given word and remove other punctuation

Map words to a unique ID in a dictionary. Then map unique ID to number of syllables another dictionary.

In [1]:
import itertools
import nltk
from nltk.corpus import cmudict
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\alyci\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [2]:
def get_sonnets(filename):
    """ Returns list of separate sonnets in the file. 
    Each line of a sonnet is a list of words (tokens). """
    with open(filename) as file:
        sons, son = [], []
        for line in file:
            line = line.strip()
            if line.isdigit() == False:
                lst = line.lower().translate(str.maketrans('', '', ',:;.?!()')).split()
                if len(lst) > 0: # count words in line
                    son.append(lst)
                elif len(son) > 0:
                    sons.append(son)
                    son = []
    sons.append(son) # add the final sonnet
    return sons

In [3]:
def count_syl(word):
    """ Returns number of syllables in a word. Taken
    from StackOverflow. """
    d = cmudict.dict()
    if word in d:
        return max([len([y for y in x if y[-1].isdigit()]) for x in d[word.lower()]])
    
    count = 0
    vowels = 'aeiouy'
    
    if word[0] in vowels:
        count +=1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count +=1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count+=1
    if count == 0:
        count +=1
    return count

def get_syl(dic_to_ids):
    """ Returns dictionary of IDs to syllables given 
    a dictionary mapping words to ids. """
    dic = {}
    for word in dic_to_ids.keys():
        id_ = dic_to_ids[word]
        if "'" in word:
            dic[id_] = 1
        else:
            dic[id_] = sum(count_syl(w) for w in word.split("-"))
        
    return dic

In [4]:
def get_id(lines):
    """ Returns dictionary of words to IDs and 
    dictionary of IDs to words. """
    dic_to_ids, dic_to_words = {}, {}
    id_ = 0
    for line in lines:
        for word in line:
            if word not in dic_to_ids:
                dic_to_ids[word] = id_
                dic_to_words[id_] = word
                id_ += 1
    return dic_to_ids, dic_to_words

In [5]:
# Main function, which separates quatrains and couplets
# Set reverse to true to receive data with each line backwards
def get_mappings(filename):
    """ Returns dictionary of words to IDs, dictionary of IDs to words, 
    and dictionary of IDs to syllables. """
    sonnets = get_sonnets(filename)
    lines = list(itertools.chain.from_iterable(sonnets)) # list of lines
    dic_to_ids, dic_to_words = get_id(lines)
    dic_syl = get_syl(dic_to_ids)
    
    return dic_to_ids, dic_to_words, dic_syl

In [6]:
file = 'data/shakespeare.txt'
dic_to_ids, dic_to_words, dic_syl = get_mappings(file)

KeyboardInterrupt: 