In [66]:
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize

import os
import re

In [60]:
# Read the whole corpus file. The context manager closes the file handle as
# soon as the read finishes (the bare open(...).read() left it open).
corpus_path = os.path.join(os.getcwd(), 'data/shakespeare.txt')
with open(corpus_path) as corpus_file:
    raw_text = corpus_file.read()

# remove sonnet numbers and convert to lowercase, then split into lines.
# `text` is the list of lines that the tokenizing cell iterates over.
text = re.sub(r'[0-9]+', '', raw_text).lower().split('\n')

In [6]:
# A single hand-copied sonnet, one string per line, used as a small sample
# input for the tokenizing cell.
sonnet = [
    'O thou, my lovely boy, who in thy power',
    "Dost hold Time's fickle glass, his sickle, hour;",
    'Who hast by waning grown, and therein showst',
    'Thy lovers withering as thy sweet self growst;',
    'If Nature, sovereign mistress over wrack,',
    'As thou goest onwards, still will pluck thee back,',
    'She keeps thee to this purpose, that her skill',
    'May time disgrace and wretched minutes kill.',
    'Yet fear her, O thou minion of her pleasure!',
    'She may detain, but not still keep, her treasure:',
    'Her audit, though delayed, answered must be,',
    'And her quietus is to render thee.',
]

In [40]:
# Tokenize every line of the sample sonnet.
tokens = [wordpunct_tokenize(line) for line in sonnet]
# Tokens that are dropped before picking the line-ending word.
punct = {'.', ',', '!', ':', ';'}
filtered = [
    [tok for tok in line_tokens if tok not in punct]
    for line_tokens in tokens
]
# Final remaining token of each line.
last = [line_tokens[-1] for line_tokens in filtered]

In [62]:
# tokenize the data and filter out punctuation
tokens2 = [wordpunct_tokenize(line) for line in text]
# Kept defined for any cell that still refers to it; it is no longer used for
# the filtering below.
punct = set(['.', ',', '!', ':', ';'])
# Keep only tokens containing at least one letter or digit. wordpunct_tokenize
# emits each run of punctuation as a single token (e.g. "?" or ",'"), so
# matching against a fixed set of single characters let tokens such as "?",
# "(", ")", "-" and "'" through, and they could be picked as a line's
# "last word".
filtered2 = [
    [tok for tok in line_tokens if any(ch.isalnum() for ch in tok)]
    for line_tokens in tokens2
]
# Drop lines with no word tokens left (e.g. blank lines).
filtered2 = [line_tokens for line_tokens in filtered2 if line_tokens]

# get last word in each line
last2 = [line_tokens[-1] for line_tokens in filtered2]

In [None]:
# get syllables and word pronunciations
#
# Index the CMU dictionary once (word -> list of pronunciations, in dictionary
# order) instead of re-reading every entry for each line-ending word.
cmu_pronunciations = {}
for entry_word, phones in cmudict.entries():
    cmu_pronunciations.setdefault(entry_word, []).append(phones)

# One inner list per word in last2, holding a (word, len(p), p) tuple for each
# pronunciation found; the inner list is empty when the word is not in the
# dictionary.
# NOTE(review): len(p) is the length of the pronunciation list, which looks
# like a phoneme count rather than a true syllable count -- confirm before
# relying on it as "syllables".
syllables = [
    [(word, len(p), p) for p in cmu_pronunciations.get(word, [])]
    for word in last2
]

In [None]:
def rhymes(s, entries=None, is_word=None):
    '''
    Function that determines rhyming words by truncating the first sound
    and comparing the remaining sounds.

    Input:
    s: a list of tuples (w, len(p), p) where w is a word, len(p) is the length
    of its pronunciation and p is the pronunciation (a sequence of sounds).
    Only the first tuple, s[0], is used.
    entries: optional iterable of (word, pronunciation) pairs to search.
    Defaults to cmudict.entries(). Passing it in lets a caller load the
    dictionary once and reuse it across many calls.
    is_word: optional predicate taking a candidate word and returning True if
    it should be kept. Defaults to "the word has at least one WordNet synset".

    Output:
    filtered: a list of words whose pronunciation has the same length as p and
    is identical to p after the first sound. If no candidate qualifies,
    returns [w]. If s is empty, returns [].
    '''
    # No pronunciation was found for the word: nothing to compare against.
    if not s:
        return []

    (w, l, p) = s[0]

    if entries is None:
        entries = cmudict.entries()
    if is_word is None:
        # Imported here so the function does not depend on the notebook's
        # import cell, which only brings in cmudict.
        from nltk.corpus import wordnet

        def is_word(candidate):
            return len(wordnet.synsets(candidate)) > 0

    # The cheap length / sound comparisons run first so the is_word check is
    # only evaluated for candidates that already rhyme.
    filtered = [wt for (wt, pt) in entries
                if l == len(pt) and p[1:] == pt[1:] and is_word(wt)]

    # Fall back to the word itself when nothing qualified.
    return filtered if filtered else [w]

In [None]:
# get rhymes: apply rhymes() to every entry of `syllables`; as the cell's last
# expression, the resulting list is displayed.
list(map(rhymes, syllables))