In [58]:
from nltk.corpus import gutenberg
from nltk.corpus import cmudict
from collections import defaultdict
import string
import re
import random

In [22]:
# Word tokens of Moby Dick, skipping the first 4712 tokens.
# NOTE(review): the offset presumably skips the front matter before the story starts - confirm.
sample_text = gutenberg.words('melville-moby_dick.txt')[4712:]

In [13]:
# CMU Pronouncing Dictionary: maps a word to a list of pronunciations,
# each pronunciation being a list of phonemes such as ['W', 'EY1', 'L'].
cmu_dict = cmudict.dict()

# Cleaning up the text

- Turn to lower case
- remove punctuation

In [50]:
def cleanup_text(words):
    """
    Normalise a sequence of word tokens.

    Every character in string.punctuation is deleted from each token, tokens that
    end up empty are dropped, and the remaining tokens are lower-cased.
    Returns a new list; the input sequence is not modified.
    """
    # the third argument of str.maketrans lists the characters to delete
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    for token in words:
        stripped = token.translate(strip_punctuation)
        # tokens that consisted of punctuation only are empty now: skip them
        if stripped != '':
            cleaned.append(stripped.lower())

    return cleaned

# Creating the markov chain (forward and backward looking)

In [28]:
def get_markov(words):
    """
    Build first-order markov chains over a sequence of words.

    Returns a tuple (markov_forward, markov_backward) of defaultdict(list):
    - markov_forward[w] lists every word that directly follows an occurrence of w
    - markov_backward[w] lists every word that directly precedes an occurrence of w
    A neighbour is listed once per occurrence, so a uniform random pick from a list
    follows the frequencies observed in the text.
    """
    markov_forward = defaultdict(list)
    markov_backward = defaultdict(list)

    # walk over all adjacent pairs once and record each pair in both directions
    for previous_word, next_word in zip(words[:-1], words[1:]):
        markov_forward[previous_word].append(next_word)
        markov_backward[next_word].append(previous_word)

    return markov_forward, markov_backward

# Get the stressed vowels of a word

In [56]:
def get_stress_pattern(word):
    """
    Return the stress pattern of a word as a string of the digits 0, 1 and 2, e.g. '2010'.

    Only the first pronunciation listed in the cmudict is used; one digit is emitted
    for every phoneme that ends in a stress digit.
    Returns None if the word is not in the cmudict.
    """
    # word was not found
    if word not in cmu_dict:
        return None

    # get the first pronunciation of the word
    phonemes = cmu_dict[word][0]
    # keep the trailing stress digit of every phoneme that carries one
    return ''.join(p[-1] for p in phonemes if p[-1] in '012')

# Generate a line based on a start or end word and pattern

In [141]:
def pattern_match(word_pattern, line_pattern, reverse=False):
    """
    Match a word pattern (if the vowels are stressed or not) to a target pattern for a line of poetry.

    Patterns are in the form of '[012]+', e.g. '0102'.
    - '0': vowel is not stressed
    - '1': vowel has primary stress
    - '2': vowel has secondary stress. Vowels with 2 in either the word pattern or line pattern will match any
           vowel in the other pattern, e.g. '012' matches all of '010', '011' and '012'.
    See http://en.wikipedia.org/wiki/Arpabet or https://www.nltk.org/book/ch02.html, chapter 4.2 A Pronouncing Dictionary,
    based on the nltk.corpus.cmudict CMU Pronouncing Dictionary for US English.

    The word pattern is compared with the start of the line pattern, or with its end if reverse is True.
    A word pattern that is longer than the line pattern never matches, because the word would not fit
    into the line.
    Returns True if the word matches, False otherwise.
    """
    # zip() below stops at the shorter pattern, so a word that is too long has to be rejected explicitly
    if len(word_pattern) > len(line_pattern):
        return False

    if reverse:
        # compare from the end of both patterns
        word_pattern = word_pattern[::-1]
        line_pattern = line_pattern[::-1]

    return all(
        w == '2' or l == '2' or w == l
        for w, l in zip(word_pattern, line_pattern)
    )

In [146]:
def generate_poetry_line(seed, pattern, reverse=False):
    """
    Generate a list of words whose stress patterns together consume `pattern`, starting from `seed`.

    - seed: word whose neighbours are tried first. Neighbours are read from the global
      markov_backward if reverse is True, otherwise from the global markov_forward.
      The seed itself is not part of the result.
    - pattern: target stress pattern in the form '[012]+', see pattern_match.
    - reverse: if True the pattern is consumed from its end and each word found is placed after
      the words found for the rest of the pattern; otherwise the pattern is consumed from its
      start and each word found is placed before them.

    Candidate words are tried in random order; if a candidate leads to a dead end the next one is tried.
    Returns the list of words, or None if no chain of words matches the pattern.
    """
    # look up list of following words; .get avoids adding an empty entry for an
    # unknown seed to the chain, which is a defaultdict
    chain = markov_backward if reverse else markov_forward
    f_words = chain.get(seed, [])

    # order the markov words in random order
    for fw in random.sample(f_words, len(f_words)):
        # get the pattern of the word
        next_word_pattern = get_stress_pattern(fw)
        # skip words without a usable stress pattern and words that do not fit the line
        if not next_word_pattern or not pattern_match(next_word_pattern, pattern, reverse):
            continue

        # cut the part of the pattern that this word consumes
        if reverse:
            remaining_pattern = pattern[:-len(next_word_pattern)]
        else:
            remaining_pattern = pattern[len(next_word_pattern):]

        # if no more pattern to consume, return the word we found
        if remaining_pattern == '':
            return [fw]

        remaining_phrase = generate_poetry_line(fw, remaining_pattern, reverse)
        if remaining_phrase:
            return remaining_phrase + [fw] if reverse else [fw] + remaining_phrase
        # dead end: fall through and try the next candidate

    # we didn't find a chain that matches the pattern
    return None

# Generate words that rhyme

- Go through each word, select the last vowel and any following consonants
- Add to a rhyme dictionary

Any words in the same rhyme class will also automatically have the same pattern!

In [None]:
def get_word_rhyme(word):
    """
    Return the rhyme of a word: its last vowel and any following consonants.

    The result is a tuple of cmudict phonemes (a tuple so that it can be used as a
    dictionary key), e.g. ('EY1', 'L') for 'whale'. Only the first pronunciation
    listed in the cmudict is used.
    Returns None if the word is not in the cmudict or its pronunciation has no vowel.
    """
    # look up the word in the cmudict
    if word not in cmu_dict:
        return None

    # get the first pronunciation of the word
    cmu_word = cmu_dict[word][0]

    # find the last vowel: vowels are the phonemes that end in a stress digit
    for i in range(len(cmu_word) - 1, -1, -1):
        if cmu_word[i][-1] in '012':
            return tuple(cmu_word[i:])

    # pronunciation without any vowel
    return None

In [None]:
def create_rhyme_dict(words):
    """
    Group words by their rhyme.

    Returns a defaultdict(list) that maps each rhyme, as returned by get_word_rhyme, to the
    list of distinct words with that rhyme, in order of first appearance in `words`.
    Words for which get_word_rhyme returns None are left out.
    """
    rhyme_dict = defaultdict(list)

    # dict.fromkeys drops repeated words but keeps the order of first appearance
    for word in dict.fromkeys(words):
        rhyme = get_word_rhyme(word)
        if rhyme is not None:
            rhyme_dict[rhyme].append(word)

    return rhyme_dict


# Generate multiple lines based on line patterns and lines rhyming with each other

- Pick a start word based on:
    - the pattern to match (e.g. pick a word with pattern '010' if the line pattern ends in '010')
    - the number of rhymes required (e.g. if lines 1 and 3 need to rhyme, pick a class of words that rhyme with each other)
    

# Test the process

In [52]:
# Normalise the sample text and build both chains. markov_forward and markov_backward
# are module-level names that generate_poetry_line reads as globals.
clean_text = cleanup_text(sample_text)
markov_forward, markov_backward = get_markov(clean_text)

In [57]:
# Sanity check: for a few words print all cmudict pronunciations and the derived stress pattern.
# cmu_dict[tw] raises a KeyError for a word that is not in the cmudict.
test_words = ['ishmael', 'whale', 'ship', 'condescending']
for tw in test_words:
    print(cmu_dict[tw])
    print(get_stress_pattern(tw))

[['IH1', 'SH', 'M', 'IY0', 'L'], ['IH1', 'SH', 'M', 'EY0', 'L']]
10
[['W', 'EY1', 'L'], ['HH', 'W', 'EY1', 'L']]
1
[['SH', 'IH1', 'P']]
1
[['K', 'AA2', 'N', 'D', 'IH0', 'S', 'EH1', 'N', 'D', 'IH0', 'NG']]
2010


In [165]:
# Uncomment the next line to seed the random generator and make the run repeatable.
#random.seed(2021)
# Pick a random word of the cleaned text as the seed and generate a line for a pattern of
# ten vowels alternating between primary stress and no stress, consuming the pattern from its end.
seed = random.choice(clean_text)
generate_poetry_line(seed, '1010101010', reverse=True)

['yes', 'a', 'hater', 'some', 'the', 'hinted', 'at', 'the']