### References

#### General notes and inspiration
- [matching rhymes from hamilton](http://graphics.wsj.com/hamilton-methodology/)
    - [details](https://journalism.stanford.edu/cj2016/files/Writing%20an%20Algorithm%20To%20Analyze%20and%20Visualize%20Lyrics%20From%20the%20Musical%20Hamilton.pdf)
- [general notes on rhyming](http://mtosmt.org/issues/mto.17.23.4/mto.17.23.4.komaniecki.html)
  - Rhyming groups are set in similar metrical locations.
  - Rhyming groups are set to similar rhythmic figures.
  - Rhyming groups are emphasized or articulated in similar ways.
  
#### Finding Rhymes
- [applying BLOSUM to phoneme combinations](https://pdfs.semanticscholar.org/8b66/ea2b1fdc0d7df782545886930ddac0daa1de.pdf)
- [converting phonemes to syllables](http://www.anthology.aclweb.org/N/N09/N09-1035.pdf)
- [phonetic similarity metrics](https://homes.cs.washington.edu/~bhixon/papers/phonemic_similarity_metrics_Interspeech_2011.pdf)

#### Substitutions
- [constonant misidentification](http://www.ebire.org/hcnlab/papers/WoodsJASA2010.pdf)
- [ARPABET to IPA](https://www.wikiwand.com/en/ARPABET)
- [B-Rhymes](http://www.b-rhymes.com/faq/)

#### Syllables
- [general ideas on syllables](http://sitr.us/2007/09/24/anatomy-of-a-syllable.html)
- [rules for syllabication](http://alias-i.com/lingpipe/demos/tutorial/hyphenation/read-me.html)
- [align graphenes to phonemes](http://www.aclweb.org/anthology/P10-1080)
    - [code](https://github.com/letter-to-phoneme/m2m-aligner)

# Mapping Flow

In [1]:
!ls lyrics_consolidated/ | grep Eminem

Eminem-and-dj-buttafingaz.mpk
Eminem-ft-logic-joyner-lucas-nitin-randhawa-remix.mpk
Eminem.mpk
Eminem-x-proof.mpk


In [2]:
import pixiedust
import json
from os import path as osp
missing_words_file = 'words-missing-from-cmu.json'
if osp.isfile(missing_words_file):
    with open(missing_words_file, 'r') as wf:
        WORDS_MISSING_FROM_CMU = json.load(wf)
else:
    WORDS_MISSING_FROM_CMU = {}
WORDS_MISSING_FROM_CMU

Pixiedust database opened successfully


{'ahhh': 'AE1 HH',
 'gnac': 'G AE1 N AH0 K',
 'titties': 'T IH1 T IY0 Z',
 'Pharoahe': 'F AA0 R OW1',
 'Monch': 'M AA1 N CH',
 "style's": 'S T AY1 L Z',
 'Girlies': 'G IH1 R L IY0 Z',
 'BMs': 'B AE1 M Z',
 '12pm': 'T W EH1 L V EH2 P AH0 M',
 'eses': 'EH1 S AH0 Z',
 'Rollies': 'R AA1 L IY0 Z',
 'Jeru': 'JH EH1 R UW0'}

In [3]:
import msgpack
from random import choice
from pprint import pprint as pp

lyric = ''
with open('lyrics_consolidated/Pharoahe-monch.mpk', 'rb') as lyric:
    corp = msgpack.unpack(lyric, encoding='utf-8')
    lyric = corp['Pharoahe-monch-simon-says-lyrics']['lyrics']

In [29]:
import re

def process_line(line):
    words = []
    # remove adlibs
    line = re.sub('\(.+?\)', '', line)
    # split words delimited by either spaces or commas
    for word in re.split('[ ,]', line):
        # strip out lots of characters we don't want for our
        # word analysis, but keep apostrophies
        stripped = re.sub(r"(^'|'$|[;\?\!\n \t\"\:]|\.+|\…)+", '', word)
        # convert a hyphenated word into multiple words
        words += re.split(r"[-–—]", stripped)
    no_blanks = list(filter(None, words))
    return no_blanks

def clean_headers(lyrics):
    processed = []
    for line in lyrics:
        # filter out song block headers that can span multiple lines
        opened = re.search('^\[', line)
        closed = re.search('\]$', line)
        if opened:
            bracket_open = True
        if bracket_open:
            if closed:
                bracket_open = False
            continue
        line = process_line(line)
        if line:
            processed.append(line)
    return processed

In [30]:
import g2p_en as g2p

def get_phones(text, stress=True):
    phoned = []
    with g2p.Session():
        for line in text:
            phoned_line = []
            for word in line:
                phones = pronouncing.phones_for_word(word)        
                # lots of words in genius are listed as being
                # pronounced as 'in' instead of the formal
                # 'ing' spelling that is in the cmu data
                # and needs modification to correct
                if not phones and re.search('in\'?$', word):
                    subbed = re.sub(r'in\'?', 'ing', word)
                    with_g = pronouncing.phones_for_word(subbed)
                    if with_g:
                        # convert the 'ng' phenomes to 'n'
                        without_g = re.sub(r'(?<=IH\d) (NG)$', ' N', with_g[0])
                        phones = without_g
                # some words start with an apostraphy and might not
                # be listed in the cmu as such
                elif not phones and re.search("^['`‘]", word):
                    without_apo = pronouncing.phones_for_word(word[1:])
                    if without_apo:
                        phones = without_apo[0]
                # fallback to use slower g2p
                elif not phones:
                    phones_cached = WORDS_MISSING_FROM_CMU.get(word)
                    if not phones_cached:
                        phones = ' '.join(g2p.g2p(word))
                        WORDS_MISSING_FROM_CMU[word] = phones
                    else:
                        phones = phones_cached
                # we don't need nested lists
                else:
                    phones = phones[0]
                # the numbers after a phenome are useful for determining
                # stresses and syllables within words, but aren't that
                # useful for comparing sounds themselves (rhymes)
                if not stress and phones:
                    phones = re.sub('\d*', '', phones)
                phoned_line.append(phones)
            phoned.append(phoned_line)
    with open(missing_words_file, 'w') as wf:
        json.dump(WORDS_MISSING_FROM_CMU, wf)
    return phoned

In [31]:
cleaned = clean_headers(lyric)
processed = get_phones(cleaned)
len(cleaned), len(processed)

INFO:tensorflow:Restoring parameters from /home/hank/anaconda3/lib/python3.6/site-packages/g2p_en/logdir/model_epoch_14_gs_27956


(60, 60)

In [60]:
from collections import defaultdict
import pronouncing

def match(graphemes, phonemes):
    found = []
    for i, line in enumerate(graphemes):
        ## Build rolling window of 2-3 lines
        # for the first line we compare it to the next line
        if i == 0:
            i_left, i_right = 0, 2
        # for the last line we compare it to the previous
        elif i == len(graphemes) - 1:
            i_left, i_right = -2, -1
        # for all other lines we compare to the previous
        # and the next line
        else:
            i_left, i_right = i-1, i+2
        ## Process phone groups before matching
        filtered_lines = []
        bank = defaultdict(int)
        for phone_group in phonemes[i_left:i_right]:
            # grab rhyming section of word (basic match)
            rp = pronouncing.rhyming_part
            rhyming_phones = [rp(word) for word in phone_group]
            # remove middle constonants which don't have
            # much to do with loose rhyming
            nix_const = lambda x: re.sub('( \w |\d)+', ' ', x).strip().replace('  ', ' ')
            filtered = list(map(nix_const, rhyming_phones))
            filtered_lines.append(filtered)
            # add filtered rhyming parts to common bank 
            # for current rolling window state
            for f in filtered:
                bank[f] += 1 
        # match
        found_rhymes = []
        for phone_group in filtered_lines:
            # positive match if rhyming phonemes in bank
            # and also not single vowel
            match = lambda x: bank[x] > 1 and ' ' in x
            # create bitmap for words to determine if they rhyme
            found_rhymes.append([match(w) for w in phone_group])
        # if we're on the first line, or the line only has one word
        if i == 0 or len(line) == 1:
            found.append(found_rhymes[0])
        else:            
            found.append(found_rhymes[1])
    finished = [list(zip(found[i], graphemes[i])) for i in range(len(found))]
    return finished

In [61]:
matched = match(cleaned, processed)
matched

[[(False, 'Uh'),
  (False, 'uh'),
  (False, 'uh'),
  (False, 'uh'),
  (False, 'uh'),
  (False, 'uh')],
 [(False, 'Uh'), (False, 'uh'), (False, 'uh'), (False, 'uh'), (False, 'uh')],
 [(False, 'Uh'),
  (False, 'uh'),
  (False, 'uh'),
  (False, 'uh'),
  (False, 'uh'),
  (False, 'uh')],
 [(False, 'Uh'), (False, 'uh'), (False, 'uh'), (False, 'uh'), (False, 'ahhh')],
 [(True, 'Get'), (True, 'the'), (True, 'fuck'), (True, 'up')],
 [(False, 'Simon'),
  (False, 'says'),
  (True, 'Get'),
  (True, 'the'),
  (True, 'fuck'),
  (True, 'up')],
 [(False, 'Throw'),
  (False, 'your'),
  (False, 'hands'),
  (True, 'in'),
  (True, 'the'),
  (False, 'sky')],
 [(False, 'Queens'),
  (False, 'is'),
  (True, 'in'),
  (True, 'the'),
  (False, 'back'),
  (False, 'sipping'),
  (False, 'gnac'),
  (False, "y'all"),
  (False, "what's"),
  (False, 'up')],
 [(False, 'Girls'),
  (True, 'rub'),
  (True, 'on'),
  (True, 'your'),
  (True, 'titties')],
 [(False, 'Yeah'),
  (False, 'I'),
  (False, 'said'),
  (False, 'it'),


In [62]:
from IPython.core.display import display, HTML
buff = ''
dont_match = set(["the","be","to","of","and","a","in","that","have","I","it","for","not","on","with","he","as","you","do","at","this","but","his","by","from","they","we","say","her","she","or","an","will","my","one","all","world","there","their","what","so","who","if","them","yeah"])
for line in matched:
    buff += '<br>'
    for rhymes, word in line:
        if rhymes and word not in dont_match:
            buff += '<b>{} </b>'.format(word)
        else:
            buff += word + ' '
    buff += '</br>'
display(HTML(buff))

In [56]:
for line in lyric:
    padded = [pad_brick(b) for b in syllabize(line)]
    transposed = list(map(list, zip(*padded)))
    print('\n'.join([' '.join(t) for t in transposed]))
    print()

NameError: name 'syllabize' is not defined

In [14]:
import g2p_en as g2p

with g2p.Session():
    phs = [g2p.g2p(line) for line in lyric]
    

INFO:tensorflow:Restoring parameters from /home/hank/anaconda3/lib/python3.6/site-packages/g2p_en/logdir/model_epoch_14_gs_27956


TypeError: sequence item 0: expected str instance, list found

In [24]:
for line in phs:
    print(' '.join(line))

P R AH0 D UW1 S T   B AY1   F AA0 R OW1   M AA1 N CH
IH1 N T R OW0
AH1 HH UW0   ,   AH1 HH UW0   ,   AH1 HH UW0
AH1 HH UW0   ,   AH1 HH UW0   ,   AH1
AH1 HH UW0   ,   AH1 HH UW0   ,   AH1 HH UW0
AH1 HH UW0   ,   AH1 HH UW0   ,   AE1 HH
HH UH1 K
G EH1 T   DH AH0   F AH1 K   AH1 P
S AY1 M AH0 N   S EH1 Z   ,   G EH1 T   DH AH0   F AH1 K   AH1 P
TH R OW1   Y AO1 R   HH AE1 N D Z   IH0 N   DH AH0   S K AY1
B AH1 B HH UH2 HH UH1 B UW0 B UW0   !
K W IY1 N Z   IH1 Z   IH0 N   DH AH0   B AE1 K   S IH1 P IH0 NG   N AE1 K   ,   Y AO2 L   ,   W AH1 T S   AH1 P   ?
G ER1 L Z   ,   R AH1 B   AA1 N   Y AO1 R   T IH1 T IY0 Z   Y AE1   !
Y AE1   ,   AY1   S EH1 D   IH1 T   ,   R AH1 B   AA1 N   Y AO1 R   T IH1 T IY0 Z
N UW1   Y AO1 R K   S IH1 T IY0   G R IH1 T IY0   K AH0 M IH1 T IY0   P IH1 T IY0   DH AH0   F UW1 L
DH AE1 T   AE1 K T   SH IH1 T IY2   IH0 N   DH AH0   M IH1 D S T   AH1 V   DH AH0   K AA1 M   ,   DH AH0   W IH1 T IY0
V ER1 S   W AH1 N
Y AO2 L   N OW1   DH AH0   N EY1 M
F AA0 R OW1   F

until current phoneme is a vowel
    label current phoneme as an onset
end loop
until all phonemes have been labeled
    label current phoneme as a nucleus
    if there are no more vowels in the word
        label all remaining consonants as codas
    else
        onset := all consonants before next vowel
        coda := empty
        until onset is legal
            coda := coda plus first phoneme of onset
            onset := onset less first phoneme
        end loop
    end if
end loop

In [25]:
def syls(word, phones):
    remove_digits = lambda x: re.sub('\d*', '', x)
    if isinstance(phones, str):
        phones = remove_digits(phones).split(' ')
    else:
        phones = [remove_digits(p) for p in phones]
    syls = []
    tagged_phones = []
    vowels = set(["AO","AA", "IY", "UW", "EH", "IH", "UH", "AH", "AE", "EY", "AY", "OW", "AW", "OY", "ER"])
    for i, p in eunumerate(phones):
        if p not in vowels:
            tagged_phones.append((p, 'onset'))
            continue
        tagged_phones.append((p,'nucleus'))
        remaining = phones[i:]
        if not set(remaining) & vowels:
            tagged_phones += [(p, 'coda') for p in remaining]
            break
        else:
            pass
def dum_syls(phones):
    
            

In [26]:
syls('inebriated', 'IH0 N EH1 B R IY0 EY2 T AH0 D')

In [28]:
get_phones('standardize')

['S T AE1 N D ER0 D AY2 Z']