### References
- http://mtosmt.org/issues/mto.17.23.4/mto.17.23.4.komaniecki.html
  - Rhyming groups are set in similar metrical locations.
  - Rhyming groups are set to similar rhythmic figures.
  - Rhyming groups are emphasized or articulated in similar ways.
- https://pdfs.semanticscholar.org/8b66/ea2b1fdc0d7df782545886930ddac0daa1de.pdf

# Mapping Flow

In [1]:
!ls lyrics_consolidated/ | grep Eminem

Eminem-and-dj-buttafingaz.mpk
Eminem-ft-logic-joyner-lucas-nitin-randhawa-remix.mpk
Eminem.mpk
Eminem-x-proof.mpk


In [2]:
import msgpack
from random import choice
from pprint import pprint as pp

lyric = ''
with open('lyrics_consolidated/Pharoahe-monch.mpk', 'rb') as lyric:
    corp = msgpack.unpack(lyric, encoding='utf-8')
    lyric = corp['Pharoahe-monch-simon-says-lyrics']['lyrics']

In [3]:
def get_phones(text, stress=True):
    if isinstance(text, str):
        text = [text]
    phoned = []
    for word in text:
        phones = pronouncing.phones_for_word(word)        
        # lots of words in genius are listed as being
        # pronounced as 'in' instead of the formal
        # 'ing' spelling that is in the cmu data
        # and needs modification to correct
        if not phones and re.search('in\'?$', word):
            subbed = re.sub(r'in\'?', 'ing', word)
            with_g = pronouncing.phones_for_word(subbed)
            if with_g:
                # convert the 'ng' phenomes to 'n'
                without_g = re.sub(r'(?<=IH\d) (NG)$', ' N', with_g[0])
                phones = without_g
        # some words start with an apostraphy and might not
        # be listed in the cmu as such
        elif not phones and re.search("^['`‘]", word):
            without_apo = pronouncing.phones_for_word(word[1:])
            if without_apo:
                phones = without_apo[0]
        # we don't need nested lists
        elif phones:
            phones = phones[0]
        # fallback to just the word if not found in cmu
        else:
            phones = word
        # the numbers after a phenome are useful for determining
        # stresses and syllables within words, but aren't that
        # useful for comparing sounds themselves (rhymes)
        if not stress and phones:
            phones = re.sub('\d*', '', phones)
        phoned.append(phones)
    return phoned

In [4]:
import re
from colorama import Fore, Style
import pronouncing

def red_if_not_found(word):
    # color words not found in the cmu as red for further review
    # of pre-processing steps
    if get_phones(word)[0] == word:
        return Fore.RED + word + Fore.BLACK
    return word

def process_line(line):
    words = []
    # remove adlibs
    line = re.sub('\(.+?\)', '', line)
    # split words delimited by either spaces or commas
    for word in re.split('[ ,]', line):
        # strip out lots of characters we don't want for our
        # word analysis, but keep apostrophies
        stripped = re.sub(r"(^'|'$|[;\?\!\n \t\"\:]|\.+|\…)+", '', word)
        # convert a hyphenated word into multiple words
        words += re.split(r"[-–—]", stripped)
    no_blanks = list(filter(None, words))
    return no_blanks

def process_lyrics(lyrics, colorizer=None):
    processed = []
    for line in lyrics:
        proc_line = []
        # filter out song block headers that can span multiple lines
        opened = re.search('^\[', line)
        closed = re.search('\]$', line)
        if opened:
            bracket_open = True
        if bracket_open:
            if closed:
                bracket_open = False
            continue
        for word in process_line(line):
            if colorizer:
                word = colorizer(word)
            proc_line.append(word)
        if proc_line:
            processed.append(proc_line)
    return processed

In [5]:
processed = process_lyrics(lyric, colorizer=red_if_not_found)
for line in processed:
    print(' '.join(line))

Uh uh uh uh uh uh
Uh uh uh uh uh
Uh uh uh uh uh uh
Uh uh uh uh [31mahhh[30m
Get the fuck up
Simon says Get the fuck up
Throw your hands in the sky
Queens is in the back sipping [31mgnac[30m y'all what's up
Girls rub on your [31mtitties[30m
Yeah I said it rub on your [31mtitties[30m
New York City gritty committee pity the fool
That act shitty in the midst of the calm the witty
Y'all know the name
[31mPharoahe[30m fuckin [31mMonch[30m ain't a damn thing changed
You all up in ya Range and shit inebriated
Strayed from your original plan you deviated
I alleviated the pain with long term goals
Took my underground loot without the gold
You sold platinum round the world I sold wood in the hood
But when I'm in the street and shit it's all good
I'm soon to motivate a room control the game like Tomb Raider
Rock clock dollars flip tips like a waiter
Block shots [31mstyle's[30m greater let my lyrics anoint
If you holding up the wall then you missin the point
Get the fuck up
Simon says

In [6]:
def syllabize(line):
    bricks = []
    for word in process_line(line):
        phones = get_phones(word, stress=True)[0]
        count = pronouncing.syllable_count(phones)
        bricks.append((word, count, phones))
    return bricks

def pad_brick(brick):
    padding = max([len(str(b)) for b in brick])
    pad = lambda x: str(x).ljust(padding, ' ')
    brick = [pad(b) for b in brick]
    return brick

In [7]:
from collections import defaultdict
rhymes = defaultdict(list)
processed = process_lyrics(lyric)
found = []
for i, line in enumerate(processed):
    # for the first line we compare it to the next line
    if i == 0:
        lines = processed[:2]
    # for the last line we compare it to the previous
    elif i == len(processed) - 1:
        lines = processed[-2:-1]
    # for all other lines we compare to the previous
    # and the next line
    else:
        lines = processed[i-1:i+2]
    phone_lines = [get_phones(l, stress=True) for l in lines]
    filtered_lines = []
    bank = defaultdict(int)
    for pl in phone_lines:
        rp = pronouncing.rhyming_part
        try:
            rhyming_phones = [rp(word) for word in pl]
        except BaseException as e:
            print(word, type(word), e)
        # remove middle constonants which don't have
        # much to do with loose rhyming
        nix_const = lambda x: re.sub('( \w |\d)+', ' ', x).strip().replace('  ', ' ')
        filtered = list(map(nix_const, rhyming_phones))
        filtered_lines.append(filtered)
        for f in filtered:
            bank[f] += 1    
    found_rhymes = []
    for pl in filtered_lines:
        found_rhymes.append([bank[w] > 1 and ' ' in w for w in pl])
    if i == 0 or i == len(processed) - 1:
        found.append(found_rhymes[0])
    else:
        found.append(found_rhymes[1])
finished = [list(zip(found[i], processed[i])) for i in range(len(found))]
finished[5]

TypeError: 'int' object is not iterable

In [None]:
from IPython.core.display import display, HTML
buff = ''
for line in finished:
    buff += '<br>'
    for rhymes, word in line:
        if rhymes:
            buff += '<b>{} </b>'.format(word)
        else:
            buff += word + ' '
    buff += '</br>'
display(HTML(buff))

In [None]:
for line in lyric:
    padded = [pad_brick(b) for b in syllabize(line)]
    transposed = list(map(list, zip(*padded)))
    print('\n'.join([' '.join(t) for t in transposed]))
    print()

In [None]:
import g2p_en as g2p
