In [97]:
import nltk
from nltk.corpus import cmudict, wordnet
from nltk.tokenize import word_tokenize, wordpunct_tokenize
import pprint
import itertools
import copy
from types import *
import collections
from functools import reduce

In [98]:
# Fetch the two NLTK packages requested by this notebook: the CMU Pronouncing
# Dictionary ("cmudict") and WordNet ("wordnet"). The second call is the
# cell's last expression, so its return value is what the cell displays.
nltk.download("cmudict")
nltk.download("wordnet")

[nltk_data] Downloading package cmudict to /home/ds/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ds/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [99]:
# Pretty-printer with a 4-space indent. Not referenced by any cell shown
# here; presumably kept for ad-hoc inspection of nested lists.
pp = pprint.PrettyPrinter(indent=4)

In [100]:
# A phoneme is treated as a vowel when its first character is one of these.
VOWELS = list("AEIOU")

# Per-word cache of pronunciations (word -> list of phoneme lists);
# unknown keys default to an empty list.
CMUDICT_DICT = collections.defaultdict(list)

In [101]:
# Every distinct phoneme string found in any cmudict pronunciation whose first
# character is one of VOWELS.
VOWELS_IN_CMUDICT = {
    phoneme
    for _word, phonemes in cmudict.entries()
    for phoneme in phonemes
    if phoneme[0] in VOWELS
}

In [102]:
# Memo filled lazily by find_vowel_combinations: second character of a
# phoneme -> list of phonemes from VOWELS_IN_CMUDICT sharing that character.
VOWELS_DICT = {}

In [103]:
# Sample pair fed to compare_two_sentences: the phrase whose pronunciations
# get expanded (input1) and the phrase it is matched against (output1).
input1 = "eye mull of mush sheen"
output1 = "i'm a love machine"

In [104]:
def find_vowel_combinations(middle_letter):
    """Return the vowel phonemes whose second character is ``middle_letter``.

    The answer is computed once per letter from VOWELS_IN_CMUDICT and then
    memoised in VOWELS_DICT; repeated calls return the same cached list.
    """
    if middle_letter in VOWELS_DICT:
        return VOWELS_DICT[middle_letter]

    matching = []
    for phoneme in VOWELS_IN_CMUDICT:
        if phoneme[1] == middle_letter:
            matching.append(phoneme)
    VOWELS_DICT[middle_letter] = matching
    return VOWELS_DICT[middle_letter]

In [105]:
def tokenize_sentence(sentence):
    """Split ``sentence`` on runs of whitespace and return the pieces as a list."""
    return list(sentence.split())

In [106]:
def get_syllables(w):
    """Return every cmudict pronunciation of the word ``w``.

    Each pronunciation is a list of phoneme strings; a word missing from the
    dictionary yields an empty list. Results are cached per word in
    CMUDICT_DICT.

    The previous version scanned all of ``cmudict.entries()`` for every word
    that was not cached yet. Here the word -> pronunciations mapping from
    ``cmudict.dict()`` is built once, kept on the function object, and
    consulted for every later miss.
    """
    if w not in CMUDICT_DICT:
        lookup = getattr(get_syllables, "_cmudict_lookup", None)
        if lookup is None:
            # Building the mapping reads the whole corpus, so do it only once.
            lookup = cmudict.dict()
            get_syllables._cmudict_lookup = lookup
        # Store a copy so the cached list is not shared with the lookup table.
        CMUDICT_DICT[w] = list(lookup.get(w, ()))
    return CMUDICT_DICT[w]


def get_syllables_of_sentence(words):
    """Look up each word with get_syllables and return the results in order."""
    per_word = []
    for token in words:
        per_word.append(get_syllables(token))
    return per_word

In [107]:
def possible_pronunciations(syllables):
    """Enumerate every whole-sentence pronunciation.

    ``syllables`` holds, per word, the list of that word's alternative
    pronunciations. One alternative is picked per word (cartesian product)
    and the picked phoneme lists are concatenated into one flat list.

    Example:
        input:  [[[u'AH1', u'V'], [u'AH0', u'V']], [[u'W']]]
        picks:  ([u'AH1', u'V'], [u'W']), ([u'AH0', u'V'], [u'W'])
        output: [[u'AH1', u'V', u'W'], [u'AH0', u'V', u'W']]
    """
    flattened = []
    for picked_words in itertools.product(*syllables):
        # Join the per-word phoneme lists into a single fresh list.
        flattened.append(list(itertools.chain.from_iterable(picked_words)))
    return flattened

In [108]:
def post_processing(pronunciations):
    """Expand each pronunciation into its accepted sound-alike variants.

    For every position in a pronunciation a list of alternatives is built:
      * a sound immediately followed by an identical sound may be kept or
        dropped (deal_with_duplicates), e.g. [SH, SH] -> [[SH, ''], [SH]];
      * a sound whose first character is in VOWELS is swapped for the list
        chosen by deal_with_vowels;
      * any other sound becomes a one-element list, e.g. B -> [B].
    The cartesian product of those per-position lists yields the variants.

    Returns a new list: deep copies of the original pronunciations first,
    then every generated variant (variants may contain '' placeholders).

    The argument is no longer modified. The previous version rewrote the
    caller's inner lists in place, replacing each string with a list.
    """
    copy_of_pronunciations = copy.deepcopy(pronunciations)
    for pronunciation in pronunciations:
        # Private working copy: slots are rewritten from "sound" to
        # "list of alternative sounds" without touching the caller's list.
        alternatives = list(pronunciation)
        for sound_loc, sound in enumerate(pronunciation):
            if sound_loc + 1 < len(alternatives) \
                    and alternatives[sound_loc + 1] == sound:
                deal_with_duplicates(alternatives, sound_loc)
            # NOTE: if the slot was just turned into [sound, ''], the [0]
            # below is the whole sound string rather than its first letter,
            # so (with single-letter VOWELS) a doubled sound is not also
            # vowel-expanded. Kept as in the original logic.
            if alternatives[sound_loc][0] in VOWELS:
                deal_with_vowels(alternatives, sound_loc)
            # Untouched by both helpers: wrap so product() sees a list.
            if alternatives[sound_loc] == sound:
                alternatives[sound_loc] = [sound]
        copy_of_pronunciations += map(list, itertools.product(*alternatives))
    return copy_of_pronunciations

In [109]:
def deal_with_duplicates(pronunciation, sound_loc):
    """Replace the sound at ``sound_loc`` with a keep-or-drop pair, in place.

    The slot becomes ``[sound, '']``, where the empty string stands for
    dropping the sound. Returns None.
    """
    repeated_sound = pronunciation[sound_loc]
    keep_or_drop = [repeated_sound, u'']
    pronunciation[sound_loc] = keep_or_drop

def deal_with_vowels(pronunciation, sound_loc):
    """Replace the sound at ``sound_loc`` with its vowel alternatives, in place.

    The alternatives are whatever find_vowel_combinations returns for the
    second character of the current sound. Returns None.
    """
    current_sound = pronunciation[sound_loc]
    second_char = current_sound[1]
    pronunciation[sound_loc] = find_vowel_combinations(second_char)

In [110]:
def remove_empty_quotes(pronunciations):
    """Strip '' placeholders from every pronunciation.

    Example: [[SH, SH], ['', SH]] -> [[SH, SH], [SH]]

    The outer list is updated in place (each entry is replaced by a new
    filtered list) and the same outer list is returned.
    """
    for position, sounds in enumerate(pronunciations):
        pronunciations[position] = [s for s in sounds if s != '']
    return pronunciations

In [111]:
def do_two_strings_match(input_pronunciations, output_pronunciations):
    """Return True if any input pronunciation equals any output pronunciation.

    Each input pronunciation is converted to a list before comparing, so
    inputs may be tuples while outputs are expected to be lists.
    """
    for candidate in input_pronunciations:
        for target in output_pronunciations:
            if target == list(candidate):
                return True
    return False

In [112]:
def compare_two_sentences(input, output, run_everything=False):
    """Print whether ``input`` can be pronounced like ``output``.

    Both sentences are tokenised and looked up word by word. The input's
    pronunciations are expanded by post_processing, cleaned of '' markers
    and de-duplicated as a set of tuples. The output's pronunciations are
    used as-is. The boolean from do_two_strings_match is printed and
    nothing is returned.

    When ``run_everything`` is truthy, the expanded input set is also handed
    to do_everything.
    """
    input_words = tokenize_sentence(input)
    input_syllables = get_syllables_of_sentence(input_words)
    output_words = tokenize_sentence(output)
    output_syllables = get_syllables_of_sentence(output_words)

    expanded = post_processing(possible_pronunciations(input_syllables))
    cleaned = remove_empty_quotes(expanded)
    prettied_input_pronunciations = {tuple(sounds) for sounds in cleaned}

    output_pronunciations = possible_pronunciations(output_syllables)
    matched = do_two_strings_match(prettied_input_pronunciations,
                                   output_pronunciations)
    print(matched)

    if run_everything:
        do_everything(prettied_input_pronunciations)

def do_everything(pretty_input_pronunciations):
    """Print every cmudict word whose pronunciation starts a given pronunciation.

    For each pronunciation in ``pretty_input_pronunciations`` and each
    ``(word, pron)`` entry of cmudict, a ``(word, pron, pronunciation)``
    triple is collected when ``pron`` is a prefix of the pronunciation.
    The collected list is printed and nothing is returned.

    Changes from the previous version:
      * Prefixes are compared as tuples. Callers pass tuples while cmudict
        pronunciations are lists, and ``tuple == list`` is always False, so
        nothing ever matched.
      * The dictionary entries are read once instead of once per input
        pronunciation.
    The work is still len(inputs) * len(dictionary) comparisons, so it
    remains slow for large input sets.
    """
    # (word, original pron list, tuple form used for comparison)
    entries = [(word, pron, tuple(pron)) for (word, pron) in cmudict.entries()]

    matches = []
    for pronunciation in pretty_input_pronunciations:
        sounds = tuple(pronunciation)
        for word, pron, pron_key in entries:
            if sounds[:len(pron_key)] == pron_key:
                matches.append((word, pron, pronunciation))
    print(matches)

In [113]:
# Run the comparison on the sample pair; the function prints the boolean result.
compare_two_sentences(input1, output1)

True
