In [8]:
import nltk
from nltk.corpus import cmudict, wordnet
from nltk.tokenize import word_tokenize, wordpunct_tokenize

from itertools import groupby
import itertools

from collections import defaultdict
import collections

import json
import pprint

import copy
from functools import reduce

In [9]:
# Make sure the corpora imported from nltk.corpus are available locally.
# Per the log below, the call only re-checks when the data is already present.
# NOTE(review): wordnet is downloaded and imported but not used in the cells shown here.
nltk.download("cmudict")
nltk.download("wordnet")

[nltk_data] Downloading package cmudict to /home/ds/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ds/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Shared pretty-printer for inspecting nested structures.
pp = pprint.PrettyPrinter(indent=4)

# First letters that mark a sound symbol as a vowel (e.g. 'A' in 'AH0').
VOWELS = list("AEIOU")
# Stress digits tried when generating alternative vowel sounds.
NUMBER_CHOICES = [0, 1]

# Example pair: a phrase and the phrase it is meant to sound like.
input1 = "eye mull of mush sheen"
output1 = "i'm a love machine"

In [5]:
def tokenize_sentence(sentence):
    """Split ``sentence`` into words on runs of whitespace.

    Punctuation stays attached to its word, so ``"i'm"`` is one token.
    """
    words = sentence.split()
    return words


def get_syllables(w):
    """Return every pronunciation that cmudict lists for the word ``w``.

    Despite the name, each result is a pronunciation (a sequence of sound
    symbols), not a syllable split. Every call walks all of
    ``cmudict.entries()``; an unknown word yields an empty list.
    """
    matches = []
    for entry_word, sounds in cmudict.entries():
        if entry_word == w:
            matches.append(sounds)
    return matches


def get_syllables_of_sentence(sentence):
    """Look up the pronunciations of each word in ``sentence``.

    ``sentence`` is an iterable of word tokens (callers pass the output of
    ``tokenize_sentence``). The result has one entry per word, in order,
    each being whatever ``get_syllables`` returns for that word.
    """
    return list(map(get_syllables, sentence))

In [6]:
def possible_pronunciations(syllables):
    """Build every whole-sentence pronunciation from per-word alternatives.

    ``syllables`` holds, for each word, a list of alternative pronunciations
    (each a list of sounds). One alternative is picked per word in every
    possible way, and the picked sound lists are joined into one flat list.
    """
    flattened = []
    for combination in itertools.product(*syllables):
        flattened.append(list(itertools.chain.from_iterable(combination)))
    return flattened

In [7]:
def post_processing(pronunciations):
    """Expand each pronunciation into the variants allowed by the fuzzy rules.

    For every position in a pronunciation a list of acceptable sounds is built:

    * a vowel such as AH0 becomes whatever ``deal_with_vowels`` produces,
      e.g. [AH0, AH1, EH0, EH1, IH0, IH1, OH0, OH1, UH0, UH1];
    * a sound immediately followed by an identical sound may be dropped:
      [SH, SH] -> [[SH, ''], [SH]], which yields [SH, SH] and ['', SH];
    * any other sound B becomes the single choice [B].

    The cartesian product of those choices gives all variant sequences.

    Returns a new list: a deep copy of the inputs followed by every variant,
    each as a list. The argument itself is left unmodified.
    """
    expanded = copy.deepcopy(pronunciations)
    for pronunciation in pronunciations:
        # Build the choices on a copy so the caller's lists are not rewritten
        # into lists-of-choices as a side effect.
        choices = list(pronunciation)
        for sound_loc, sound in enumerate(pronunciation):
            followed_by_same = (sound_loc + 1 < len(pronunciation)
                                and pronunciation[sound_loc + 1] == sound)
            if followed_by_same:
                deal_with_duplicates(choices, sound_loc)
            # After deal_with_duplicates the slot holds a list, so [0] is the
            # whole symbol and this check does not fire for a doubled vowel.
            if choices[sound_loc][0] in VOWELS:
                deal_with_vowels(choices, sound_loc)
            # Untouched by either rule: wrap it as a single-choice list.
            if choices[sound_loc] == sound:
                choices[sound_loc] = [sound]
        # Emit lists rather than product's tuples so every element of the
        # result has the same type as the deep-copied originals.
        expanded.extend(list(variant)
                        for variant in itertools.product(*choices))
    return expanded

In [8]:
def deal_with_duplicates(pronunciation, sound_loc):
    """Replace the sound at ``sound_loc`` with the choices "keep" or "drop".

    Mutates ``pronunciation`` in place: the slot becomes ``[sound, '']``,
    where the empty string stands for leaving the sound out.
    """
    repeated_sound = pronunciation[sound_loc]
    pronunciation[sound_loc] = [repeated_sound, '']


def deal_with_vowels(pronunciation, sound_loc):
    """Replace the vowel at ``sound_loc`` with a list of alternative vowels.

    The second character of the symbol is kept. The first character is
    swapped for each entry of VOWELS and the stress digit for each entry of
    NUMBER_CHOICES. Mutates ``pronunciation`` in place.
    """
    second_char = pronunciation[sound_loc][1]
    variants = []
    for leading_vowel in VOWELS:
        for stress in NUMBER_CHOICES:
            variants.append(leading_vowel + second_char + str(stress))
    pronunciation[sound_loc] = variants

In [9]:
def remove_empty_quotes(pronunciations):
    """Strip the '' placeholders out of every pronunciation.

    Example: [[SH, SH], ['', SH]] -> [[SH, SH], [SH]].
    The outer list is updated in place (each entry is replaced by a new
    list) and the same outer list is returned.
    """
    for index, pronunciation in enumerate(pronunciations):
        pronunciations[index] = list(
            filter(lambda sound: sound != '', pronunciation))
    return pronunciations

In [10]:
def do_two_strings_match(input_pronunciations, output_pronunciations):
    """Tell whether any input pronunciation equals any output pronunciation.

    Each input pronunciation is converted to a list before comparing, so the
    inputs may be tuples while the outputs are expected to be lists.
    """
    for input_pronunciation in input_pronunciations:
        for output_pronunciation in output_pronunciations:
            if output_pronunciation == list(input_pronunciation):
                return True
    return False

In [14]:
def convert_string_to_syllables(input_string):
    """Tokenize ``input_string`` and look up the pronunciations of each word."""
    return get_syllables_of_sentence(tokenize_sentence(input_string))


def convert_input(syllables):
    """Expand per-word pronunciations into every candidate sound sequence.

    Runs the full pipeline: combine the words, apply the fuzzy rules, then
    drop the '' placeholders those rules introduce.
    """
    return remove_empty_quotes(post_processing(possible_pronunciations(syllables)))


# Look each sentence up once and reuse the result below; a dictionary lookup
# is the expensive step, so it should not be repeated per sentence.
input1_syllables = convert_string_to_syllables(input1)
output1_syllables = convert_string_to_syllables(output1)

input_pronunciations = convert_input(input1_syllables)
# Tuples are hashable, so the set removes duplicate candidates.
prettied_input_pronunciations = {tuple(x) for x in input_pronunciations}
output_pronunciations = possible_pronunciations(output1_syllables)

In [15]:
# Does any expanded pronunciation of input1 equal a pronunciation of output1?
do_two_strings_match(prettied_input_pronunciations, output_pronunciations)

True

In [16]:
# Every distinct sound symbol in cmudict whose first letter is in VOWELS.
VOWELS_IN_CMUDICT = {
    sound
    for _word, sounds in cmudict.entries()
    for sound in sounds
    if sound[0] in VOWELS
}
# Word -> pronunciations mapping; get_syllables also uses it as its cache.
CMUDICT_DICT = cmudict.dict()
# Memo for find_vowel_combinations: second letter -> matching vowel sounds.
VOWELS_DICT = {}

In [17]:
def find_vowel_combinations(middle_letter):
    """Return the cmudict vowel sounds whose second character is ``middle_letter``.

    Results are memoised in VOWELS_DICT. The cached list object itself is
    returned, so callers should not modify it.
    """
    if middle_letter in VOWELS_DICT:
        return VOWELS_DICT[middle_letter]
    matches = []
    for pro in VOWELS_IN_CMUDICT:
        if pro[1] == middle_letter:
            matches.append(pro)
    VOWELS_DICT[middle_letter] = matches
    return matches

In [18]:
def get_syllables(w):
    """Return the pronunciations of ``w``, using CMUDICT_DICT as a cache.

    On a cache miss every cmudict entry is scanned and the result (possibly
    an empty list) is stored, so the scan happens at most once per word.
    """
    if w in CMUDICT_DICT:
        return CMUDICT_DICT[w]
    # NOTE(review): CMUDICT_DICT is seeded from cmudict.dict(), so this scan
    # presumably never finds anything new -- confirm before simplifying.
    found = []
    for word, pron in cmudict.entries():
        if word == w:
            found.append(pron)
    CMUDICT_DICT[w] = found
    return found

In [1]:
def possible_pronunciations(syllables):
    """Build every whole-sentence pronunciation from per-word alternatives.

    input:   [[[u'AH1', u'V'], [u'AH0', u'V']], [[u'W']]]
    choices: [([u'AH1', u'V'], [u'W']), ([u'AH0', u'V'], [u'W'])]
    output:  [[u'AH1', u'V', u'W'], [u'AH0', u'V', u'W']]
    """
    # Pick one alternative per word, then flatten the picked words' sounds.
    return [
        [sound for word in one_pronunciation for sound in word]
        for one_pronunciation in itertools.product(*syllables)
    ]

In [2]:
def post_processing(pronunciations):
    """Expand each pronunciation into the variants allowed by the fuzzy rules.

    For every position in a pronunciation a list of acceptable sounds is built:

    * a vowel such as AH0 becomes the alternatives chosen by
      ``deal_with_vowels`` (roughly [AH0, AH1, EH0, EH1, ...], not exactly);
    * a sound immediately followed by an identical sound may be dropped:
      [SH, SH] -> [[SH, ''], [SH]], which yields [SH, SH] and ['', SH];
    * any other sound B becomes the single choice [B].

    The cartesian product of those choices gives all variant sequences.

    Returns a new list: a deep copy of the inputs followed by every variant,
    each as a list. The argument itself is left unmodified.
    """
    expanded = copy.deepcopy(pronunciations)
    for pronunciation in pronunciations:
        # Build the choices on a copy so the caller's lists are not rewritten
        # into lists-of-choices as a side effect.
        choices = list(pronunciation)
        for sound_loc, sound in enumerate(pronunciation):
            followed_by_same = (sound_loc + 1 < len(pronunciation)
                                and pronunciation[sound_loc + 1] == sound)
            if followed_by_same:
                deal_with_duplicates(choices, sound_loc)
            # After deal_with_duplicates the slot holds a list, so [0] is the
            # whole symbol and this check does not fire for a doubled vowel.
            if choices[sound_loc][0] in VOWELS:
                deal_with_vowels(choices, sound_loc)
            # Untouched by either rule: wrap it as a single-choice list.
            if choices[sound_loc] == sound:
                choices[sound_loc] = [sound]
        expanded.extend(map(list, itertools.product(*choices)))
    return expanded

In [3]:
def deal_with_duplicates(pronunciation, sound_loc):
    """Turn the sound at ``sound_loc`` into the choices "keep it" or "drop it".

    Mutates ``pronunciation`` in place; the empty string marks the dropped
    alternative.
    """
    keep_or_drop = [pronunciation[sound_loc]]
    keep_or_drop.append(u'')
    pronunciation[sound_loc] = keep_or_drop

def deal_with_vowels(pronunciation, sound_loc):
    """Replace the vowel at ``sound_loc`` with its list of alternatives.

    The alternatives come from ``find_vowel_combinations``, keyed on the
    second character of the current symbol. Mutates ``pronunciation``.
    """
    current_sound = pronunciation[sound_loc]
    pronunciation[sound_loc] = find_vowel_combinations(current_sound[1])

In [4]:
def remove_empty_quotes(pronunciations):
    """Drop the '' placeholders from each pronunciation.

    Turns [[SH, SH], ['', SH]] into [[SH, SH], [SH]]. Each entry of the
    outer list is replaced in place and the outer list is returned.
    """
    for position, sounds in enumerate(pronunciations):
        kept = []
        for sound in sounds:
            if sound != '':
                kept.append(sound)
        pronunciations[position] = kept
    return pronunciations

In [5]:
def do_two_strings_match(input_pronunciations, output_pronunciations):
    """Report whether some input pronunciation equals some output one.

    Inputs are converted to lists for the comparison, so they may be tuples;
    outputs are compared as they are.
    """
    return any(
        any(candidate == list(input_pronunciation)
            for candidate in output_pronunciations)
        for input_pronunciation in input_pronunciations
    )

In [6]:
def compare_two_sentences(input, output, run_everything=False):
    """Print whether ``input`` can be pronounced like ``output``.

    ``input`` is expanded with the fuzzy rules; ``output`` is only expanded
    across its dictionary pronunciations. The True/False result is printed,
    not returned. With ``run_everything`` set, the expanded input
    candidates are also passed to ``do_everything``.
    """
    syllables_in = get_syllables_of_sentence(tokenize_sentence(input))
    syllables_out = get_syllables_of_sentence(tokenize_sentence(output))

    expanded = post_processing(possible_pronunciations(syllables_in))
    cleaned = remove_empty_quotes(expanded)
    # Tuples are hashable, so the set removes duplicate candidates.
    unique_candidates = {tuple(sounds) for sounds in cleaned}

    targets = possible_pronunciations(syllables_out)
    print(do_two_strings_match(unique_candidates, targets))
    if run_everything:
        do_everything(unique_candidates)

def do_everything(pretty_input_pronunciations):
    """Print every dictionary word whose pronunciation starts a candidate.

    For each candidate sound sequence, each cmudict entry whose pronunciation
    is a prefix of that candidate is collected as
    ``(word, pron, candidate)``, and the resulting list is printed.

    Still slow: the work is (number of candidates) x (number of entries).
    """
    # Read the dictionary once instead of once per candidate.
    entries = list(cmudict.entries())
    # The caller passes tuples while dictionary pronunciations are lists, and
    # a tuple never compares equal to a list, so compare both sides as lists.
    a = [(word, pron, pronunciation)
         for pronunciation in pretty_input_pronunciations
         for (word, pron) in entries
         if list(pronunciation[:len(pron)]) == list(pron)]
    print(a)

In [10]:
# Word -> pronunciations mapping; iterated further down to fill the sound trie.
d = cmudict.dict()
def tree():
    """Return a nested dict whose missing keys become new sub-trees on access."""
    return defaultdict(tree)


# Scratch tree used to try the structure out.
users = tree()

In [11]:
# Intermediate keys ('harold', 'handler') are created on first access by tree().
users['harold']['username'] = 'hrldcpr'
users['handler']['username'] = 'matthandlersux'

In [12]:
# A tree serialises to JSON like a plain nested dict (see the output below).
print(json.dumps(users))
print(json.dumps(users['harold']))
users['harold']['username']

{"harold": {"username": "hrldcpr"}, "handler": {"username": "matthandlersux"}}
{"username": "hrldcpr"}


'hrldcpr'

In [13]:
def add(dictionary, sounds, word):
    """Store ``word`` in the trie ``dictionary`` under the path ``sounds``.

    Descends one level per sound, then records the word under the key
    'value' at the node reached. A word already stored at that node is
    overwritten.
    """
    node = reduce(lambda current, sound: current[sound], sounds, dictionary)
    node['value'] = word

# Root of the pronunciation trie; each level is keyed by one sound symbol.
cmu = tree()

In [14]:
# Insert every pronunciation p of every word x into the trie.
# Words that share a pronunciation overwrite each other's 'value' (see add).
for x in d:
    for p in d[x]:
        add(cmu, p, x)

In [15]:
# Top-level keys are the sounds that start at least one pronunciation.
cmu.keys()

dict_keys(['G', 'DH', 'V', 'AE0', 'HH', 'F', 'AY0', 'D', 'AO0', 'UW1', 'EH2', 'IY2', 'AW2', 'TH', 'AA0', 'AE2', 'IH2', 'IY1', 'M', 'IH1', 'AW0', 'T', 'AH2', 'AW1', 'K', 'AY2', 'L', 'S', 'AH0', 'OY2', 'EH1', 'UH1', 'Z', 'OY1', 'AE1', 'ER2', 'OW0', 'AH1', 'AA1', 'Y', 'EY1', 'ER1', 'EH0', 'JH', 'W', 'R', 'ZH', 'AO2', 'OW2', 'ER0', 'UW0', 'N', 'AY1', 'EY0', 'SH', 'P', 'UW2', 'CH', 'B', 'AA2', 'EY2', 'OW1', 'AO1', 'IY0', 'IH0'])

In [16]:
# Round-trip through JSON to get plain dicts instead of defaultdicts --
# presumably so a missing key raises KeyError instead of silently creating
# a new branch (TODO confirm intent).
cmud = json.loads(json.dumps(cmu))

In [18]:
# Inspect the sub-trie of pronunciations that begin with the sounds S, AY1.
pprint.pprint(cmud['S']['AY1'])

{'AE0': {'D': {'Z': {'value': "sayad's"}, 'value': 'sayad'},
         'K': {'value': 'cyacq'},
         'M': {'value': 'siam'}},
 'AH0': {'N': {'AH0': {'Z': {'IY2': {'N': {'value': 'cyanazine'}}}},
               'AY2': {'D': {'value': 'cyanide'}},
               'S': {'AH0': {'Z': {'value': 'sciences'}},
                     'IH0': {'Z': {'value': "science's"}},
                     'value': 'science'},
               'T': {'IH0': {'S': {'T': {'S': {'value': "scientist's"},
                                         'value': 'scientist'},
                                   'value': 'scientists'}}},
               'value': 'scion'}},
 'B': {'AH0': {'L': {'value': 'sible'},
               'Z': {'value': "ciba's"},
               'value': 'ciba'},
       'AO0': {'R': {'G': {'value': 'cyborg'}}},
       'ER0': {'K': {'AE2': {'SH': {'value': 'cybercash'}}},
               'L': {'IH0': {'K': {'value': 'seiberlich'},
                             'NG': {'value': 'seiberling'}}},
               

In [19]:
# The word stored for the pronunciation that is exactly S AY1.
cmud['S']['AY1']['value']

'sigh'