#### Import statements

In [1]:
import nltk
import json
import re
import pickle
from time import time
from tqdm.notebook import tqdm
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

#### Arpabet Setup

In [2]:
#https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
#Set arpabet to the nltk cmudict corpus. If cmudict is unavailable, download it first, then set.
# Load the CMU pronouncing dictionary; on a LookupError (corpus not installed)
# download it and load again.
try:
    arpabet = nltk.corpus.cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    arpabet = nltk.corpus.cmudict.dict()

# Every entry holds a list of pronunciations; keep only the first one per word.
arpabet = {word: pronunciations[0] for word, pronunciations in arpabet.items()}

# Parallel lists: accepted_phonemes[i] is the pronunciation of accepted_words[i].
accepted_words = list(arpabet)
accepted_phonemes = list(arpabet.values())

#### Define Functions

In [12]:
# Define translation from word to phonetics
# If word not in cmudict dictionary, find closest match (Jaccard Distance)
def word_to_phonetic(word):
    """Return the phoneme list for ``word``.

    An exact entry in the module-level ``arpabet`` dictionary is returned as-is.
    Otherwise the pronunciation of the closest entry of ``accepted_words`` is
    returned, where "closest" is the smallest Jaccard distance between the
    character-bigram sets of the two words.

    Args:
        word: Word to transcribe. It is looked up verbatim; no case folding or
            stripping is done here.

    Returns:
        The value stored in ``arpabet`` for the word or for its nearest match,
        or an empty list when ``word`` is empty and has no entry.

    Raises:
        ValueError: If ``accepted_words`` is empty, so nothing can be matched.
        KeyError: If the nearest match from ``accepted_words`` is missing from
            ``arpabet`` (the two globals are out of sync).
    """
    try:
        return arpabet[word]
    except KeyError:
        # Only a missing entry triggers the fuzzy match; any other error propagates.
        pass

    if not word:
        # An empty token has no letters to match on and no phonemes.
        return []

    # Loop-invariant: build the query's bigram set once, not once per candidate.
    word_bigrams = set(ngrams(word, 2))

    def distance(candidate):
        candidate_bigrams = set(ngrams(candidate, 2))
        if not word_bigrams and not candidate_bigrams:
            # Both strings are shorter than two characters: jaccard_distance
            # would divide by zero, so treat them as maximally distant.
            return 1.0
        return jaccard_distance(word_bigrams, candidate_bigrams)

    # Prefer candidates sharing the first letter; if there are none, search the
    # whole vocabulary instead of failing on an empty candidate list.
    candidates = [w for w in accepted_words if w[:1] == word[0]] or accepted_words
    return arpabet[min(candidates, key=distance)]

# Define translation from phonetics to words
# If the phoneme sequence is not in the cmudict dictionary, find the closest match (Jaccard Distance)
def phonetic_to_word(phonemes):
    """Return the word whose pronunciation is, or is closest to, ``phonemes``.

    ``accepted_phonemes`` and ``accepted_words`` are module-level parallel
    lists. An exact match in ``accepted_phonemes`` returns the word at the same
    index. Otherwise the entry with the smallest Jaccard distance between
    phoneme-bigram sets is used; ties go to the earliest entry.

    Args:
        phonemes: Sequence of phoneme symbols for one word.

    Returns:
        The matching entry of ``accepted_words``, or an empty string when
        ``phonemes`` is empty and has no exact entry.

    Raises:
        ValueError: If ``accepted_phonemes`` is empty, so nothing can be matched.
    """
    try:
        return accepted_words[accepted_phonemes.index(phonemes)]
    except ValueError:
        # list.index signals "not found" with ValueError; fall through to the
        # fuzzy match. Any other error propagates.
        pass

    if not phonemes:
        # No phonemes means no word.
        return ''

    # Loop-invariant: build the query's bigram set once, not once per candidate.
    target_bigrams = set(ngrams(phonemes, 2))

    def distance(index):
        candidate_bigrams = set(ngrams(accepted_phonemes[index], 2))
        if not target_bigrams and not candidate_bigrams:
            # Both sequences hold fewer than two phonemes: jaccard_distance
            # would divide by zero, so treat them as maximally distant.
            return 1.0
        return jaccard_distance(target_bigrams, candidate_bigrams)

    # Minimising over indices yields the word directly, without a second
    # linear .index() scan over accepted_phonemes.
    return accepted_words[min(range(len(accepted_phonemes)), key=distance)]

# Convert list of sentences to list of phoneme lists
def sentences_to_phonemes(data):
    """Transcribe every sentence in ``data`` while showing a tqdm progress bar.

    Returns a list with one entry per sentence, each being the result of
    ``sentence_to_phonemes`` for that sentence.
    """
    transcriptions = []
    for sentence in tqdm(data):
        transcriptions.append(sentence_to_phonemes(sentence))
    return transcriptions

def sentence_to_phonemes(sentence):
    """Transcribe one sentence into a list of per-word phoneme lists.

    Args:
        sentence: String of whitespace-separated words.

    Returns:
        A list holding the result of ``word_to_phonetic`` for each word, in order.
    """
    # split() with no argument discards the empty tokens that split(' ')
    # produces for doubled, leading or trailing spaces, so no empty "word"
    # is ever passed to word_to_phonetic.
    return [word_to_phonetic(word) for word in sentence.split()]

# Convert list of phoneme lists to sentences
def phonemes_to_sentence(data):
    """Turn every phoneme-encoded sentence in ``data`` back into text.

    Iterates under a tqdm progress bar and returns a list with the result of
    ``phoneme_to_sentence`` for each item.
    """
    sentences = []
    for encoded_sentence in tqdm(data):
        sentences.append(phoneme_to_sentence(encoded_sentence))
    return sentences

def phoneme_to_sentence(phonemes):
    """Rebuild one sentence from its per-word phoneme lists.

    Each item of ``phonemes`` is passed to ``phonetic_to_word``; the resulting
    words are joined with single spaces.
    """
    words = map(phonetic_to_word, phonemes)
    return ' '.join(words)

# Pre-processing of *.txt into sentences
def text_to_sentences(data, split_str=r'\.|\!|\?', remove_chars=r'[^a-zA-Z ]+'):
    """Split raw text into cleaned, lower-case sentences.

    Args:
        data: Raw text.
        split_str: Regular expression marking sentence boundaries; the default
            splits on ``.``, ``!`` and ``?``.
        remove_chars: Regular expression for characters to delete from each
            lower-cased sentence; the default removes everything except ASCII
            letters and spaces.

    Returns:
        A list of non-empty sentences with words separated by single spaces.
    """
    sentences = []
    for chunk in re.split(split_str, data):
        # Turn every whitespace character (newline, tab, ...) into a space
        # before filtering, so remove_chars cannot fuse words that were
        # separated by something other than a plain space.
        chunk = re.sub(r'\s', ' ', chunk)
        chunk = re.sub(remove_chars, '', chunk.lower())
        # Collapse runs of spaces and trim both ends.
        chunk = ' '.join(chunk.split())
        if chunk:
            sentences.append(chunk)
    return sentences

# Output phonemes in json format
def to_json(jsonOutput_name, data):
    """Write nested phoneme data to a JSON file.

    ``data`` is iterated three levels deep (sentences, then words, then
    phonemes). Each level becomes a JSON object keyed by 1-based position:
    ``"Sentence i"`` -> ``"Word j"`` -> ``"Phoneme k"`` -> phoneme.

    Args:
        jsonOutput_name: Path of the file to create or overwrite.
        data: Sentences, each an iterable of words, each an iterable of phonemes.
    """
    data_as_dict = {
        "Sentence " + str(sentence_no): {
            "Word " + str(word_no): {
                "Phoneme " + str(phoneme_no): phoneme
                for phoneme_no, phoneme in enumerate(word, start=1)
            }
            for word_no, word in enumerate(sentence, start=1)
        }
        for sentence_no, sentence in enumerate(data, start=1)
    }
    # The context manager closes the file even if serialisation or the write fails.
    with open(jsonOutput_name, "w") as jsonOutput:
        json.dump(data_as_dict, jsonOutput, indent=4)


# Import phonemes from json format
def from_json(jsonInput_file):
    """Read nested phoneme data from a JSON file into nested lists.

    The file is expected to hold three levels of JSON objects. The keys at
    every level are discarded and only the values are kept, in file order.

    Args:
        jsonInput_file: Path of the JSON file to read.

    Returns:
        A list of sentences, each a list of words, each a list of the
        innermost values.
    """
    # The context manager closes the file even if parsing fails.
    with open(jsonInput_file) as json_input:
        nested = json.load(json_input)
    return [
        [list(word.values()) for word in sentence.values()]
        for sentence in nested.values()
    ]

def arpabet_recompiler(arpabet, data, export_file='arpabet.pkl'):
    """Prune ``arpabet`` to the vocabulary used in ``data`` and pickle it.

    An entry is kept when its word occurs in ``data``, or when it is the
    closest known word (smallest Jaccard distance over character bigrams, among
    entries sharing the first letter) to a word of ``data`` that has no entry.
    A word with no entry and no known word sharing its first letter
    contributes nothing.

    Args:
        arpabet: Dictionary mapping words to pronunciations. It is modified
            IN PLACE: every entry that is not kept is deleted.
        data: Raw text whose vocabulary decides which entries are kept.
        export_file: Path the pruned dictionary is pickled to.

    Returns:
        The same ``arpabet`` object, after pruning.
    """
    #Data preprocessing: whitespace -> spaces (so tab/newline separated words
    #are not fused), lower-case, letters only, then one token per word.
    data = re.sub(r'\s', ' ', data)
    data = re.sub(r'[^a-zA-Z ]+', '', data.lower())
    words = set(data.split())

    # Bucket the known words by first letter once, taken from the dictionary
    # that was passed in rather than from a module-level list that may describe
    # a different dictionary.
    known_by_initial = {}
    for known in arpabet:
        known_by_initial.setdefault(known[:1], []).append(known)

    #Removing all unused arpabet words
    keep = set(words)
    for word in tqdm(words - arpabet.keys()):
        candidates = known_by_initial.get(word[0])
        if not candidates:
            # Nothing shares the first letter: no nearest match to preserve.
            continue
        # Loop-invariant for the min() below: compute once per unknown word.
        word_bigrams = set(ngrams(word, 2))
        keep.add(min(candidates, key=lambda w: jaccard_distance(word_bigrams, set(ngrams(w, 2)))))
    for unused in set(arpabet) - keep:
        del arpabet[unused]

    #Export new arpabet dictionary; the context manager closes the file even
    #if pickling fails.
    with open(export_file, "wb") as file:
        pickle.dump(arpabet, file)

    return arpabet

def arpabet_reader(import_file):
    """Load a pickled pronunciation dictionary.

    Args:
        import_file: Path of the pickle file to read.

    Returns:
        A tuple ``(arpabet, words, phonemes)``: the unpickled dictionary, a
        list of its keys and a list of its values, the two lists in matching
        order.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only pass files from a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(import_file, "rb") as file:
        arpabet = pickle.load(file)
    return arpabet, list(arpabet.keys()), list(arpabet.values())

#### English/Phonetics Translation Example usage

In [13]:
start_time = time()

print("----- Recompiling Arpabet -----")
# Read the corpus once and reuse the text; the context manager closes the file
# immediately instead of keeping a handle open for the whole run.
with open("bible.txt") as file:
    corpus_text = file.read()
arpabet = arpabet_recompiler(arpabet, corpus_text)
print("")

# Only the first 500 sentences are transcribed in this example.
data = text_to_sentences(corpus_text)
data = data[:500]
# Reload from the exported pickle so accepted_words / accepted_phonemes are
# rebuilt from the pruned dictionary.
arpabet, accepted_words, accepted_phonemes = arpabet_reader('arpabet.pkl')

print("----- Transcribing sentences to phonemes -----")
p = sentences_to_phonemes(data)
print("Example phonetic transcription:", p[0], "\n")

print("----- Transcribing phonemes to sentences -----")
s = phonemes_to_sentence(p)
print("Example English transcription:", s[0],"\n")

print("--- Total Time Elapsed: %s seconds ---\n" % (time() - start_time))

# Round-trip the phonemes through JSON and confirm nothing was lost.
to_json("example.json", p)
f = from_json('example.json')
print("Export/import path matches original values: ", f == p)

----- Recompiling Arpabet -----


  0%|          | 0/5379 [00:00<?, ?it/s]


----- Transcribing sentences to phonemes -----


  0%|          | 0/500 [00:00<?, ?it/s]

Example phonetic transcription: [['IH0', 'N'], ['DH', 'AH0'], ['B', 'IH0', 'G', 'IH1', 'N', 'IH0', 'NG'], ['G', 'AA1', 'D'], ['K', 'R', 'IY0', 'EY1', 'T', 'AH0', 'D'], ['DH', 'AH0'], ['HH', 'EH1', 'V', 'AH0', 'N'], ['AH0', 'N', 'D'], ['DH', 'AH0'], ['ER1', 'TH']] 

----- Transcribing phonemes to sentences -----


  0%|          | 0/500 [00:00<?, ?it/s]

Example English transcription: in the beginning god created the heaven and the earth 

--- Total Time Elapsed: 17.870779514312744 seconds ---

Export/import path matches original values:  True
