#### Import statements

In [20]:
import nltk
import json
import re
from time import time
from tqdm.notebook import tqdm
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

#### Arpabet Setup

In [21]:
# Pronunciation table built from the CMU Pronouncing Dictionary, following
# https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
# If the corpus is not available locally it is downloaded, then loaded again.
try:
    arpabet = nltk.corpus.cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    arpabet = nltk.corpus.cmudict.dict()

# Every entry lists one or more pronunciations; keep only the first of each.
arpabet = {entry: variants[0] for entry, variants in arpabet.items()}

# Parallel lists: accepted_phonemes[i] is the pronunciation of accepted_words[i].
accepted_words = list(arpabet)
accepted_phonemes = list(arpabet.values())

In [22]:
# Define translation from word to phonetics
# If word not in cmudict dictionary, find closest match (Jaccard Distance)
def word_to_phonetic(word):
    """Return the phoneme list stored in ``arpabet`` for ``word``.

    Words missing from ``arpabet`` are approximated by the entry of
    ``accepted_words`` whose set of character bigrams has the smallest
    Jaccard distance to the bigrams of ``word``. Candidates are limited to
    words starting with the same character; if there are none, every accepted
    word is considered. Ties go to the earliest candidate.

    Parameters
    ----------
    word : str
        Word to transcribe, in the same casing as the ``arpabet`` keys.

    Returns
    -------
    list
        The phonemes of the word or of its closest match. An empty word has
        no pronunciation and yields an empty list.
    """
    if not word:
        # Nothing to look up; previously this crashed on word[0].
        return []

    try:
        return arpabet[word]
    except KeyError:
        # Only a missing key should trigger the approximate search.
        pass

    # Loop-invariant: the query bigrams are computed once, not per candidate.
    word_bigrams = set(ngrams(word, 2))
    same_initial = [w for w in accepted_words if w[0] == word[0]]
    candidates = same_initial or accepted_words

    def bigram_distance(candidate):
        candidate_bigrams = set(ngrams(candidate, 2))
        if not (word_bigrams or candidate_bigrams):
            # Neither string has a bigram: there is no evidence of similarity,
            # and this avoids a division by zero inside jaccard_distance.
            return 1.0
        return jaccard_distance(word_bigrams, candidate_bigrams)

    return arpabet[min(candidates, key=bigram_distance)]

# Define translation from phonetics to words
# If the phoneme sequence is not in the cmudict dictionary, find closest match (Jaccard Distance)
def phonetic_to_word(phonemes):
    """Return the word in ``accepted_words`` pronounced as ``phonemes``.

    An exact match against ``accepted_phonemes`` is tried first; the first
    matching entry wins, so words sharing a pronunciation all map back to
    whichever of them is listed first. Without an exact match, the entry whose
    set of phonemes has the smallest Jaccard distance to ``set(phonemes)`` is
    used (earliest entry on ties).

    Parameters
    ----------
    phonemes : sequence of str
        Phonemes of a single word. Any sequence is accepted; it is compared
        as a list.

    Returns
    -------
    str
        The matching (or closest) word, or ``''`` for an empty sequence.
    """
    phonemes = list(phonemes)
    if not phonemes:
        # No phonemes means no word; avoid returning an arbitrary entry.
        return ''

    try:
        return accepted_words[accepted_phonemes.index(phonemes)]
    except ValueError:
        # list.index signals "not found" with ValueError; fall through to the
        # approximate search.
        pass

    target = set(phonemes)
    # Search by position so the word can be read off directly instead of
    # running a second linear index() lookup on the winning pronunciation.
    nearest = min(
        range(len(accepted_phonemes)),
        key=lambda i: jaccard_distance(target, set(accepted_phonemes[i])),
    )
    return accepted_words[nearest]

In [23]:
# Pre-processing of *.txt into sentences
def text_to_sentences(data, split_str=r'\.|\!|\?', remove_chars=r'[^a-zA-Z ]+'):
    """Split raw text into lower-cased, cleaned sentences.

    Parameters
    ----------
    data : str
        The raw text.
    split_str : str
        Regular expression marking sentence boundaries. The default splits on
        ``.``, ``!`` and ``?``; it is written as a raw string so the
        backslashes are not read as (invalid) string escapes.
    remove_chars : str
        Regular expression of characters to delete from each lower-cased
        sentence. The default keeps only ASCII letters and spaces.

    Returns
    -------
    list of str
        One entry per non-empty sentence, with words separated by single
        spaces and no leading or trailing space. Fragments that are empty
        after cleaning (for example the text after the final full stop) are
        dropped.
    """
    sentences = []
    for fragment in re.split(split_str, data):
        # Turn every whitespace run (newlines, tabs, ...) into one space
        # before filtering, so words separated by them are not glued together.
        fragment = re.sub(r'\s+', ' ', fragment)
        fragment = re.sub(remove_chars, '', fragment.lower())
        # Removing characters can leave doubled spaces behind ("a 1 b").
        fragment = re.sub(' +', ' ', fragment).strip()
        if fragment:
            sentences.append(fragment)
    return sentences

In [24]:
# Transcribe a list of sentences into a list of per-sentence phoneme lists
def sentences_to_phonemes(data):
    """Apply ``sentence_to_phonemes`` to each sentence in ``data``.

    The iteration is wrapped in ``tqdm`` to show progress. Returns a list
    with one transcription per input sentence, in the same order.
    """
    transcriptions = []
    for text in tqdm(data):
        transcriptions.append(sentence_to_phonemes(text))
    return transcriptions

def sentence_to_phonemes(sentence):
    """Transcribe one sentence into a list of phoneme lists, one per word.

    Words are taken to be the whitespace-separated tokens of ``sentence``.
    Splitting with no argument skips empty tokens, so an empty sentence gives
    an empty list and stray leading, trailing or doubled spaces never pass an
    empty string to ``word_to_phonetic``.
    """
    return [word_to_phonetic(word) for word in sentence.split()]

# Turn a list of per-sentence phoneme lists back into a list of sentences
def phonemes_to_sentence(data):
    """Apply ``phoneme_to_sentence`` to each transcription in ``data``.

    The iteration is wrapped in ``tqdm`` to show progress. Returns a list
    with one sentence string per input transcription, in the same order.
    """
    sentences = []
    for transcription in tqdm(data):
        sentences.append(phoneme_to_sentence(transcription))
    return sentences

def phoneme_to_sentence(phonemes):
    """Rebuild one sentence from its per-word phoneme lists.

    Each element of ``phonemes`` is passed to ``phonetic_to_word``; the
    resulting words are joined with single spaces.
    """
    words = [phonetic_to_word(word_phonemes) for word_phonemes in phonemes]
    sentence = ' '.join(words)
    return sentence

In [25]:
# Output phonemes in json format
def to_json(jsonOutput_name, data):
    """Write nested phoneme lists to ``jsonOutput_name`` as indented JSON.

    ``data`` is iterated as sentences -> words -> phonemes. The file holds
    nested objects keyed ``"Sentence i"`` / ``"Word j"`` / ``"Phoneme k"``,
    all numbered from 1, with the phonemes as the innermost values.

    Parameters
    ----------
    jsonOutput_name : str
        Path of the file to write; an existing file is overwritten.
    data : iterable
        Sentences, each an iterable of words, each an iterable of phonemes.
    """
    data_as_dict = {
        "Sentence " + str(sentence_no): {
            "Word " + str(word_no): {
                "Phoneme " + str(phoneme_no): phoneme
                for phoneme_no, phoneme in enumerate(word, start=1)
            }
            for word_no, word in enumerate(sentence, start=1)
        }
        for sentence_no, sentence in enumerate(data, start=1)
    }
    # The context manager closes the file even if serialising or writing fails.
    with open(jsonOutput_name, "w") as jsonOutput:
        jsonOutput.write(json.dumps(data_as_dict, indent=4))


# Import phonemes from json format
def from_json(jsonInput_file):
    """Load nested phoneme lists from a JSON file laid out as ``to_json`` writes it.

    The keys are discarded; only the values are kept, in the order they appear
    in the file.

    Parameters
    ----------
    jsonInput_file : str
        Path of the JSON file to read.

    Returns
    -------
    tuple
        One list per sentence; each sentence is a list of words and each word
        a list of phonemes.
    """
    # The context manager closes the file once parsing finishes or fails.
    with open(jsonInput_file) as jsonInput:
        data = json.load(jsonInput)
    return tuple(
        [list(word.values()) for word in sentence.values()]
        for sentence in data.values()
    )

In [26]:
start_time = time()

# Read and split the corpus; the context manager closes the file even if
# reading or sentence splitting raises.
with open("bible.txt") as file:
    data = text_to_sentences(file.read())
data = data[:5]  # only the first five sentences are transcribed

# Index of the sentence printed as an example below; it must be smaller than
# len(data). For a random example use random.randrange(len(data)) instead.
r = 4

print("----- Transcribing sentences to phonemes -----")
p = sentences_to_phonemes(data)

print("Example phonetic transcription:", p[r])

# Get English equivalents of the phonetic transcription
print("----- Transcribing phonemes to sentences -----")
s = phonemes_to_sentence(p)

print("Example English transcription:", s[r])

print("--- Total Time Elapsed: %s seconds ---" % (time() - start_time))

# Persist the phonetic transcription (written after the timing report).
to_json("bible.json", p)

----- Transcribing sentences to phonemes -----


  0%|          | 0/5 [00:00<?, ?it/s]

Example phonetic transcription: [['AH0', 'N', 'D'], ['G', 'AA1', 'D'], ['S', 'AO1'], ['DH', 'AH0'], ['L', 'AY1', 'T'], ['DH', 'AE1', 'T'], ['IH1', 'T'], ['W', 'AA1', 'Z'], ['G', 'UH1', 'D'], ['AH0', 'N', 'D'], ['G', 'AA1', 'D'], ['D', 'IH0', 'V', 'AY1', 'D', 'AH0', 'D'], ['DH', 'AH0'], ['L', 'AY1', 'T'], ['F', 'R', 'AH1', 'M'], ['DH', 'AH0'], ['D', 'AA1', 'R', 'K', 'N', 'AH0', 'S']]
----- Transcribing phonemes to sentences -----


  0%|          | 0/5 [00:00<?, ?it/s]

Example English transcription: and god saw the light that it waas good and god divided the light from the darkness
--- Total Time Elapsed: 0.3593268394470215 seconds ---
