Run this if nltk is not installed in the kernel

In [None]:
# Install nltk with conda, passing the running interpreter's sys.prefix as the
# target --prefix. The leading `!` is IPython/Jupyter shell syntax, so this
# cell only works inside a notebook kernel, not as a plain Python script.
import sys
!conda install --yes --prefix {sys.prefix} nltk

Import Statements

In [None]:
import nltk
import json
import re
import os
import time
from difflib import SequenceMatcher
import numpy as np

Arpabet Setup

In [None]:
# Source: https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
# Set `arpabet` to the nltk cmudict corpus loaded as a dict. If the corpus is
# unavailable (LookupError), download it first and then load it.
try:
    arpabet = nltk.corpus.cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    arpabet = nltk.corpus.cmudict.dict()

Define Functions

In [None]:
def word_to_phonetic(arpabet, word):
    """Translate a word into its phonetic representation.

    The word is lower-cased and looked up in ``arpabet``. If it is not a key,
    the entry of the most similar key (highest ``SequenceMatcher`` ratio, first
    one wins on ties) is used instead.

    Args:
        arpabet: Mapping from word to its pronunciation(s), e.g. the nltk
            cmudict dictionary.
        word: The word to translate.

    Returns:
        The first element of the entry when the entry is a list (the first
        pronunciation), otherwise the entry itself.

    Raises:
        ValueError: If the word is unknown and ``arpabet`` is empty, so no
            closest match exists.
    """
    word = word.lower()
    try:
        phonetic = arpabet[word]
    except KeyError:
        # Only a missing key triggers the fuzzy fallback; any other error
        # (or a KeyboardInterrupt during the slow scan) must propagate.
        if not arpabet:
            raise ValueError("cannot find a closest match in an empty dictionary")
        closest = max(arpabet, key=lambda key: SequenceMatcher(None, word, key).ratio())
        phonetic = arpabet[closest]
    if isinstance(phonetic, list):
        phonetic = phonetic[0]
    return phonetic


def phonetic_to_word(arpabet, phonemes):
    """Translate a phonetic entry back into a word.

    First looks for a key whose whole entry equals ``phonemes``. If none
    exists, compares ``phonemes[0]`` against the first pronunciation of every
    entry and returns the key with the highest ``SequenceMatcher`` ratio
    (first one wins on ties).

    Args:
        arpabet: Mapping from word to a list of pronunciations, e.g. the nltk
            cmudict dictionary.
        phonemes: A list of pronunciations in the same shape as the values of
            ``arpabet``; only its first element is used for the fuzzy match.

    Returns:
        The matching (or closest matching) key of ``arpabet``.

    Raises:
        IndexError: If there is no exact match and ``phonemes`` is empty.
        ValueError: If ``arpabet`` is empty, so no closest match exists.
    """
    # Exact match on the whole entry, scanning in insertion order.
    for word, pronunciations in arpabet.items():
        if pronunciations == phonemes:
            return word

    target = phonemes[0]
    if not arpabet:
        raise ValueError("cannot find a closest match in an empty dictionary")
    # Fuzzy fallback: compare against the first pronunciation of each word.
    return max(
        arpabet,
        key=lambda word: SequenceMatcher(None, target, arpabet[word][0]).ratio(),
    )


def text_to_sentences(data, split_str, remove_chars, r, toLowerCase = True):
    """Pre-process raw text into a list of cleaned sentences.

    The text is split on the ``split_str`` regex. In each piece, newlines
    become spaces, everything matching the ``remove_chars`` regex is dropped,
    the piece is optionally lower-cased, and runs of spaces are collapsed.
    Only pieces whose length lies within ``r[0]``..``r[1]`` (inclusive) are
    kept.

    Returns a tuple ``(sentences, n_unique_chars)`` where ``n_unique_chars``
    counts the distinct characters in the kept sentences joined by spaces.
    """
    sentences = []
    for piece in re.split(split_str, data):
        cleaned = re.sub(remove_chars, '', piece.replace('\n', ' '))
        if toLowerCase:
            cleaned = cleaned.lower()
        # Drop the empty tokens left by leading/trailing/repeated spaces.
        cleaned = " ".join(token for token in cleaned.split(' ') if token)
        if r[0] <= len(cleaned) <= r[1]:
            sentences.append(cleaned)
    distinct_chars = set(' '.join(sentences))
    return (sentences, len(distinct_chars))


def sentences_to_phonemes(arpabet, data, print_every, of):
    """Convert a list of sentences into a tuple of phoneme lists.

    Each sentence is split on single spaces and every word is translated with
    ``word_to_phonetic``.

    Args:
        arpabet: Pronunciation dictionary passed through to
            ``word_to_phonetic``.
        data: Iterable of sentences (strings).
        print_every: Print a progress line every this many sentences; a falsy
            value (0 or None) disables progress output.
        of: Total shown in the progress line.

    Returns:
        A tuple with one list per sentence, each holding one phoneme list per
        word. Empty input yields an empty tuple.
    """
    converted = []
    for i, sentence in enumerate(data, 1):
        converted.append([word_to_phonetic(arpabet, word) for word in sentence.split(' ')])
        if print_every and i % print_every == 0:
            print("Line:", i, "of", of)
    return tuple(converted)


def phonemes_to_sentences(arpabet, data, print_every, of):
    """Convert phoneme lists back into a tuple of sentences.

    Every word's phoneme list is wrapped in a one-element list and translated
    with ``phonetic_to_word``; the words of a sentence are joined with single
    spaces.

    Args:
        arpabet: Pronunciation dictionary passed through to
            ``phonetic_to_word``.
        data: Iterable of sentences, each an iterable of per-word phoneme
            lists.
        print_every: Print a progress line every this many sentences; a falsy
            value (0 or None) disables progress output.
        of: Total shown in the progress line.

    Returns:
        A tuple with one sentence string per input item. Empty input yields an
        empty tuple.
    """
    sentences = []
    for i, word_phonemes in enumerate(data, 1):
        sentences.append(' '.join(phonetic_to_word(arpabet, [p]) for p in word_phonemes))
        if print_every and i % print_every == 0:
            print("Line:", i, "of", of)
    return tuple(sentences)


def to_json(jsonOutput_name, data):
    """Write phoneme data to a JSON file.

    ``data`` is a sequence of sentences, each a sequence of words, each a
    sequence of phonemes. It is stored as nested objects keyed
    ``"Sentence N"`` -> ``"Word N"`` -> ``"Phoneme N"`` (all 1-based), indented
    by 4 spaces.

    Args:
        jsonOutput_name: Path of the file to write (overwritten if present).
        data: The nested phoneme data described above.
    """
    data_as_dict = {
        "Sentence " + str(s + 1): {
            "Word " + str(w + 1): {
                "Phoneme " + str(p + 1): phoneme
                for p, phoneme in enumerate(word)
            }
            for w, word in enumerate(sentence)
        }
        for s, sentence in enumerate(data)
    }
    # The context manager closes the file even if serialization fails.
    with open(jsonOutput_name, "w", encoding="utf-8") as json_output:
        json_output.write(json.dumps(data_as_dict, indent=4))

    
def from_json(jsonInput_file):
    """Read phoneme data back from a JSON file of three-level nested objects.

    The keys are discarded and only the values are kept, in file order:
    top-level values become sentences, their values become words, and the
    innermost values become phonemes.

    Args:
        jsonInput_file: Path of the JSON file to read.

    Returns:
        A tuple with one list per sentence, each holding one phoneme list per
        word.
    """
    # The context manager closes the file instead of leaking the handle.
    with open(jsonInput_file, encoding="utf-8") as json_input:
        nested = json.load(json_input)
    return tuple(
        [list(word.values()) for word in sentence.values()]
        for sentence in nested.values()
    )

Automatic processing 

In [None]:
# Batch driver: for every *.txt book in `book_path`, write the cleaned
# sentences to `book_processed_path`, convert them to phonemes, export the
# phonemes as JSON to `json_path`, and verify the JSON round trip.
book_path = 'books'
book_processed_path = 'books-processed'
json_path = 'jsons'

# The output directories may not exist yet; create them instead of failing
# on the first write.
os.makedirs(book_processed_path, exist_ok=True)
os.makedirs(json_path, exist_ok=True)

files = [os.path.join(book_path, f) for f in os.listdir(book_path) if f.endswith('.txt')]
total_books = len(files)

# Raw strings keep the regex escapes intact (in a normal string literal
# '\.', '\!' and '\?' are invalid escape sequences).
# Sentences end at '.', '!', '?' or at a newline followed by a capital letter.
sentence_split = r'\.|\!|\?|\n(?=[A-Z])'
non_letters = r'[^a-zA-Z ]+'

for i, f in enumerate(files):
    # splitext removes only the final extension, so "vol.1.txt" and
    # "vol.2.txt" no longer collapse onto the same output name.
    book_name = os.path.splitext(os.path.basename(f))[0]
    print("============ Processing book "+str(i+1)+" of "+str(total_books)+": "+str(book_name)+" ============")

    with open(f, encoding="utf8") as book:
        text = book.read()
    data, unique_chars = text_to_sentences(text, sentence_split, non_letters, (10, 200), toLowerCase = True)
    print("Number of sentences in book: "+str(len(data)))

    processed_file = os.path.join(book_processed_path, book_name+".txt")
    with open(processed_file, "w", encoding="utf8") as book_processed:
        book_processed.write("\n".join(data))
    print("List of sentences saved in "+processed_file)

    p = sentences_to_phonemes(arpabet, data, 100, len(data))

    json_file = os.path.join(json_path, book_name+".json")
    to_json(json_file, p)
    # NOTE(review): pause kept from the original; presumably meant to let the
    # file settle before it is read back. Confirm whether it is still needed.
    time.sleep(5)
    p_from_file = from_json(json_file)
    print("Export/import path matches original values: ", p_from_file == p)

print("Done!")