Run this cell if nltk is not installed in the kernel's environment

In [1]:
import sys
# IPython shell escape: install nltk into the prefix of the interpreter running
# this kernel ({sys.prefix} is interpolated by IPython before the command runs).
!conda install --yes --prefix {sys.prefix} nltk

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



Import Statements

In [2]:
import nltk
import json
import re
from difflib import SequenceMatcher
import numpy as np

Arpabet Setup

In [3]:
# Approach taken from:
# https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
# Bind arpabet to the nltk cmudict corpus as a dict. If the corpus is unavailable
# (nltk raises LookupError), download it and then load it.
try:
    arpabet = nltk.corpus.cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    arpabet = nltk.corpus.cmudict.dict()

Define Functions

In [4]:
def word_to_phonetic(arpabet, word):
    """Translate a word into a single phonetic transcription.

    The lookup is case-insensitive (the word is lowercased first). If the word
    is not a key of ``arpabet``, the entry of the most similar key is used
    instead; similarity is ``SequenceMatcher(None, word, key).ratio()`` and the
    first key wins ties.

    Args:
        arpabet: Mapping of word -> list of pronunciations, where each
            pronunciation is a list of phoneme strings.
        word: Word to transcribe.

    Returns:
        The first pronunciation stored for the matched entry.

    Raises:
        ValueError: If the word is missing and ``arpabet`` is empty, so there
            is nothing to fall back to.
    """
    word = word.lower()
    try:
        phonetic = arpabet[word]
    except KeyError:
        # Only a missing key triggers the (slow) closest-match scan; any other
        # error is a real problem and must not be silently swallowed.
        if not arpabet:
            raise ValueError("arpabet is empty; cannot find a closest match") from None
        closest = max(arpabet, key=lambda key: SequenceMatcher(None, word, key).ratio())
        phonetic = arpabet[closest]
    # Entries hold every known pronunciation; keep only the first one.
    if isinstance(phonetic, list):
        phonetic = phonetic[0]
    return phonetic


def phonetic_to_word(arpabet, phonemes):
    """Translate a phonetic transcription back into a word.

    An entry whose full pronunciation list equals ``phonemes`` is returned if
    one exists (first such entry in dictionary order). Otherwise the word whose
    first pronunciation is most similar to ``phonemes[0]`` is returned;
    similarity is ``SequenceMatcher(None, phonemes[0], value[0]).ratio()`` and
    the first entry wins ties.

    Args:
        arpabet: Mapping of word -> list of pronunciations, where each
            pronunciation is a list of phoneme strings.
        phonemes: A list containing the pronunciation(s) to look up, e.g.
            ``[['B', 'IH1', 'G', 'AH0', 'S', 'T']]``.

    Returns:
        The matching (or closest-matching) word.

    Raises:
        IndexError: If no exact match exists and ``phonemes`` is empty.
        ValueError: If no exact match exists and ``arpabet`` is empty.
    """
    # Exact match: scan entries directly instead of materialising the key and
    # value lists on every call.
    for word, pronunciations in arpabet.items():
        if pronunciations == phonemes:
            return word

    # No exact match: fall back to the closest first pronunciation.
    target = phonemes[0]
    if not arpabet:
        raise ValueError("arpabet is empty; cannot find a closest match")
    return max(
        arpabet,
        key=lambda word: SequenceMatcher(None, target, arpabet[word][0]).ratio(),
    )


def text_to_sentences(data, split_str, remove_chars, r, toLowerCase = True):
    """Split raw text into cleaned sentences.

    Args:
        data: The raw text.
        split_str: Regex on which the text is split into sentences.
        remove_chars: Regex of character runs to delete from each sentence.
        r: Pair ``(min_len, max_len)``; only sentences whose cleaned length
            lies in this inclusive range are kept.
        toLowerCase: When true, sentences are lowercased.

    Returns:
        A tuple ``(sentences, n_unique)`` where ``n_unique`` is the number of
        distinct characters in the kept sentences joined by single spaces.
    """
    sentences = []
    for chunk in re.split(split_str, data):
        # Newlines become spaces before the unwanted characters are stripped.
        cleaned = re.sub(remove_chars, '', chunk.replace('\n', ' '))
        if toLowerCase:
            cleaned = cleaned.lower()
        # Drop empty tokens so runs of spaces collapse and the ends are trimmed.
        cleaned = " ".join(token for token in cleaned.split(' ') if token)
        if r[0] <= len(cleaned) <= r[1]:
            sentences.append(cleaned)
    alphabet = set(' '.join(sentences))
    return (sentences, len(alphabet))


def sentences_to_phonemes(arpabet, data, print_every, of):
    """Convert sentences into per-word phonetic transcriptions.

    Args:
        arpabet: Pronunciation dictionary passed through to
            ``word_to_phonetic``.
        data: Iterable of sentences; each is split into words on single spaces.
        print_every: Print a progress line after every this-many sentences.
            A falsy value (0 or None) disables progress output.
        of: Total shown in the progress line.

    Returns:
        A tuple with one entry per sentence; each entry is a list holding one
        phoneme list per word. An empty ``data`` yields an empty tuple.
    """
    transcribed = []
    for i, sentence in enumerate(data, 1):
        transcribed.append([word_to_phonetic(arpabet, word) for word in sentence.split(' ')])
        if print_every and i % print_every == 0:
            print("Line:", i, "of", of)
    return tuple(transcribed)


def phonemes_to_sentences(arpabet, data, print_every, of):
    """Convert per-word phonetic transcriptions back into sentences.

    Args:
        arpabet: Pronunciation dictionary passed through to
            ``phonetic_to_word``.
        data: Iterable of sentences, each a list of per-word phoneme lists.
        print_every: Print a progress line after every this-many sentences.
            A falsy value (0 or None) disables progress output.
        of: Total shown in the progress line.

    Returns:
        A tuple of sentences (words joined by single spaces). An empty ``data``
        yields an empty tuple.
    """
    sentences = []
    for i, sentence in enumerate(data, 1):
        # phonetic_to_word expects a list of pronunciations, hence the [p] wrapper.
        sentences.append(' '.join([phonetic_to_word(arpabet, [p]) for p in sentence]))
        if print_every and i % print_every == 0:
            print("Line:", i, "of", of)
    return tuple(sentences)


def to_json(jsonOutput_name, data):
    """Write phonetic transcriptions to a JSON file.

    The output nests ``"Sentence i"`` -> ``"Word j"`` -> ``"Phoneme k"``, with
    all three counters starting at 1, and is indented by 4 spaces.

    Args:
        jsonOutput_name: Path of the file to (over)write.
        data: Iterable of sentences, each an iterable of words, each an
            iterable of phonemes.
    """
    data_as_dict = {
        "Sentence " + str(s + 1): {
            "Word " + str(w + 1): {
                "Phoneme " + str(ph + 1): phoneme
                for (ph, phoneme) in enumerate(word)
            }
            for (w, word) in enumerate(sentence)
        }
        for (s, sentence) in enumerate(data)
    }
    # The with-block closes the file even if serialisation raises.
    with open(jsonOutput_name, "w") as jsonOutput:
        json.dump(data_as_dict, jsonOutput, indent=4)

    
def from_json(jsonInput_file):
    """Read phonetic transcriptions from a JSON file written by ``to_json``.

    Only the values of the nested objects are used, in file order; the
    "Sentence"/"Word"/"Phoneme" keys are discarded.

    Args:
        jsonInput_file: Path of the JSON file to read.

    Returns:
        A tuple with one entry per sentence; each entry is a list holding one
        phoneme list per word.
    """
    # The with-block closes the file; json.load(open(...)) left it open.
    with open(jsonInput_file) as jsonInput:
        data = json.load(jsonInput)
    return tuple(
        [list(word.values()) for word in sentence.values()]
        for sentence in data.values()
    )

English/Phonetics Translation Example usage

In [5]:
# Word -> phonetics. The first word is in the dictionary; the second is NOT,
# so the phonetics of its closest dictionary match are returned.
for w in ("biggest", "bigests"):
    p = word_to_phonetic(arpabet, w)
    print(w, "=>",p)
# Phonetics -> word. The first transcription is in the dictionary; the second
# is NOT, so the word with the closest transcription is returned.
for p in ([['B', 'IH1', 'G', 'AH0', 'S', 'T']],
          [['B', 'IH1', 'G', 'AH0', 'S', 'G']]):
    w = phonetic_to_word(arpabet, p)
    print(p, "=>",w)

biggest => ['B', 'IH1', 'G', 'AH0', 'S', 'T']
bigests => ['B', 'IH1', 'G', 'AH0', 'S', 'T']
[['B', 'IH1', 'G', 'AH0', 'S', 'T']] => biggest
[['B', 'IH1', 'G', 'AH0', 'S', 'G']] => biggest


\*.txt processing Example usage

In [6]:
# Process *.txt file into list of sentences, and obtain number of unique characters
# text_to_sentences(str, regex sentence split chars, regex of chars to remove, acceptable sentence length range, toLower)
# Here, sentence splits specified as .?! and \n with uppercase following, keep only letters and spaces, remove any
# sentences of fewer than 10 or more than 100 characters (including spaces), and convert the result to all lowercase.
# The with-block closes the file even if processing raises; the split pattern is a raw string so that
# \. \! \? reach the regex engine without relying on invalid string escape sequences.
with open("bible.txt") as file:
    data, unique_chars = \
        text_to_sentences(file.read(), r'\.|\!|\?|\n(?=[A-Z])', r'[^a-zA-Z ]+', (10, 100), toLowerCase = True)
# For illustration purposes, only use the first 5 sentences from here on
data = data[0:5]
r = 4  # index of the example sentence (fixed so the output is reproducible)
# Print some stuff
print("Number of sentences in book:", len(data))
print("Number of unique characters:", unique_chars)
print("Example sentence:", data[r])
# Get phonetic transcriptions of sentences
print("----- Transcribing sentences to phonemes -----")
p = sentences_to_phonemes(arpabet, data, 1, len(data))
print("Example phonetic transcription:", p[r])
# Get English equivalents of phonetic transcription
print("----- Transcribing phonemes to sentences -----")
s = phonemes_to_sentences(arpabet, p, 1, len(data))
print("Example English transcription:", s[r])

# Output Phonetic transcriptions p to 'example.json'
to_json("example.json", p)
# Import Phonetic transcriptions from 'example.json'
f = from_json('example.json')
# Compare f and p to confirm to_json/from_json works as intended
print("Export/import path matches original values: ", f == p)

Number of sentences in book: 5
Number of unique characters: 27
Example sentence: the old testament of the king james version of the bible
----- Transcribing sentences to phonemes -----
Line: 1 of 5
Line: 2 of 5
Line: 3 of 5
Line: 4 of 5
Line: 5 of 5
Example phonetic transcription: [['DH', 'AH0'], ['OW1', 'L', 'D'], ['T', 'EH1', 'S', 'T', 'AH0', 'M', 'AH0', 'N', 'T'], ['AH1', 'V'], ['DH', 'AH0'], ['K', 'IH1', 'NG'], ['JH', 'EY1', 'M', 'Z'], ['V', 'ER1', 'ZH', 'AH0', 'N'], ['AH1', 'V'], ['DH', 'AH0'], ['B', 'AY1', 'B', 'AH0', 'L']]
----- Transcribing phonemes to sentences -----
Line: 1 of 5
Line: 2 of 5
Line: 3 of 5
Line: 4 of 5
Line: 5 of 5
Example English transcription: the old testament of the king james version of the bible
Export/import path matches original values:  True


Now, process the entire bible.txt

In [8]:
# Read and split the whole book. The with-block closes the file even if processing raises,
# and the split pattern is a raw string so \. \! \? are not invalid string escape sequences.
with open("bible.txt") as file:
    data, unique_chars = \
        text_to_sentences(file.read(), r'\.|\!|\?|\n(?=[A-Z])', r'[^a-zA-Z ]+', (10, 100), toLowerCase = True)
print("Number of sentences in book:", len(data))
print("Number of unique characters:", unique_chars)
print("----- Transcribing sentences to phonemes -----")
# Report progress every 100 sentences
p = sentences_to_phonemes(arpabet, data, 100, len(data))
to_json("bible.json", p)

Number of sentences in book: 17326
Number of unique characters: 27
----- Transcribing sentences to phonemes -----
Line: 100 of 17326
Line: 200 of 17326
Line: 300 of 17326
Line: 400 of 17326
Line: 500 of 17326
Line: 600 of 17326
Line: 700 of 17326
Line: 800 of 17326
Line: 900 of 17326
Line: 1000 of 17326
Line: 1100 of 17326
Line: 1200 of 17326
Line: 1300 of 17326
Line: 1400 of 17326
Line: 1500 of 17326
Line: 1600 of 17326
Line: 1700 of 17326
Line: 1800 of 17326
Line: 1900 of 17326
Line: 2000 of 17326
Line: 2100 of 17326
Line: 2200 of 17326
Line: 2300 of 17326
Line: 2400 of 17326
Line: 2500 of 17326
Line: 2600 of 17326
Line: 2700 of 17326
Line: 2800 of 17326
Line: 2900 of 17326
Line: 3000 of 17326
Line: 3100 of 17326
Line: 3200 of 17326
Line: 3300 of 17326
Line: 3400 of 17326
Line: 3500 of 17326
Line: 3600 of 17326
Line: 3700 of 17326
Line: 3800 of 17326
Line: 3900 of 17326
Line: 4000 of 17326
Line: 4100 of 17326
Line: 4200 of 17326
Line: 4300 of 17326
Line: 4400 of 17326
Line: 4500 of 1

In [9]:
# Re-import the transcriptions from 'bible.json' and compare them with the
# in-memory value p to confirm the export/import round trip.
f = from_json('bible.json')
print("Export/import path matches original values: ", f == p)

Export/import path matches original values:  True
