In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter

In [2]:
# Read in English dataset
english = '../datasets/cmudict.dict'
data = []
with open(english, 'r') as infile:
        for line in infile:
                    data.append((line.rstrip('\n').split(' ')))
data[:5]

[["'bout", 'B', 'AW1', 'T'],
 ["'cause", 'K', 'AH0', 'Z'],
 ["'course", 'K', 'AO1', 'R', 'S'],
 ["'cuse", 'K', 'Y', 'UW1', 'Z'],
 ["'em", 'AH0', 'M']]

In [3]:
# There were words included that weren't in English, so removing those and storing elsewhere for now
not_english = []
for item in data:
    if '#' in item:
        not_english.append('item')
        data.remove(item)

In [4]:
# Adding a stop character to the end of each word and start at the beginning
for item in data:
    item.append('stop')
for item in data:
    item.insert(1, 'start')

In [8]:
data[:5]

[["'bout", 'start', 'B', 'AW1', 'T', 'stop'],
 ["'cause", 'start', 'K', 'AH0', 'Z', 'stop'],
 ["'course", 'start', 'K', 'AO1', 'R', 'S', 'stop'],
 ["'cuse", 'start', 'K', 'Y', 'UW1', 'Z', 'stop'],
 ["'em", 'start', 'AH0', 'M', 'stop']]

In [9]:
# Extending the data so each word incrementally gains one of its phonemes
extended_data = []
for item in data:
        for i in range(len(item)-1):
            extended_data.append(item[:i+2])

In [10]:
extended_data

[["'bout", 'start'],
 ["'bout", 'start', 'B'],
 ["'bout", 'start', 'B', 'AW1'],
 ["'bout", 'start', 'B', 'AW1', 'T'],
 ["'bout", 'start', 'B', 'AW1', 'T', 'stop'],
 ["'cause", 'start'],
 ["'cause", 'start', 'K'],
 ["'cause", 'start', 'K', 'AH0'],
 ["'cause", 'start', 'K', 'AH0', 'Z'],
 ["'cause", 'start', 'K', 'AH0', 'Z', 'stop'],
 ["'course", 'start'],
 ["'course", 'start', 'K'],
 ["'course", 'start', 'K', 'AO1'],
 ["'course", 'start', 'K', 'AO1', 'R'],
 ["'course", 'start', 'K', 'AO1', 'R', 'S'],
 ["'course", 'start', 'K', 'AO1', 'R', 'S', 'stop'],
 ["'cuse", 'start'],
 ["'cuse", 'start', 'K'],
 ["'cuse", 'start', 'K', 'Y'],
 ["'cuse", 'start', 'K', 'Y', 'UW1'],
 ["'cuse", 'start', 'K', 'Y', 'UW1', 'Z'],
 ["'cuse", 'start', 'K', 'Y', 'UW1', 'Z', 'stop'],
 ["'em", 'start'],
 ["'em", 'start', 'AH0'],
 ["'em", 'start', 'AH0', 'M'],
 ["'em", 'start', 'AH0', 'M', 'stop'],
 ["'frisco", 'start'],
 ["'frisco", 'start', 'F'],
 ["'frisco", 'start', 'F', 'R'],
 ["'frisco", 'start', 'F', 'R', 'I

In [54]:
with open("../datasets/processed_english.dict", "w") as txt_file:
    for line in extended_data:
        txt_file.write(" ".join(line) + "\n")

In [16]:
pre_process = []
for item in extended_data:
    pre_process.append([item[0], ' '.join(item[1:-1]), item[-1]])

In [17]:
pre_process[:5]

[["'bout", '', 'start'],
 ["'bout", 'start', 'B'],
 ["'bout", 'start B', 'AW1'],
 ["'bout", 'start B AW1', 'T'],
 ["'bout", 'start B AW1 T', 'stop']]

In [22]:
eng_df = pd.DataFrame(pre_process, columns = ['word', 'phonemes', 'label'])

In [24]:
eng_df.to_csv('../datasets/processed/processed_english.csv')