In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter
from string import digits
import re

In [30]:
# Load the English pronouncing dictionary.
# Each line of the file becomes one list of space-separated tokens.
english = '../data/cmudict/cmudict.dict'
data = []
with open(english, 'r') as infile:
    data.extend(raw_line.rstrip('\n').split(' ') for raw_line in infile)
data[:5]

[["'bout", 'B', 'AW1', 'T'],
 ["'cause", 'K', 'AH0', 'Z'],
 ["'course", 'K', 'AO1', 'R', 'S'],
 ["'cuse", 'K', 'Y', 'UW1', 'Z'],
 ["'em", 'AH0', 'M']]

In [7]:
# Read in data
# Builds `data` as one token list per dictionary line: [word, phoneme, ...].
# Entries for alternate pronunciations and foreign words are dropped, stress
# digits are stripped from the phonemes, and non-letter characters in the
# word are replaced with '?'.
english = '../data/cmudict/cmudict.dict'
data = []

with open(english) as f:
    lines = f.readlines()

for line in lines:
    pairs = line.strip('\n').split(' ')
    word, phonemes = pairs[0], pairs[1:]
    if re.search(r'(\d)', word) or any('#' in pair for pair in pairs):
        # skipping any alternate pronunciations, which are denoted by (2) or (3)
        # similarly, skipping any words of foreign descent, denoted by '#'
        # NOTE: the decision is made once per line; the earlier per-token
        # `continue` never actually skipped an entry.
        continue
    # Build new values instead of rebinding a loop variable, which had no
    # effect on the list that was appended.
    word = re.sub(r'[^A-Za-z\s]', '?', word)
    phonemes = [re.sub(r'\d', '', phoneme) for phoneme in phonemes]
    data.append([word] + phonemes)

In [None]:
# Read in data
# Builds `data` as one token list per kept dictionary line:
# [word, phoneme, phoneme, ...]. The later cells in this notebook expect this
# shape: they insert 'start' at index 1, append 'stop', and slice phoneme by
# phoneme.
english = '../data/cmudict/cmudict.dict'
data = []

with open(english) as f:
    lines = f.readlines()

for line in lines:
    # Split the head word from the rest of the line at the first space.
    word, _, pronunciation = line.strip('\n').partition(' ')
    if not pronunciation:
        # Blank or malformed line with no pronunciation; indexing the second
        # field here used to raise IndexError.
        continue
    if re.search(r'(\d)', word) or '#' in pronunciation:
        # skipping any alternate pronunciations, which are denoted by (2) or (3)
        # similarly, skipping any words of foreign descent, denoted by '#'
        continue
    # remove stress digits from the phonemes and replace any non-letter
    # character in the word with '?'
    phonemes = re.sub(r'\d', '', pronunciation).split()
    word = re.sub(r'[^A-Za-z\s]', '?', word)
    data.append([word, *phonemes])

In [31]:
# There were words included that weren't in English, so removing those and storing elsewhere for now
# An entry is set aside when it has a standalone '#' token or when any of its
# tokens contains '('. Each entry is recorded at most once, even if several
# tokens contain '(', so len(not_english) counts entries rather than matches.
not_english = []
for item in data:
    if '#' in item or any('(' in word for word in item):
        not_english.append(item)

In [34]:
# Keep every entry of `data` that was not set aside in `not_english`.
# Membership is tested against a set of tuples, which gives the same equality
# result as scanning the list of lists without re-scanning `not_english` for
# every entry. The kept items are the same list objects as in `data`.
excluded = {tuple(item) for item in not_english}
data_clean = [item for item in data if tuple(item) not in excluded]

In [35]:
# Count check: entries in neither list (expected to be 0).
len(data) - (len(not_english) + len(data_clean))

0

In [36]:
# Adding a stop character to the end of each word and start at the beginning
# New lists are built so the entries still referenced by `data` are not
# mutated. An entry that already carries both markers is left alone, so
# re-running this cell does not add a second 'start'/'stop'.
data_clean = [
    item
    if item[1:2] == ['start'] and item[-1:] == ['stop']
    else [*item[:1], 'start', *item[1:], 'stop']
    for item in data_clean
]

In [37]:
# Preview the first five entries of data_clean.
data_clean[:5]

[["'bout", 'start', 'B', 'AW1', 'T', 'stop'],
 ["'cause", 'start', 'K', 'AH0', 'Z', 'stop'],
 ["'course", 'start', 'K', 'AO1', 'R', 'S', 'stop'],
 ["'cuse", 'start', 'K', 'Y', 'UW1', 'Z', 'stop'],
 ["'em", 'start', 'AH0', 'M', 'stop']]

In [38]:
# Expand every entry into its growing prefixes: the first prefix holds the
# word plus one following token, and each later prefix adds one more token
# until the whole entry is included.
extended_data = [
    entry[:end]
    for entry in data_clean
    for end in range(2, len(entry) + 1)
]

In [39]:
# Preview the first seven prefix rows of extended_data.
extended_data[:7]

[["'bout", 'start'],
 ["'bout", 'start', 'B'],
 ["'bout", 'start', 'B', 'AW1'],
 ["'bout", 'start', 'B', 'AW1', 'T'],
 ["'bout", 'start', 'B', 'AW1', 'T', 'stop'],
 ["'cause", 'start'],
 ["'cause", 'start', 'K']]

In [40]:
# Write one space-joined prefix row per line to the processed dictionary file.
with open("../data/model_ready/dict/processed_english.dict", "w") as txt_file:
    txt_file.writelines(" ".join(row) + "\n" for row in extended_data)

In [41]:
# Reshape each prefix row into [word, tokens-so-far, next-token]:
# the first element is the word, the last element is the label, and
# everything in between is joined into a single space-separated string.
pre_process = [
    [row[0], ' '.join(row[1:-1]), row[-1]]
    for row in extended_data
]

In [42]:
# Preview the first five [word, phonemes, label] rows.
pre_process[:5]

[["'bout", '', 'start'],
 ["'bout", 'start', 'B'],
 ["'bout", 'start B', 'AW1'],
 ["'bout", 'start B AW1', 'T'],
 ["'bout", 'start B AW1 T', 'stop']]

In [43]:
# Turn the [word, phonemes, label] rows into a three-column DataFrame.
eng_df = pd.DataFrame(
    data=pre_process,
    columns=['word', 'phonemes', 'label'],
)

In [44]:
# Drop the rows whose phoneme history is empty.
# .copy() makes the filtered frame independent of the original, so later
# column assignments on eng_df do not trigger SettingWithCopyWarning.
eng_df = eng_df[eng_df['phonemes'] != ''].copy()
eng_df.head()

Unnamed: 0,word,phonemes,label
1,'bout,start,B
2,'bout,start B,AW1
3,'bout,start B AW1,T
4,'bout,start B AW1 T,stop
6,'cause,start,K


In [45]:
# Translation table that deletes every ASCII digit, applied to both phoneme
# columns (e.g. 'AW1' becomes 'AW').
remove_digits = dict.fromkeys(map(ord, digits))
for column in ('phonemes', 'label'):
    eng_df[column] = eng_df[column].str.translate(remove_digits)
eng_df.head()

Unnamed: 0,word,phonemes,label
1,'bout,start,B
2,'bout,start B,AW
3,'bout,start B AW,T
4,'bout,start B AW T,stop
6,'cause,start,K


In [46]:
# Find the longest label. The baseline stays (2, 'B') unless some label is
# strictly longer than two characters; on ties the first such label is kept.
max_L, ix = 2, 'B'
longest = max(eng_df['label'], key=len, default=ix)
if len(longest) > max_L:
    max_L, ix = len(longest), longest
max_L

4

In [57]:
# Character length of the first row's phoneme history. The empty `.iloc[]`
# subscript was a SyntaxError; position 0 holds 'start', which has length 5.
len(eng_df['phonemes'].iloc[0])

5

In [48]:
# Save the processed frame as CSV, leaving out the index column.
eng_df.to_csv(
    '../data/model_ready/dict/processed_english.csv',
    index=False,
)