In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from operator import itemgetter

In [113]:
# Configuration: point `path` at the datasets directory and set `language`
# in lowercase (e.g. spanish); every later cell derives its file names from these.
path = '../../datasets/'
language = 'spanish'
# Input file is expected at <path><language>_words2phones.csv
file_name = f"{path}{language}_words2phones.csv"
file_name

'../datasets/spanish_words2phones.csv'

In [114]:
# Load the word -> phoneme table from the configured CSV and preview the first rows.
# NOTE(review): read_csv's default NA parsing turns fields such as 'nan', 'null'
# or empty strings into NaN; that may be where the non-string row found later
# in this notebook comes from -- confirm against the raw file.
data_table = pd.read_csv(filepath_or_buffer=file_name)
data_table.head(n=5)

Unnamed: 0,word,ipa,ARPAbet
0,aaleniana,a l e n j a n a,AE L EH N Y AE N AE
1,aalenianas,a l e n j a n a s,AE L EH N Y AE N AE S
2,aaleniano,a l e n j a n o,AE L EH N Y AE N OH
3,aalenianos,a l e n j a n a s,AE L EH N Y AE N AE S
4,ababa,a b a b a,AE B AE B AE


In [115]:
# Join every word with its ARPAbet transcription into one space-separated
# string, then lay the result out as a single-column (n, 1) numpy array.
# Rows where either column is missing stay NaN instead of becoming a string.
word_with_phones = data_table['word'] + ' ' + data_table['ARPAbet']
data_arr = word_with_phones.to_numpy().reshape(-1, 1)

In [116]:
# Display the (n, 1) array of combined "word phonemes" strings; numpy
# abbreviates the repr of large arrays, so only the first and last rows show.
data_arr

array([['aaleniana AE L EH N Y AE N AE'],
       ['aalenianas AE L EH N Y AE N AE S'],
       ['aaleniano AE L EH N Y AE N OH'],
       ...,
       ['útero UW T EH DX OH'],
       ['útica UW T IH K AE'],
       ['ü W']], dtype=object)

In [117]:
# Split each combined string into tokens ([word, phoneme, phoneme, ...]) and
# keep them in plain Python lists, which are easier to work with than arrays.
# Anything that is not a str (e.g. NaN from a missing value) is set aside in
# non_strings so it can be inspected afterwards.
data_list, non_strings = [], []
for (entry,) in data_arr:
    if type(entry) is str:
        data_list.append(entry.split(' '))
    else:
        non_strings.append(entry)
data_list[:5]

[['aaleniana', 'AE', 'L', 'EH', 'N', 'Y', 'AE', 'N', 'AE'],
 ['aalenianas', 'AE', 'L', 'EH', 'N', 'Y', 'AE', 'N', 'AE', 'S'],
 ['aaleniano', 'AE', 'L', 'EH', 'N', 'Y', 'AE', 'N', 'OH'],
 ['aalenianos', 'AE', 'L', 'EH', 'N', 'Y', 'AE', 'N', 'AE', 'S'],
 ['ababa', 'AE', 'B', 'AE', 'B', 'AE']]

In [118]:
# Check what kind of info was excluded: non_strings holds the combined
# "word ARPAbet" values that were skipped above because they were not str.
non_strings[:5]

[nan]

In [119]:
# Wrap each phoneme sequence in boundary tokens: 'start' goes right after the
# word (index 1) and 'stop' goes at the very end of the entry.
# data_list is mutated in place, so the guards below make this cell safe to
# re-run: without them a second execution would add a duplicate 'start'/'stop'
# pair to every entry and silently corrupt everything derived from it.
for item in data_list:
    if len(item) < 2 or item[1] != 'start':
        item.insert(1, 'start')
    if item[-1] != 'stop':
        item.append('stop')
data_list[:5]

[['aaleniana', 'start', 'AE', 'L', 'EH', 'N', 'Y', 'AE', 'N', 'AE', 'stop'],
 ['aalenianas',
  'start',
  'AE',
  'L',
  'EH',
  'N',
  'Y',
  'AE',
  'N',
  'AE',
  'S',
  'stop'],
 ['aaleniano', 'start', 'AE', 'L', 'EH', 'N', 'Y', 'AE', 'N', 'OH', 'stop'],
 ['aalenianos',
  'start',
  'AE',
  'L',
  'EH',
  'N',
  'Y',
  'AE',
  'N',
  'AE',
  'S',
  'stop'],
 ['ababa', 'start', 'AE', 'B', 'AE', 'B', 'AE', 'stop']]

In [120]:
# Expand every entry into its incremental prefixes: the first prefix is
# [word, 'start'], and each following one reveals one more token until the
# full entry (ending in 'stop') is reached.
extended_data = [
    entry[:end]
    for entry in data_list
    for end in range(2, len(entry) + 1)
]
extended_data[:7]

[['aaleniana', 'start'],
 ['aaleniana', 'start', 'AE'],
 ['aaleniana', 'start', 'AE', 'L'],
 ['aaleniana', 'start', 'AE', 'L', 'EH'],
 ['aaleniana', 'start', 'AE', 'L', 'EH', 'N'],
 ['aaleniana', 'start', 'AE', 'L', 'EH', 'N', 'Y'],
 ['aaleniana', 'start', 'AE', 'L', 'EH', 'N', 'Y', 'AE']]

In [121]:
# Write to dict file: one incremental prefix per line, tokens separated by
# single spaces.
# The encoding is explicit because the words contain non-ASCII characters
# (e.g. 'ú', 'ü'). Without it open() falls back to the platform's locale
# encoding, which can raise UnicodeEncodeError or produce a file that does
# not match the UTF-8 CSV written by pandas later in this notebook.
out_file = path + 'processed/processed_' + language + '.dict'
with open(out_file, "w", encoding="utf-8") as txt_file:
    for line in extended_data:
        txt_file.write(" ".join(line) + "\n")

In [122]:
# Turn each prefix into a training row [word, phonemes seen so far, next token]:
# the last token of the prefix is the label, and everything between the word
# and the label is joined into one space-separated history string (which is
# empty for the very first prefix of a word).
pre_process = [
    [prefix[0], ' '.join(prefix[1:-1]), prefix[-1]]
    for prefix in extended_data
]
pre_process[:5]

[['aaleniana', '', 'start'],
 ['aaleniana', 'start', 'AE'],
 ['aaleniana', 'start AE', 'L'],
 ['aaleniana', 'start AE L', 'EH'],
 ['aaleniana', 'start AE L EH', 'N']]

In [123]:
# Assemble the [word, history, next-token] rows into a labelled DataFrame.
column_names = ['word', 'phonemes', 'label']
df = pd.DataFrame(data=pre_process, columns=column_names)

In [124]:
# Save the processed table next to the .dict file, without the index column.
# Note: the first row of every word has an empty 'phonemes' string, which
# pandas reads back as NaN by default when this CSV is loaded again.
out_file = f"{path}processed/processed_{language}.csv"
df.to_csv(path_or_buf=out_file, index=False)