In [1]:
import re
import numpy as np
import pandas as pd
from jamotools import Vectorizationer, rules
from unicodedata import normalize

In [2]:
# specify additional null value "NONE"

In [3]:
data = pd.read_csv("../web-scraping/pronunciation_all.csv", sep="\t", na_values="NONE")
data.head()

Unnamed: 0,entry_id,word_id,spelling,pronunciation
0,13943,역사학,역사학,역싸학
1,13943,역사학,역사학이,역싸하기
2,13943,역사학,역사학도,역싸학또
3,13943,역사학,역사학만,역싸항만
4,13955,시내버스,시내버스,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87771 entries, 0 to 87770
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   entry_id       87771 non-null  int64 
 1   word_id        87771 non-null  object
 2   spelling       87771 non-null  object
 3   pronunciation  81080 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.7+ MB


In [5]:
data[data.pronunciation.isnull()].head(15)

Unnamed: 0,entry_id,word_id,spelling,pronunciation
4,13955,시내버스,시내버스,
15,13962,시디,시디,
16,13963,시디롬,시디롬,
17,13964,시멘트,시멘트,
19,13966,시스템,시스템,
22,13969,시외버스,시외버스,
37,13981,시즌,시즌,
331,14178,과실 치사,과실 치사,
462,14284,앨범,앨범,
529,14320,간접 경험,간접 경험,


In [6]:
# drop rows with null values in pronunciation before masking to check for English and Chinese loan words

In [7]:
data_drop_null = data[data.pronunciation.notna()]
len(data_drop_null)

81080

In [8]:
# check for Chinese characters

In [9]:
data_drop_null[data_drop_null.pronunciation.str.contains(u'[\u4e00-\u9fff]', regex=True)]

Unnamed: 0,entry_id,word_id,spelling,pronunciation


In [10]:
data_drop_null[data_drop_null.spelling.str.contains(u'[\u4e00-\u9fff]', regex=True)]

Unnamed: 0,entry_id,word_id,spelling,pronunciation


In [11]:
# check for English lowercase letters 

In [12]:
data_drop_null[data_drop_null.pronunciation.str.contains(u'[\u0061-\u007A]', regex=True)]

Unnamed: 0,entry_id,word_id,spelling,pronunciation


In [13]:
data_drop_null[data_drop_null.spelling.str.contains(u'[\u0061-\u007A]', regex=True)]

Unnamed: 0,entry_id,word_id,spelling,pronunciation


In [14]:
# check for English uppercase letters

In [15]:
data_drop_null[data_drop_null.pronunciation.str.contains(u'[\u0041-\u005A]', regex=True)]

Unnamed: 0,entry_id,word_id,spelling,pronunciation


In [16]:
data_drop_null[data_drop_null.spelling.str.contains(u'[\u0041-\u005A]', regex=True)]

Unnamed: 0,entry_id,word_id,spelling,pronunciation


In [17]:
# check for duplicates

In [18]:
len(data_drop_null.drop_duplicates())

81075

In [19]:
data_drop_null[data_drop_null.duplicated()]

Unnamed: 0,entry_id,word_id,spelling,pronunciation
2425,15639,수입되다,수입되는,수입뙤는/수입뛔는
11201,27284,고착시키다,고착시키는,고착씨키는
11472,27702,곧이듣다,곧이듣는,고지든는
16785,36706,구지레하다,구지레한,구지레한
80742,88358,노닥대다,노닥대는,노닥때는


In [20]:
data_drop_null[data_drop_null.entry_id == 15639]

Unnamed: 0,entry_id,word_id,spelling,pronunciation
2421,15639,수입되다,수입되다,수입뙤다/수입뛔다
2422,15639,수입되다,수입되는,수입뙤는/수입뛔는
2423,15639,수입되다,수입되어,수입뙤어/수입뛔여
2424,15639,수입되다,수입돼,수입뙈
2425,15639,수입되다,수입되는,수입뙤는/수입뛔는
2426,15639,수입되다,수입됩니다,수입뙴니다/수입뛤니다


In [21]:
data_drop_dups = data_drop_null.drop_duplicates()
len(data_drop_dups)

81075

In [22]:
# handle pronunciations containing forward slashes ("/")

In [23]:
forward_slash = data_drop_dups[data_drop_dups.pronunciation.str.contains("/")]
len(forward_slash)

7582

In [24]:
# define function for splitting these rows

In [25]:
def split_row(row, row_accumulator, target_column, separator):
    split_row = row[target_column].split(separator)
    if len(split_row) != 2:
        print(row)
    for s in split_row:
        new_row = row.to_dict()
        new_row[target_column] = s
        row_accumulator.append(new_row)

In [26]:
new_rows = []
forward_slash.apply(split_row, axis=1, args=(new_rows, "pronunciation", "/"))
temp_df = pd.DataFrame(new_rows)
len(temp_df)

entry_id                       26170
word_id                         가족회의
spelling                        가족회의
pronunciation    가조쾨의/가조쾨이/가족퀘의/가족퀘이
Name: 10240, dtype: object
entry_id                     27927
word_id                        가계약
spelling                      가계약이
pronunciation    가ː계야기/가ː게야기/가ː게야기
Name: 11778, dtype: object
entry_id                     27927
word_id                        가계약
spelling                      가계약도
pronunciation    가ː계약또/가ː게약또/가ː게약또
Name: 11779, dtype: object
entry_id                     27927
word_id                        가계약
spelling                      가계약만
pronunciation    가ː계양만/가ː게양만/가ː게양만
Name: 11780, dtype: object


15169

In [27]:
temp_df.head()

Unnamed: 0,entry_id,word_id,spelling,pronunciation
0,13992,식히다,식히어,시키어
1,13992,식히다,식히어,시키여
2,14009,연기되다,연기되다,연기되다
3,14009,연기되다,연기되다,연기뒈다
4,14041,간음죄,간음죄,가ː늠쬐


In [28]:
no_slash = data_drop_dups[~data_drop_dups.pronunciation.str.contains("/")]
print(len(data_drop_dups) - len(forward_slash))
len(no_slash)

73493


73493

In [29]:
# recombine DataFrames and handle any duplicates that resulted from splitting on forward slashes

In [30]:
combined = pd.concat([no_slash, temp_df])
print(len(no_slash) + len(temp_df))
len(combined)

88662


88662

In [31]:
combined.head()

Unnamed: 0,entry_id,word_id,spelling,pronunciation
0,13943,역사학,역사학,역싸학
1,13943,역사학,역사학이,역싸하기
2,13943,역사학,역사학도,역싸학또
3,13943,역사학,역사학만,역싸항만
5,13957,시대적2,시대적,시대적


In [32]:
print(len(combined[combined.duplicated()]))
combined[combined.duplicated()]

7


Unnamed: 0,entry_id,word_id,spelling,pronunciation
1585,23444,개막되다,개막되다,개막뙤다
1589,23472,개방화되다,개방화되다,개방화되다
2428,27927,가계약,가계약이,가ː게야기
2431,27927,가계약,가계약도,가ː게약또
2434,27927,가계약,가계약만,가ː게양만
4192,40799,날갯짓,날갯짓,날개찓
4396,41865,노골화되다,노골화됩니다,노골화됨니다


In [33]:
combined[combined.entry_id == 23444]

Unnamed: 0,entry_id,word_id,spelling,pronunciation
1584,23444,개막되다,개막되다,개막뙤다
1585,23444,개막되다,개막되다,개막뙤다


In [34]:
combined_drop_dups = combined.drop_duplicates()
len(combined_drop_dups)

88655

In [35]:
# replace all instances of the phonetic "length" symbol "ː" with an ordinary colon ":"

In [36]:
len(combined_drop_dups[combined_drop_dups.pronunciation.str.contains("ː")])

23424

In [37]:
# use .loc like this to avoid setting value on a copy

In [38]:
combined_drop_dups.loc[:, ["pronunciation"]] = combined_drop_dups.pronunciation.str.replace("ː", ":")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [39]:
len(combined_drop_dups[combined_drop_dups.pronunciation.str.contains("ː")])

0

In [40]:
# create the Vectorizationer() instance

In [41]:
vec = Vectorizationer(rule=rules.RULE_1, max_length=None)

In [42]:
# test it out

In [43]:
vec.vectorize("가게")

array([ 2, 21,  2, 26], dtype=uint8)

In [44]:
# vectorize the spelling and pronunciation columns

In [45]:
combined_drop_dups["vec_spelling"] = combined_drop_dups.spelling.apply(vec.vectorize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [46]:
combined_drop_dups["vec_pronunciation"] = combined_drop_dups.pronunciation.apply(vec.vectorize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [47]:
combined_drop_dups.head()

Unnamed: 0,entry_id,word_id,spelling,pronunciation,vec_spelling,vec_pronunciation
0,13943,역사학,역사학,역싸학,"[13, 27, 42, 11, 21, 20, 21, 42]","[13, 27, 42, 12, 21, 20, 21, 42]"
1,13943,역사학,역사학이,역싸하기,"[13, 27, 42, 11, 21, 20, 21, 42, 13, 41]","[13, 27, 42, 12, 21, 20, 21, 2, 41]"
2,13943,역사학,역사학도,역싸학또,"[13, 27, 42, 11, 21, 20, 21, 42, 5, 29]","[13, 27, 42, 12, 21, 20, 21, 42, 6, 29]"
3,13943,역사학,역사학만,역싸항만,"[13, 27, 42, 11, 21, 20, 21, 42, 8, 21, 45]","[13, 27, 42, 12, 21, 20, 21, 62, 8, 21, 45]"
5,13957,시대적2,시대적,시대적,"[11, 41, 5, 22, 14, 25, 42]","[11, 41, 5, 22, 14, 25, 42]"


In [48]:
# find the longest string of letters in either the spelling or pronunciation columns

In [49]:
combined_drop_dups.vec_spelling.apply(len).max()

23

In [50]:
combined_drop_dups[combined_drop_dups.vec_spelling.apply(len) == 23]

Unnamed: 0,entry_id,word_id,spelling,pronunciation,vec_spelling,vec_pronunciation
65120,77574,천부당만부당하다,천부당만부당합니다,천부당만부당함니다,"[16, 25, 45, 9, 34, 5, 21, 62, 8, 21, 45, 9, 3...","[16, 25, 45, 9, 34, 5, 21, 62, 8, 21, 45, 9, 3..."
82752,89795,구불텅구불텅하다,구불텅구불텅합니다,구불텅구불텅함니다,"[2, 34, 9, 34, 49, 18, 25, 62, 2, 34, 9, 34, 4...","[2, 34, 9, 34, 49, 18, 25, 62, 2, 34, 9, 34, 4..."
86163,91995,헤벌쭉헤벌쭉하다2,헤벌쭉헤벌쭉합니다,헤벌쭈케벌쭈캄니다,"[20, 26, 9, 25, 49, 15, 34, 42, 20, 26, 9, 25,...","[20, 26, 9, 25, 49, 15, 34, 17, 26, 9, 25, 49,..."


In [51]:
# NOTE use of .loc here

In [52]:
combined_drop_dups.loc[65120].vec_spelling

array([16, 25, 45,  9, 34,  5, 21, 62,  8, 21, 45,  9, 34,  5, 21, 62, 20,
       21, 58,  4, 41,  5, 21], dtype=uint8)

In [53]:
combined_drop_dups.vec_pronunciation.apply(len).max()

23

In [54]:
combined_drop_dups[combined_drop_dups.vec_pronunciation.apply(len) == 23]

Unnamed: 0,entry_id,word_id,spelling,pronunciation,vec_spelling,vec_pronunciation
65120,77574,천부당만부당하다,천부당만부당합니다,천부당만부당함니다,"[16, 25, 45, 9, 34, 5, 21, 62, 8, 21, 45, 9, 3...","[16, 25, 45, 9, 34, 5, 21, 62, 8, 21, 45, 9, 3..."
82752,89795,구불텅구불텅하다,구불텅구불텅합니다,구불텅구불텅함니다,"[2, 34, 9, 34, 49, 18, 25, 62, 2, 34, 9, 34, 4...","[2, 34, 9, 34, 49, 18, 25, 62, 2, 34, 9, 34, 4..."


In [55]:
combined_drop_dups.loc[65120].vec_pronunciation

array([16, 25, 45,  9, 34,  5, 21, 62,  8, 21, 45,  9, 34,  5, 21, 62, 20,
       21, 57,  4, 41,  5, 21], dtype=uint8)

In [56]:
# create a decoder and write a function to unvectorize the spelling and pronunciation to double-check output before padding

In [57]:
decoder = {v: k for k, v in vec.symbol_map.items()}

def unvectorize_norm(vector):
    temp_list = [decoder[num] for num in vector]
    temp_string = "".join(temp_list)
    return normalize("NFC", temp_string)

In [58]:
combined_drop_dups["unvec_spelling"] = combined_drop_dups.vec_spelling.apply(unvectorize_norm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [59]:
combined_drop_dups["unvec_pronunciation"] = combined_drop_dups.vec_pronunciation.apply(unvectorize_norm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [60]:
len(combined_drop_dups[combined_drop_dups.pronunciation != combined_drop_dups.unvec_pronunciation])

21

In [61]:
combined_drop_dups[combined_drop_dups.pronunciation != combined_drop_dups.unvec_pronunciation]

Unnamed: 0,entry_id,word_id,spelling,pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation
51620,68847,ㅏ,ㅏ,ㅏ,[1],[1],<UNK>,<UNK>
51621,68848,ㅐ,ㅐ,ㅐ,[1],[1],<UNK>,<UNK>
51622,68849,ㅑ,ㅑ,ㅑ,[1],[1],<UNK>,<UNK>
51625,68852,ㅒ,ㅒ,ㅒ,[1],[1],<UNK>,<UNK>
51627,68854,ㅓ,ㅓ,ㅓ,[1],[1],<UNK>,<UNK>
51628,68855,ㅔ,ㅔ,ㅔ,[1],[1],<UNK>,<UNK>
51630,68857,ㅖ,ㅖ,ㅖ,[1],[1],<UNK>,<UNK>
51631,68858,ㅘ,ㅘ,ㅘ,[1],[1],<UNK>,<UNK>
51633,68860,ㅙ,ㅙ,ㅙ,[1],[1],<UNK>,<UNK>
51634,68861,ㅚ,ㅚ,ㅚ,[1],[1],<UNK>,<UNK>


In [62]:
# these 21 entries for the vowel and compound vowel entries seem to be the only issue for pronunciation

In [63]:
len(combined_drop_dups[combined_drop_dups.vec_pronunciation.apply(len) == 1])

21

In [64]:
# remove the vowel only entries

In [65]:
combined_clean = combined_drop_dups[~(combined_drop_dups.pronunciation != combined_drop_dups.unvec_pronunciation)]
print(len(combined_drop_dups) - len(combined_drop_dups[combined_drop_dups.vec_pronunciation.apply(len) == 1]))
len(combined_clean)

88634


88634

In [66]:
# check spelling now

In [67]:
len(combined_clean[combined_clean.spelling != combined_clean.unvec_spelling])

64

In [68]:
len(combined_clean[combined_clean.unvec_spelling.str.contains("<UNK>")])

64

In [69]:
# Korean has 14 consonants, plus 5 double consonants, and ㄴ, ㄹ, ㅁ, and ㅇ seemed to have been treated as special cases.
# So four entries each, except for the special cases which only have one entry

In [70]:
(14 + 5 - 4)* 4 + 4

64

In [71]:
# just drop these entries like the vowels

In [72]:
combined_clean[combined_clean.unvec_spelling.str.contains("<UNK>")].head(40)

Unnamed: 0,entry_id,word_id,spelling,pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation
51956,69069,ㄱ,ㄱ,기역,[1],"[2, 41, 13, 27, 42]",<UNK>,기역
51957,69069,ㄱ,ㄱ이,기여기,"[1, 13, 41]","[2, 41, 13, 27, 2, 41]",<UNK>이,기여기
51958,69069,ㄱ,ㄱ도,기역또,"[1, 5, 29]","[2, 41, 13, 27, 42, 6, 29]",<UNK>도,기역또
51959,69069,ㄱ,ㄱ만,기영만,"[1, 8, 21, 45]","[2, 41, 13, 27, 62, 8, 21, 45]",<UNK>만,기영만
51964,69071,ㄲ,ㄲ,쌍기역,[1],"[12, 21, 62, 2, 41, 13, 27, 42]",<UNK>,쌍기역
51965,69071,ㄲ,ㄲ이,쌍기여기,"[1, 13, 41]","[12, 21, 62, 2, 41, 13, 27, 2, 41]",<UNK>이,쌍기여기
51966,69071,ㄲ,ㄲ도,쌍기역또,"[1, 5, 29]","[12, 21, 62, 2, 41, 13, 27, 42, 6, 29]",<UNK>도,쌍기역또
51967,69071,ㄲ,ㄲ만,쌍기역만,"[1, 8, 21, 45]","[12, 21, 62, 2, 41, 13, 27, 42, 8, 21, 45]",<UNK>만,쌍기역만
51972,69073,ㄴ1,ㄴ,니은,[1],"[4, 41, 13, 39, 45]",<UNK>,니은
51973,69075,ㄷ,ㄷ,디귿,[1],"[5, 41, 2, 39, 48]",<UNK>,디귿


In [73]:
combined_clean_final = combined_clean[~(combined_clean.spelling != combined_clean.unvec_spelling)]
print(len(combined_clean) - len(combined_clean[combined_clean.unvec_spelling.str.contains("<UNK>")]))
len(combined_clean_final)

88570


88570

In [74]:
# pad all vectors with zeroes to ensure uniform length

In [75]:
def pad_with_zeros(vector, num_zeros):
    zeros_to_pad = num_zeros - len(vector)
    # default value for "constant" mode is zero.
    # pad the array with 0 zeros on the left and "zeros_to_pad" zeros on the right
    return np.pad(vector, (0, zeros_to_pad), "constant")

In [76]:
combined_clean_final["vec_spelling_pad"] = combined_clean_final.vec_spelling.apply(pad_with_zeros, args=(23,))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [77]:
combined_clean_final["vec_pronunciation_pad"] = combined_clean_final.vec_pronunciation.apply(pad_with_zeros, args=(23,))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [78]:
combined_clean_final.head()

Unnamed: 0,entry_id,word_id,spelling,pronunciation,vec_spelling,vec_pronunciation,unvec_spelling,unvec_pronunciation,vec_spelling_pad,vec_pronunciation_pad
0,13943,역사학,역사학,역싸학,"[13, 27, 42, 11, 21, 20, 21, 42]","[13, 27, 42, 12, 21, 20, 21, 42]",역사학,역싸학,"[13, 27, 42, 11, 21, 20, 21, 42, 0, 0, 0, 0, 0...","[13, 27, 42, 12, 21, 20, 21, 42, 0, 0, 0, 0, 0..."
1,13943,역사학,역사학이,역싸하기,"[13, 27, 42, 11, 21, 20, 21, 42, 13, 41]","[13, 27, 42, 12, 21, 20, 21, 2, 41]",역사학이,역싸하기,"[13, 27, 42, 11, 21, 20, 21, 42, 13, 41, 0, 0,...","[13, 27, 42, 12, 21, 20, 21, 2, 41, 0, 0, 0, 0..."
2,13943,역사학,역사학도,역싸학또,"[13, 27, 42, 11, 21, 20, 21, 42, 5, 29]","[13, 27, 42, 12, 21, 20, 21, 42, 6, 29]",역사학도,역싸학또,"[13, 27, 42, 11, 21, 20, 21, 42, 5, 29, 0, 0, ...","[13, 27, 42, 12, 21, 20, 21, 42, 6, 29, 0, 0, ..."
3,13943,역사학,역사학만,역싸항만,"[13, 27, 42, 11, 21, 20, 21, 42, 8, 21, 45]","[13, 27, 42, 12, 21, 20, 21, 62, 8, 21, 45]",역사학만,역싸항만,"[13, 27, 42, 11, 21, 20, 21, 42, 8, 21, 45, 0,...","[13, 27, 42, 12, 21, 20, 21, 62, 8, 21, 45, 0,..."
5,13957,시대적2,시대적,시대적,"[11, 41, 5, 22, 14, 25, 42]","[11, 41, 5, 22, 14, 25, 42]",시대적,시대적,"[11, 41, 5, 22, 14, 25, 42, 0, 0, 0, 0, 0, 0, ...","[11, 41, 5, 22, 14, 25, 42, 0, 0, 0, 0, 0, 0, ..."


In [79]:
# convert the padded vectors to DataFrames

In [80]:
vec_spelling_all = pd.DataFrame(combined_clean_final.vec_spelling_pad.to_list())
print(len(vec_spelling_all))
vec_spelling_all.head()

88570


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,13,27,42,11,21,20,21,42,0,0,...,0,0,0,0,0,0,0,0,0,0
1,13,27,42,11,21,20,21,42,13,41,...,0,0,0,0,0,0,0,0,0,0
2,13,27,42,11,21,20,21,42,5,29,...,0,0,0,0,0,0,0,0,0,0
3,13,27,42,11,21,20,21,42,8,21,...,0,0,0,0,0,0,0,0,0,0
4,11,41,5,22,14,25,42,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
vec_pronunciation_all = pd.DataFrame(combined_clean_final.vec_pronunciation_pad.to_list())
print(len(vec_pronunciation_all))
vec_pronunciation_all.head()

88570


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,13,27,42,12,21,20,21,42,0,0,...,0,0,0,0,0,0,0,0,0,0
1,13,27,42,12,21,20,21,2,41,0,...,0,0,0,0,0,0,0,0,0,0
2,13,27,42,12,21,20,21,42,6,29,...,0,0,0,0,0,0,0,0,0,0
3,13,27,42,12,21,20,21,62,8,21,...,0,0,0,0,0,0,0,0,0,0
4,11,41,5,22,14,25,42,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
# write to file

In [83]:
vec_spelling_all.to_csv("vec_spelling_all.csv", index=False, sep="\t")

In [84]:
vec_pronunciation_all.to_csv("vec_pronunciation_all.csv", index=False, sep="\t")

In [85]:
# write to file data on entries used for training for use with validation

In [86]:
combined_clean_final[["entry_id", "word_id", "spelling", "pronunciation"]].to_csv("reference_all.csv", 
                                                                                  index=False, sep="\t")