In [10]:
import re
import pandas as pd
import pickle as pkl
import numpy as np
from collections import Counter 
# import seaborn as sns
# import matplotlib.pyplot as plt

In [11]:
cmudict = dict()
phone_freq = Counter()

with open('../data/cmudict/cmudict.dict') as f:
    lines = f.readlines()

for line in lines:
    pairs = line.strip('\n').split(' ', 1)
    # as mentioned in the exploration notebook, lines including '#' are usually words
    # of non-English origin or abbreviations and have been removed as they are not properly
    # spoken words. Similarly, remove pronounciation variations of the same word. these are denoted
    # by (2) or (3) for example. 
    if re.search(r'(\d)', pairs[0]) or '#' in pairs[1]:
        continue
    else:
        # re.sub() is to remove indications of primary, secondary, or non-stress in vowels for simplicity.
        pairs[1] = re.sub(r'\d', '', pairs[1])
        cmudict[pairs[0]] = pairs[1]
        phone_freq += Counter(pairs[1].split(' '))

In [12]:
cmudict

{"'bout": 'B AW T',
 "'cause": 'K AH Z',
 "'course": 'K AO R S',
 "'cuse": 'K Y UW Z',
 "'em": 'AH M',
 "'frisco": 'F R IH S K OW',
 "'gain": 'G EH N',
 "'kay": 'K EY',
 "'m": 'AH M',
 "'n": 'AH N',
 "'round": 'R AW N D',
 "'s": 'EH S',
 "'til": 'T IH L',
 "'tis": 'T IH Z',
 "'twas": 'T W AH Z',
 'a': 'AH',
 "a's": 'EY Z',
 'a.': 'EY',
 "a.'s": 'EY Z',
 'a.d.': 'EY D IY',
 'a.m.': 'EY EH M',
 'a.s': 'EY Z',
 'aaa': 'T R IH P AH L EY',
 'aaberg': 'AA B ER G',
 'aachen': 'AA K AH N',
 'aachener': 'AA K AH N ER',
 'aaker': 'AA K ER',
 'aaliyah': 'AA L IY AA',
 'aalseth': 'AA L S EH TH',
 'aamodt': 'AA M AH T',
 'aancor': 'AA N K AO R',
 'aardema': 'AA R D EH M AH',
 'aardvark': 'AA R D V AA R K',
 'aardvarks': 'AA R D V AA R K S',
 'aargh': 'AA R G',
 'aarhus': 'AA HH UW S',
 'aaron': 'EH R AH N',
 "aaron's": 'EH R AH N Z',
 'aarons': 'EH R AH N Z',
 'aaronson': 'EH R AH N S AH N',
 "aaronson's": 'EH R AH N S AH N Z',
 'aarti': 'AA R T IY',
 'aase': 'AA S',
 'aasen': 'AA S AH N',
 'ab': '

# Revisiting phoneme compatibility after minor pre-processing

In [13]:
from ipapy.arpabetmapper import ARPABETMapper
def parse_wikipron(filepath):
    amapper = ARPABETMapper()  
    df = pd.read_csv(filepath, encoding='utf-8', sep='\t', header=None)
    df.columns = ['word', 'ipa']
    df['ARPAbet'] = df['ipa'].map(lambda row: ' '.join(amapper.map_unicode_string(row, ignore=True, return_as_list=True)))
    phone_freq = Counter()
    for arpa in df['ARPAbet']:
        phone_freq += Counter(arpa.split(' '))
    return df, phone_freq

In [14]:
def jaccard_similarity(dict1, dict2):
    dict1_keys = set(dict1.keys())
    dict2_keys = set(dict2.keys())
    intersection = dict1_keys.intersection(dict2_keys)
    union = dict1_keys.union(dict2_keys)
    print('Jaccard similarity: {:.3f}'.format(len(intersection)/len(union)))
    print('Shared phonemes: {}'.format(intersection))
    print('All phonemes: {}'.format(union))

## Icelandic

In [5]:
ic_df, ic_phone_freq = parse_wikipron('../../Downloads/ice_latn_broad.tsv')

In [7]:
jaccard_similarity(phone_freq, ic_phone_freq)

Jaccard similarity: 0.610
Shared phonemes: {'R', 'T', 'AO', 'DH', 'NG', 'N', 'W', 'K', 'L', 'TH', 'S', 'Y', 'V', 'G', 'P', 'UH', 'D', 'HH', 'F', 'EH', 'AE', 'M', 'UW', 'IH', 'B'}
All phonemes: {'R', 'IY', 'AH', 'T', 'ZH', 'AO', 'DH', 'AY', 'N', 'NG', 'W', 'Z', 'EY', 'JH', 'AA', 'K', 'L', 'ER', 'DX', 'TH', 'CH', 'S', 'Y', 'V', 'G', 'P', 'UH', 'D', 'SH', 'OW', 'HH', 'F', 'AW', 'EH', 'AE', 'M', 'UW', 'IH', 'OH', 'B', 'OY'}


## Romanian

In [10]:
rom_df, rom_phone_freq = parse_wikipron('../../Downloads/rum_latn_narrow.tsv')

In [11]:
jaccard_similarity(phone_freq, rom_phone_freq)

Jaccard similarity: 0.605
Shared phonemes: {'R', 'ZH', 'T', 'NG', 'N', 'W', 'Z', 'JH', 'K', 'L', 'CH', 'S', 'Y', 'V', 'G', 'P', 'D', 'SH', 'HH', 'F', 'EH', 'AE', 'M', 'UW', 'IH', 'B'}
All phonemes: {'R', 'IY', 'AH', 'T', 'ZH', 'AO', 'DH', 'AY', 'N', 'NG', 'W', 'Z', 'EY', 'JH', 'AA', 'K', 'L', 'ER', 'DX', 'TH', 'CH', 'S', 'Y', 'V', 'G', 'P', 'UH', 'D', 'SH', 'OW', 'HH', 'F', 'AW', 'EH', 'AE', 'M', 'IX', 'UW', 'IH', 'OH', 'B', 'OY', 'AX'}


## Russian

In [16]:
rus_df, rus_phone_freq = parse_wikipron('../rus_cyrl_narrow.tsv')

In [17]:
jaccard_similarity(phone_freq, rus_phone_freq)

Jaccard similarity: 0.478
Shared phonemes: {'B', 'UH', 'F', 'L', 'W', 'M', 'UW', 'G', 'T', 'K', 'N', 'AA', 'Z', 'R', 'EH', 'AE', 'D', 'Y', 'V', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'EM', 'IX', 'AY', 'ER', 'JH', 'Q', 'G', 'OY', 'T', 'NG', 'DH', 'K', 'N', 'AA', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'AX', 'OH', 'D', 'Y', 'V', 'IH', 'P', 'S', 'UX'}


## Polish

In [18]:
pol_df, pol_phone_freq = parse_wikipron('../pol_latn_broad.tsv')

In [19]:
jaccard_similarity(phone_freq, pol_phone_freq)

Jaccard similarity: 0.558
Shared phonemes: {'B', 'HH', 'F', 'L', 'W', 'UW', 'M', 'G', 'T', 'NG', 'K', 'N', 'AA', 'Z', 'EH', 'R', 'AO', 'AE', 'D', 'Y', 'V', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'IX', 'AY', 'ER', 'JH', 'Q', 'G', 'OY', 'T', 'NG', 'DH', 'K', 'N', 'AA', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'D', 'Y', 'V', 'IH', 'P', 'S', 'UX'}


## Dutch

In [22]:
dut_df, dut_phone_freq = parse_wikipron('../dut_latn_broad.tsv')

In [23]:
jaccard_similarity(phone_freq, dut_phone_freq)

Jaccard similarity: 0.651
Shared phonemes: {'B', 'UH', 'F', 'ZH', 'L', 'W', 'M', 'UW', 'JH', 'G', 'T', 'NG', 'K', 'N', 'AA', 'Z', 'AH', 'R', 'EH', 'AO', 'AE', 'SH', 'D', 'Y', 'V', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'AY', 'ER', 'JH', 'Q', 'G', 'OY', 'T', 'NG', 'DH', 'K', 'N', 'AA', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'AX', 'OH', 'D', 'Y', 'V', 'IH', 'P', 'S'}


## Finnish

In [25]:
fin_df, fin_phone_freq = parse_wikipron('../fin_latn_broad.tsv')

In [26]:
jaccard_similarity(phone_freq, fin_phone_freq)

Jaccard similarity: 0.571
Shared phonemes: {'B', 'HH', 'F', 'ZH', 'L', 'CH', 'W', 'UW', 'M', 'G', 'T', 'NG', 'K', 'AA', 'N', 'Z', 'R', 'EH', 'SH', 'D', 'Y', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'AY', 'ER', 'JH', 'Q', 'G', 'OY', 'T', 'NG', 'DH', 'K', 'N', 'AA', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'OH', 'D', 'Y', 'V', 'IH', 'P', 'S'}


## Italian

In [27]:
ita_df, ita_phone_freq = parse_wikipron('../ita_latn_broad.tsv')

In [28]:
jaccard_similarity(phone_freq, ita_phone_freq)

Jaccard similarity: 0.651
Shared phonemes: {'B', 'HH', 'F', 'TH', 'ZH', 'L', 'CH', 'W', 'UW', 'M', 'JH', 'G', 'T', 'NG', 'K', 'N', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'D', 'Y', 'V', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'AY', 'ER', 'JH', 'G', 'OY', 'T', 'NG', 'DH', 'K', 'N', 'AA', 'DX', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'OH', 'AX', 'D', 'Y', 'V', 'IH', 'P', 'S'}


## Spanish

In [30]:
spa_df, spa_phone_freq = parse_wikipron('../spa_latn_la_broad.tsv')

In [31]:
jaccard_similarity(phone_freq, spa_phone_freq)

Jaccard similarity: 0.595
Shared phonemes: {'B', 'HH', 'F', 'TH', 'L', 'CH', 'W', 'M', 'UW', 'G', 'T', 'NG', 'DH', 'K', 'N', 'Z', 'R', 'EH', 'AE', 'SH', 'D', 'Y', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'AY', 'ER', 'JH', 'G', 'OY', 'T', 'NG', 'DH', 'K', 'N', 'AA', 'DX', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'OH', 'D', 'Y', 'V', 'IH', 'P', 'S'}


## Czech

In [32]:
cze_df, cze_phone_freq = parse_wikipron('../cze_latn_narrow.tsv')

In [33]:
jaccard_similarity(phone_freq, cze_phone_freq)

Jaccard similarity: 0.587
Shared phonemes: {'B', 'UH', 'F', 'ZH', 'L', 'CH', 'W', 'UW', 'M', 'JH', 'G', 'T', 'NG', 'K', 'N', 'Z', 'EH', 'R', 'AO', 'SH', 'AE', 'D', 'Y', 'V', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'EM', 'AY', 'ER', 'JH', 'Q', 'G', 'OY', 'T', 'NG', 'DH', 'K', 'N', 'AA', 'EL', 'DX', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'OH', 'D', 'Y', 'V', 'IH', 'EN', 'P', 'S'}


## French

In [36]:
fre_df, fre_latn_broad = parse_wikipron('../fre_latn_broad.tsv')

In [37]:
jaccard_similarity(phone_freq, fre_latn_broad)

Jaccard similarity: 0.667
Shared phonemes: {'B', 'UH', 'HH', 'F', 'ZH', 'L', 'CH', 'W', 'M', 'UW', 'JH', 'G', 'T', 'NG', 'K', 'AA', 'N', 'Z', 'AH', 'EH', 'R', 'AO', 'AE', 'SH', 'D', 'Y', 'V', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'AY', 'ER', 'JH', 'Q', 'G', 'OY', 'T', 'NG', 'DH', 'AXR', 'K', 'N', 'AA', 'DX', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'OH', 'AX', 'D', 'Y', 'V', 'IH', 'P', 'S'}


## Croatian

In [38]:
hbs_df, hbs_phone_freq = parse_wikipron('../hbs_latn_broad.tsv')

In [39]:
jaccard_similarity(phone_freq, hbs_phone_freq)

Jaccard similarity: 0.636
Shared phonemes: {'B', 'UH', 'HH', 'F', 'ZH', 'L', 'CH', 'UW', 'M', 'JH', 'G', 'T', 'NG', 'K', 'N', 'AA', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'D', 'Y', 'V', 'IH', 'P', 'S'}
All phonemes: {'', 'B', 'UH', 'OW', 'F', 'HH', 'TH', 'IY', 'ZH', 'L', 'CH', 'W', 'AW', 'EY', 'UW', 'M', 'AY', 'ER', 'JH', 'G', 'OY', 'T', 'NG', 'DH', 'K', 'N', 'AA', 'EL', 'AH', 'Z', 'EH', 'R', 'AO', 'AE', 'SH', 'OH', 'AX', 'D', 'Y', 'V', 'IH', 'EN', 'P', 'S'}


# Creating dataframes

In [12]:
cmudict_df = pd.DataFrame(cmudict.items(), columns = ['word', 'ARPAbet'])

In [13]:
cmudict_df.head()

Unnamed: 0,word,ARPAbet
0,'bout,B AW T
1,'cause,K AH Z
2,'course,K AO R S
3,'cuse,K Y UW Z
4,'em,AH M


In [14]:
cmudict_df.to_csv('../data/words_to_phones/cmudict_words2phones.csv', index = False)

In [15]:
ic_df.to_csv('../data/words_to_phones/icelandic_words2phones.csv', index = False)
rom_df.to_csv('../data/words_to_phones/romanian_words2phones.csv', index = False)

In [21]:
rus_df.to_csv('../data/words_to_phones/russian_words2phones.csv', index = False)
pol_df.to_csv('../data/words_to_phones/polish_words2phones.csv', index = False)

In [40]:
dut_df.to_csv('../data/words_to_phones/dutch_words2phones.csv', index = False)
fin_df.to_csv('../data/words_to_phones/finnish_words2phones.csv', index = False)
ita_df.to_csv('../data/words_to_phones/italian_words2phones.csv', index = False)
spa_df.to_csv('../data/words_to_phones/spanish_words2phones.csv', index = False)
cze_df.to_csv('../data/words_to_phones/czech_words2phones.csv', index = False)
fre_df.to_csv('../data/words_to_phones/french_words2phones.csv', index = False)
hbs_df.to_csv('../data/words_to_phones/croatian_words2phones.csv', index = False)