In [None]:
#Import required packages
import pandas as pd
import numpy as np
import pathlib
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
import warnings
import glob
from scipy import signal
import re
import json
import logging
import collections
import sqlite3

In [None]:
#Read audio classification file and clean
# The file is tab-separated despite its .csv extension, so parse it with an
# explicit tab delimiter instead of loading one combined column and splitting
# it by hand (which breaks as soon as any field contains a comma).
# dtype=str / keep_default_na=False keep every field as the literal text found
# in the file, matching what a plain string split would produce.
audio_class_df = pd.read_csv("audioclassification_meta.csv", sep="\t",
                             dtype=str, keep_default_na=False)

# Normalise headers such as "VoxCeleb1 ID" to "VoxCeleb1_ID"
c_names = [name.replace(" ", "_") for name in audio_class_df.columns]
audio_class_df.columns = c_names

#Set as dictionary
# Maps each VoxCeleb1_ID to the list of its remaining column values
audio_class_dict = audio_class_df.set_index("VoxCeleb1_ID").T.to_dict('list')

#View data
audio_class_df.head()

Unnamed: 0,VoxCeleb1_ID,VGGFace1_ID,Gender,Nationality,Set
0,id10001,A.J._Buckley,m,Ireland,dev
1,id10002,A.R._Rahman,m,India,dev
2,id10003,Aamir_Khan,m,India,dev
3,id10004,Aaron_Tveit,m,USA,dev
4,id10005,Aaron_Yoo,m,USA,dev


In [None]:
# Load the PHOIBLE phoneme inventory table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-type DtypeWarning and
# gives columns a consistent dtype.
phoible_df = pd.read_csv("phoible.csv", low_memory=False)
phoible_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,InventoryID,Glottocode,ISO6393,LanguageName,SpecificDialect,GlyphID,Phoneme,Allophones,Marginal,SegmentClass,...,retractedTongueRoot,advancedTongueRoot,periodicGlottalSource,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,raisedLarynxEjective,loweredLarynxImplosive,click
0,1,kore1280,kor,Korean,,68,h,ç h ɦ,,consonant,...,0,0,-,-,+,-,-,-,-,-
1,1,kore1280,kor,Korean,,006A,j,j,,consonant,...,0,0,+,-,-,-,-,-,-,-
2,1,kore1280,kor,Korean,,006B,k,k̚ ɡ k,,consonant,...,0,0,-,-,-,-,-,-,-,-
3,1,kore1280,kor,Korean,,006B+02B0,kʰ,kʰ,,consonant,...,0,0,-,-,+,-,-,-,-,-
4,1,kore1280,kor,Korean,,006B+02C0,kˀ,kˀ,,consonant,...,0,0,-,-,-,+,-,-,-,-


In [None]:
# Generate a mapping from nationality to language spoken
# Keys are nationality labels as they appear in the 'Nationality' column of the
# audio classification metadata (e.g. 'Ireland', not 'Irish').
# Values are used to filter phoible_df on 'LanguageName', so they must match
# the PHOIBLE spelling exactly.
# NOTE(review): confirm each value (e.g. 'Mandarin Chinese', 'Farsi') exists in
# phoible.csv; a value with no match silently contributes no phonemes.
nationalities_to_language = {'Ireland': 'English',
                             'India': 'Hindi',
                             'USA': 'English (American)',
                             'Australia': 'English (Australian)',
                             'Canada': 'English',
                             'UK': 'English (British)',
                             'Norway': 'Norwegian',
                             'Italy': 'Italian',
                             'Sudan': 'Arabic',
                             'Mexico': 'Spanish',
                             'China': 'Standard Chinese; Mandarin',
                             'Switzerland': 'Swiss German',
                             'Guyana': 'English',
                             'Philippines': 'Filipino',
                             'New Zealand': 'English (New Zealand)',
                             'Germany': 'German',
                             'Portugal': 'Portuguese (European)',
                             'Netherlands': 'Dutch',
                             'Pakistan': 'Urdu',
                             'Croatia': 'Croatian',
                             'South Korea': 'Korean',
                             'Sweden': 'Swedish',
                             'Russia': 'Russian',
                             'Poland': 'Polish',
                             'Sri Lanka': 'Sinhalese',
                             'Singapore': 'Mandarin Chinese',
                             'Chile': 'Spanish',
                             'Spain': 'Spanish',
                             'Israel': 'Modern Hebrew',
                             'Brazil': 'Portuguese (Brazilian)',
                             'Trinidad and Tobago': 'English',
                             'Denmark': 'Danish',
                             'Austria': 'German',
                             'South Africa': 'English',
                             'Iran': 'Farsi'}

In [None]:
# Keep only the PHOIBLE rows whose language is one of the mapped languages
mapped_languages = list(nationalities_to_language.values())
language_is_mapped = phoible_df['LanguageName'].isin(mapped_languages)
phoible_df = phoible_df[language_is_mapped]

In [None]:
# Distinct language names left in the filtered PHOIBLE table,
# i.e. the languages spoken within VoxCeleb that PHOIBLE covers
all_languages = phoible_df['LanguageName'].unique().tolist()

In [None]:
# Distinct phoneme symbols across all retained languages
all_phonemes = phoible_df['Phoneme'].unique().tolist()

In [None]:
# Define a mapping from language to phoneme
# Key is language and value is a set of phonemes within that language
# (built in one pass per language; no placeholder value is assigned first)
phonemes_per_lang = {
    language: set(
        phoible_df.loc[phoible_df['LanguageName'] == language, 'Phoneme'].unique()
    )
    for language in all_languages
}

In [None]:
# Create a set of all the phonemes in English languages
eng_langs = ['English', 'English (American)', 'English (Australian)',
             'English (British)', 'English (New Zealand)']
english_phonemes = set()
for lang in eng_langs:
    english_phonemes.update(phonemes_per_lang[lang])

# Rows belonging to the English varieties; computed once because it does not
# depend on the phoneme being processed.
english_rows = phoible_df[phoible_df['LanguageName'].isin(eng_langs)]

# Define a mapping from English phonemes to allophones that may be present in other languages
# Iterate in a fixed order (shortest symbols first, then lexicographic): set
# iteration order for strings changes between kernel runs, and lookups that
# scan this dict and return the first match (get_key_english_phonemes_to_allophones)
# would otherwise give different answers from run to run.
english_phonemes_to_allophones = {}
for phoneme in sorted(english_phonemes, key=lambda symbol: (len(symbol), symbol)):
    # set(str) splits the phoneme into its individual characters, so a
    # multi-character phoneme contributes each of its characters.
    # NOTE(review): this looks deliberate, since downstream matching is done
    # character by character -- confirm.
    symbols = set(phoneme)
    listed_allophones = english_rows.loc[english_rows['Phoneme'] == phoneme, 'Allophones']
    for allophones in listed_allophones:
        # Only purely alphanumeric entries are used; entries containing spaces
        # (several allophones listed together) are skipped by isalnum().
        if pd.notnull(allophones) and allophones.isalnum():
            symbols.update(allophones)  # adds each character of the entry
    english_phonemes_to_allophones[phoneme] = symbols

In [None]:
# Define path to where wav files are located
# Set the VOXCELEB_WAV_DIR environment variable to point at another machine's
# copy of the data; the original location is kept as the default.
# os.path.join(..., '') guarantees a trailing separator, which callers that
# concatenate an id onto this path rely on.
wav_path = os.path.join(
    os.environ.get('VOXCELEB_WAV_DIR',
                   '/Users/ariellestern/Desktop/cis519_project/aus_wav/'),
    '')

In [None]:
def pull_id_wavs(wav_path, id):
    '''
    Function: Find paths to wav files for one id within specified directory
    Inputs: 
        - wav_path: string of directory where wav files are located
        - id: name of the sub-directory of wav_path to search (recursively)
    Outputs:
        - wav_ls: list of paths to individual wav files, in a reproducible order
                  (empty if the directory does not exist)
    '''
    wav_ls = list()
    # os.path.join works whether or not wav_path ends with a path separator
    id_dir = os.path.join(wav_path, id)
    for path, subdirs, files in os.walk(id_dir):
        # Sorting in place makes os.walk visit sub-directories in a fixed order
        subdirs.sort()
        for name in sorted(files):
            # Skip non-audio clutter (e.g. .DS_Store) so callers only get wav files
            if name.lower().endswith('.wav'):
                wav_ls.append(str(pathlib.PurePath(path, name)))
    return wav_ls


In [None]:
# Collect the VoxCeleb1 ids of all Australian individuals
# NOTE: Because of the time it takes to generate these files, we generated phoneme npz files in batches by nationality
is_australian = audio_class_df['Nationality'].isin(['Australia'])
aus_ids = audio_class_df.loc[is_australian, 'VoxCeleb1_ID'].tolist()


In [None]:
# Remove corrupt files, if any
# Filtering (instead of list.remove) does not raise when an id is absent from
# this batch and gives the same result if the cell is run more than once.
corrupt_ids = {'id10155', 'id10347'}  # ids with a corrupt file
aus_ids = [speaker_id for speaker_id in aus_ids if speaker_id not in corrupt_ids]

In [None]:
# Map every id in this batch to the list of wav file paths found for it
all_wav_dict = {}
for speaker_id in aus_ids:
    all_wav_dict[speaker_id] = pull_id_wavs(wav_path, speaker_id)

In [None]:
def get_key_english_phonemes_to_allophones(val):
    '''
    Function: Look up the English phoneme that a symbol belongs to
    Inputs: 
        - val: a symbol (phoneme or allophone) to look up
    Outputs:
        - key: the first key of english_phonemes_to_allophones whose set of
               symbols contains val; None when no set contains it
    '''
    matching_keys = (
        phoneme
        for phoneme, symbols in english_phonemes_to_allophones.items()
        if val in symbols
    )
    return next(matching_keys, None)

In [None]:
import speech_recognition as sr
import eng_to_ipa as p 


def audio_to_phonemes(audio_dict):
    '''
    Function: Convert audio files to sets of phonemes
    Inputs: 
        - audio_dict: a dictionary from id to a list of filepaths for wav files associated with that id 
    Outputs:
        - phonemes_dict: a dictionary from id to a list of sets of phonemes, one set per
                         recording for that id, in the same order as the filepaths.
                         A recording that cannot be read or transcribed yields an empty set.
    '''
    num_ids = len(audio_dict)
    phonemes_dict = {}
    r = sr.Recognizer()
    for i, (key, wav_files) in enumerate(audio_dict.items(), start=1):
        print("Working on id "+str(i)+" out of "+ str(num_ids))
        print(key)
        phonemes_list = list()
        for wav_file in wav_files:
            # Opening the file happens inside the try as well, so one unreadable
            # file yields an empty transcript instead of aborting the whole run.
            try:
                with sr.AudioFile(wav_file) as source:
                    audio = r.record(source)
                    text = r.recognize_google(audio)
            except sr.UnknownValueError:
                # The recognizer could not make out any speech: treat as empty.
                text = ""
            except Exception as exc:
                # Best effort: keep going, but leave a trace of what failed.
                # Catching Exception (not a bare except) lets Ctrl-C / kernel
                # interrupt stop this long-running loop.
                logging.warning("Could not transcribe %s: %s", wav_file, exc)
                text = ""
            phonemes = p.convert(text)
            phonemes = phonemes.replace(" ", "")
            phonemes_in_sample = set()
            # Matching is done one character at a time against the English
            # phoneme/allophone table.
            for char in phonemes:
                key_phoneme = get_key_english_phonemes_to_allophones(char)
                # NOTE(review): nothing visible here produces the key 'unseen';
                # this looks like a leftover debugging hook -- confirm before removing.
                if key_phoneme == 'unseen':
                    print('hit unseen')
                if key_phoneme is not None:
                    phonemes_in_sample.add(key_phoneme)
            phonemes_list.append(phonemes_in_sample)
        phonemes_dict[key] = phonemes_list
        print('num files for id ')
        print(len(phonemes_dict[key]))
    return phonemes_dict

In [None]:
# Transcribe every recording of every id in this batch and collect the phoneme
# set of each recording (slow: one recognize_google call per wav file)
phonemes_dict = audio_to_phonemes(all_wav_dict)

Working on id 1 out of 5
id11094
num files for id 
51
Working on id 2 out of 5
id11173
num files for id 
265
Working on id 3 out of 5
id11179
num files for id 
65
Working on id 4 out of 5
id11192
num files for id 
86
Working on id 5 out of 5
id11240
num files for id 
72


In [None]:
# Save phonemes per id to .npz format
# One archive per id, written as "<id>.npz" in the current working directory;
# the positional arrays are stored in recording order (arr_0, arr_1, ...).
# NOTE(review): each entry is a Python set, so NumPy presumably stores it as a
# pickled object array and readers will need np.load(..., allow_pickle=True) -- confirm.
# The loop variable is not called `id`, so the builtin id() stays usable in the kernel.
for speaker_id, phoneme_sets in phonemes_dict.items():
    np.savez(speaker_id + '.npz', *phoneme_sets)