In [29]:
# modules
from tqdm import  tqdm
from glob import glob
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import basename as bn, join, split as sp

import librosa
import parselmouth
from parselmouth.praat import call
from scipy.io.wavfile import write

from sklearn.utils import shuffle
import praat_formants_python.praat_formants_python as pfp


# 1. Vowels of TIMIT Dataset

In [2]:
# Read paths
ROOT_TIMIT_DATA_PATH = "/home/jeevan/datasets/TIMIT Acoustic-Phonetic Continuous Speech Corpus (LDC93S1)/TIMIT"

# Write paths
ALL_EXP_FOLDER = "/home/jeevan/Jeevan_K/Projects/Asquire/Reverb-Quest/Formants/CSV2"

#####

### Initial data
1. TIMIT AUDIO FS
2. ALL TIMIT AUDIO PATHS
3. ALL TIMIT VOWELS TYPES

In [3]:
# Init data

TIMIT_AUDIO_FS = 16000

''' Get ALL TIMIT AUDIO PATHS from TIMIT dataset '''

DATA_LOCATION_PATHS = f"{ROOT_TIMIT_DATA_PATH}/T*/*/*/*.WAV"
ALL_TIMIT_AUDIO_PATHS = glob(DATA_LOCATION_PATHS)
ALL_TIMIT_AUDIO_PATHS.sort()

print(len(ALL_TIMIT_AUDIO_PATHS), "audio files exists.")

'''list ALL TIMIT VOWELS TYPES from TIMIT dataset '''

v_dump = """            iy         beet          bcl b IY tcl t
                        ih         bit           bcl b IH tcl t 
                        eh         bet           bcl b EH tcl t
                        ey         bait          bcl b EY tcl t
                        ae         bat           bcl b AE tcl t
                        aa         bott          bcl b AA tcl t
                        aw         bout          bcl b AW tcl t
                        ay         bite          bcl b AY tcl t
                        ah         but           bcl b AH tcl t
                        ao         bought        bcl b AO tcl t
                        oy         boy           bcl b OY
                        ow         boat          bcl b OW tcl t
                        uh         book          bcl b UH kcl k
                        uw         boot          bcl b UW tcl t
                        ux         toot          tcl t UX tcl t
                        er         bird          bcl b ER dcl d
                        ax         about         AX bcl b aw tcl t
                        ix         debit         dcl d eh bcl b IX tcl t
                        axr        butter        bcl b ah dx AXR
                        ax-h       suspect       s AX-H s pcl p eh kcl k tcl t"""
v_dump = v_dump.split("\n")
ALL_TIMIT_VOWELS_TYPES = [' '.join(x.split()).split(" ")[0] for x in v_dump]

print(len(ALL_TIMIT_VOWELS_TYPES), "types of vowels are present")
print(ALL_TIMIT_VOWELS_TYPES)

6300 audio files exists.
20 types of vowels are present
['iy', 'ih', 'eh', 'ey', 'ae', 'aa', 'aw', 'ay', 'ah', 'ao', 'oy', 'ow', 'uh', 'uw', 'ux', 'er', 'ax', 'ix', 'axr', 'ax-h']


### ALL TIMIT VOWELS 

Create array of row values

In [4]:
seperator = " "
columns = {
    "filepath": None,
    "person": None,
    "sex": None,
    "start_sample": None,
    "end_sample": None,
    "phone": None,
}

ALL_TIMIT_VOWELS_ARR: list[dict] = []

for audio_path in tqdm(ALL_TIMIT_AUDIO_PATHS[:]):
    _anotation_path: str = audio_path.replace(".WAV", ".PHN")
    _phns: list = []

    with open(_anotation_path, mode="r") as _ph_file:
        _phns = (
            _ph_file.readlines()
        )  # read all lines : <start-sample, end-sample, phone>; space seperated values

        _phns = [
            p.strip().split(seperator) for p in _phns
        ]  # split phones (last element)

    _rows = [
        {
            "audio_filepath": audio_path,
            "person": bn(sp(_anotation_path)[0]),
            "sex": bn(sp(_anotation_path)[0])[0],
            "start_sample": int(_ph[0]),
            "end_sample": int(_ph[1]),
            "start_second": int(_ph[0])/TIMIT_AUDIO_FS,
            "end_second": int(_ph[1])/TIMIT_AUDIO_FS,
            "duration_second": (int(_ph[1]) - int(_ph[0]))/TIMIT_AUDIO_FS,
            "vowel_type": _ph[-1],
        }

        for _ph in _phns
        if _ph[-1] in ALL_TIMIT_VOWELS_TYPES
    ]

    # Accumulate rows
    ALL_TIMIT_VOWELS_ARR += _rows 


100%|██████████| 6300/6300 [00:16<00:00, 376.22it/s]


Create and export pandas dataframe

In [5]:
ALL_TIMIT_VOWELS_EXP_FILENAME = "all-timit_vowels.csv"
ALL_TIMIT_VOWELS_EXP_FILEPATH = join(ALL_EXP_FOLDER, ALL_TIMIT_VOWELS_EXP_FILENAME)


ALL_TIMIT_VOWELS_DF = pd.DataFrame(ALL_TIMIT_VOWELS_ARR)
ALL_TIMIT_VOWELS_DF.index.name = "idx"

ALL_TIMIT_VOWELS_DF.to_csv(ALL_TIMIT_VOWELS_EXP_FILEPATH, index=True)

In [28]:
main_df = ALL_TIMIT_VOWELS_DF
all_vowels = pd.unique(main_df["vowel_type"])
all_vowels
LIMIT = 2
dfs = []
for vowel in all_vowels:
    sub_df = main_df[main_df["vowel_type"] == vowel]

    sub_df_male = shuffle(sub_df[sub_df["sex"] == "M"], random_state=0)
    sub_df_female = shuffle(sub_df[sub_df["sex"] == "F"], random_state=0)

    size_m = LIMIT if len(sub_df_male) > LIMIT else len(sub_df_male)
    size_f = LIMIT if len(sub_df_female) > LIMIT else len(sub_df_female)

    sub_df_male = sub_df_male[:size_m]
    sub_df_female = sub_df_female[:size_f]

    df = pd.concat([sub_df_male, sub_df_female])

    dfs.append(df)

TIMIT_VOWEL_SUBSET_DF = pd.concat(dfs)

TIMIT_VOWEL_SUBSET_DF = TIMIT_VOWEL_SUBSET_DF.reset_index()
# TIMIT_VOWEL_SUBSET_DF = TIMIT_VOWEL_SUBSET_DF.drop(columns=["index"])
TIMIT_VOWEL_SUBSET_DF.index.name = "slno"

# Export timit vowel fpe subset
TIMIT_VOWELS_SUBSET_EXP_FILENAME = f"timit-vowels_subset_{LIMIT}.csv"
TIMIT_VOWELS_SUBSET_EXP_FILEPATH = join(
    ALL_EXP_FOLDER, TIMIT_VOWELS_SUBSET_EXP_FILENAME
)

TIMIT_VOWEL_SUBSET_DF.to_csv(TIMIT_VOWELS_SUBSET_EXP_FILEPATH, index=True)


# TIMIT_VOWELS_SUBSET_FPE_EXP_FILENAME_JSON = (
#     "timit-vowels_formant_estimation_subset_8k.json"
# )
# TIMIT_VOWELS_SUBSET_FPE_EXP_FILEPATH_JSON = join(
#     ALL_EXP_FOLDER, TIMIT_VOWELS_SUBSET_FPE_EXP_FILENAME_JSON
# )


# json_columns = [
#     "idx",
#     "person",
#     "sex",
#     "vowel_type",
#     "pitch_org_praat",
#     "f1_mean_org_praat",
#     "f2_mean_org_praat",
#     "f3_mean_org_praat",
#     "f4_mean_org_praat",
# ]
# TIMIT_VOWEL_SUBSET_DF_JSON = TIMIT_VOWEL_SUBSET_DF[json_columns]
# TIMIT_VOWEL_SUBSET_DF_JSON.to_json(
#     TIMIT_VOWELS_SUBSET_FPE_EXP_FILEPATH_JSON, index=True, orient="table"
# )

TIMIT_VOWEL_SUBSET_DF


Unnamed: 0_level_0,idx,audio_filepath,person,sex,start_sample,end_sample,start_second,end_second,duration_second,vowel_type
slno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,23297,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MJEB1,M,35600,37320,2.225000,2.332500,0.107500,iy
1,13257,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MAHH0,M,4479,7240,0.279938,0.452500,0.172563,iy
2,9265,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FMCM0,F,19843,20553,1.240187,1.284563,0.044375,iy
3,35760,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FGCS0,F,27779,28574,1.736187,1.785875,0.049688,iy
4,22811,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MDAC0,M,41400,43800,2.587500,2.737500,0.150000,ae
...,...,...,...,...,...,...,...,...,...,...
75,21663,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FKFB0,F,43960,45080,2.747500,2.817500,0.070000,uh
76,77336,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MKRG0,M,8382,8727,0.523875,0.545438,0.021562,ax-h
77,48920,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MJMM0,M,24280,24640,1.517500,1.540000,0.022500,ax-h
78,399,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FJEM0,F,19512,19978,1.219500,1.248625,0.029125,ax-h


## Formant and Pitch Estimation: Praat

In [16]:
# TIMIT Vowels
ALL_TIMIT_VOWELS_DF_FPE = pd.read_csv(TIMIT_VOWELS_SUBSET_EXP_FILEPATH); ALL_TIMIT_VOWELS_DF_FPE

Unnamed: 0,slno,idx,audio_filepath,person,sex,start_sample,end_sample,start_second,end_second,duration_second,vowel_type
0,0,23297,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MJEB1,M,35600,37320,2.225000,2.332500,0.107500,iy
1,1,13257,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MAHH0,M,4479,7240,0.279938,0.452500,0.172563,iy
2,2,74968,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MTMN0,M,3310,4181,0.206875,0.261313,0.054437,iy
3,3,47517,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MFRM0,M,15320,17754,0.957500,1.109625,0.152125,iy
4,4,40922,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MJLG1,M,9645,10485,0.602812,0.655312,0.052500,iy
...,...,...,...,...,...,...,...,...,...,...,...
7882,7882,76056,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FJRB0,F,18255,18670,1.140937,1.166875,0.025937,ax-h
7883,7883,53946,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FGDP0,F,28920,29240,1.807500,1.827500,0.020000,ax-h
7884,7884,46054,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FSAK0,F,46056,46720,2.878500,2.920000,0.041500,ax-h
7885,7885,27941,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FSCN0,F,34400,34935,2.150000,2.183438,0.033438,ax-h


In [34]:
def measure_pitch(audio_path):
    f0min, f0max = [75, 500]
    
    sound = parselmouth.Sound(audio_path) # read the sound
    pitch = call(sound, "To Pitch", 0.0, f0min, f0max) #create a praat pitch object
    mean_pitch = call(pitch, "Get mean", 0, 0, "Hertz") # get mean pitch
    return mean_pitch

In [25]:
def measureFormants(audio_path, start_sec, end_sec, vowel_type):
    f0min, f0max = [75, 500]
    sound = parselmouth.Sound(audio_path) # read the sound
    pitch = call(sound, "To Pitch (cc)", 0, f0min, 15, 'no', 0.03, 0.45, 0.01, 0.35, 0.14, f0max)
    mean_pitch = call(pitch, "Get mean", 0, 0, "Hertz") # get mean pitch
    
    audio_chunk, fs = librosa.load(audio_path, sr=None, offset=start_sec, duration=(end_sec - start_sec))
    tmp_audio_file = f"/home/jeevan/Jeevan_K/Projects/Asquire/Reverb-Quest/Formants/AUDIO/tmp_timit_phones/{vowel_type}.wav"
    write(tmp_audio_file, fs, audio_chunk)
    sound_frm = parselmouth.Sound(tmp_audio_file)
    # sound_frm = sound_frm.extract_part(rom_time=start_sec, to_time=end_sec, window_shape=0, relative_width=1, preserve_times=False) # read the sound chunk
    pointProcess = call(sound_frm, "To PointProcess (periodic, cc)", f0min, f0max)
    formants = call(sound_frm, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
    numPoints = call(pointProcess, "Get number of points")

    f1_list = []
    f2_list = []
    f3_list = []
    f4_list = []
    
    # Measure formants only at glottal pulses
    for point in range(0, numPoints):
        point += 1
        t  = call(pointProcess, "Get time from index", point)
        f1 = call(formants, "Get value at time", 1, t, 'Hertz', 'Linear')
        f2 = call(formants, "Get value at time", 2, t, 'Hertz', 'Linear')
        f3 = call(formants, "Get value at time", 3, t, 'Hertz', 'Linear')
        f4 = call(formants, "Get value at time", 4, t, 'Hertz', 'Linear')
        f1_list.append(f1)
        f2_list.append(f2)
        f3_list.append(f3)
        f4_list.append(f4)
    
    f1_list = [f1 for f1 in f1_list if str(f1) != 'nan']
    f2_list = [f2 for f2 in f2_list if str(f2) != 'nan']
    f3_list = [f3 for f3 in f3_list if str(f3) != 'nan']
    f4_list = [f4 for f4 in f4_list if str(f4) != 'nan']
    
    # calculate mean formants across pulses
    f1_mean = np.mean(f1_list)
    f2_mean = np.mean(f2_list)
    f3_mean = np.mean(f3_list)
    f4_mean = np.mean(f4_list)
    
    # calculate median formants across pulses, this is what is used in all subsequent calcualtions
    # you can use mean if you want, just edit the code in the boxes below to replace median with mean
    f1_median = np.median(f1_list)
    f2_median = np.median(f2_list)
    f3_median = np.median(f3_list)
    f4_median = np.median(f4_list)
    
    return mean_pitch, f1_mean, f2_mean, f3_mean, f4_mean, f1_median, f2_median, f3_median, f4_median

In [50]:
def measureFormants2(audio_path, start_sec, end_sec):
    formants = pfp.formants_at_interval(
        audio_path, start_sec, end_sec, maxformant=5500, winlen=0.025, preemph=50
    )

    formants_mean = formants.mean(axis=0)
    formants_mean = list(formants_mean)[1:]  # skip time

    formants_median = np.median(formants, axis=0)
    formants_median = list(formants_median)[1:]  # skip time

    pitch_mean = measure_pitch(audio_path)

    return (
        pitch_mean,
        formants_mean[0],
        formants_mean[1],
        formants_mean[2],
        formants_median[0],
        formants_median[1],
        formants_median[2],
    )


In [51]:
measureFormants2("/home/jeevan/datasets/TIMIT Acoustic-Phonetic Continuous Speech Corpus (LDC93S1)/TIMIT/TRAIN/DR4/MPEB0/SX240.WAV", 0.6725, 0.7275)

(132.4246513288537,
 346.9794,
 2111.1302,
 2750.2146000000002,
 339.309,
 2111.74,
 2737.514)

In [52]:
def vowel_formant_estimation(df: pd.DataFrame):

    split_id = df.iloc[0, :]["idx"]
    ALL_TIMIT_VOWELS_FPE_EXP_FILENAME = (
        f"all_timit_vowels_formant_estimation_ll-{split_id}.csv"
    )

    _ALL_TIMIT_VOWELS_FPE_ARR: list[dict] = []
    for _, _vowel in tqdm(df.iterrows()):

        # Praat: pitch and formant estimation
        (
            pitch,
            f1_mean,
            f2_mean,
            f3_mean,
            f1_median,
            f2_median,
            f3_median,
        ) = measureFormants2(
            _vowel["audio_filepath"],
            _vowel["start_sec"],
            _vowel["end_sec"],
            f'{_vowel["vowel_type"]}-{_vowel["idx"]}',
        )

        _row = [
            {
                "idx": _vowel["idx"],
                "person": _vowel["person"],
                "vowel_type": _vowel["vowel_type"],
                "pitch_org_praat": pitch,
                "f1_mean_org_praat": f1_mean,
                "f2_mean_org_praat": f2_mean,
                "f3_mean_org_praat": f3_mean,
                "f4_mean_org_praat": f4_mean,
                "f1_median_org_praat": f1_median,
                "f2_median_org_praat": f2_median,
                "f3_median_org_praat": f3_median,
                "f4_median_org_praat": f4_median,
            }
        ]

        _ALL_TIMIT_VOWELS_FPE_ARR += _row

        break

    ALL_TIMIT_VOWELS_FPE_EXP_FILEPATH = join(
        ALL_EXP_FOLDER, ALL_TIMIT_VOWELS_FPE_EXP_FILENAME
    )

    ALL_TIMIT_VOWELS_FPE_DF = pd.DataFrame(_ALL_TIMIT_VOWELS_FPE_ARR)

    # ALL_TIMIT_VOWELS_FPE_DF.to_csv(ALL_TIMIT_VOWELS_FPE_EXP_FILEPATH, index=False)

    return _ALL_TIMIT_VOWELS_FPE_ARR


# make this section parallel


Combine Parallel Generated Files

In [54]:
# Combine parallel generated files: vowel formants

ALL_VOWEL_FPE_LL_EXP_FOLDER = (
    "/home/jeevan/Jeevan_K/Projects/Asquire/Reverb-Quest/Formants/CSV2/tmp_min_ll"
)

all_csv_files = [
    join(ALL_VOWEL_FPE_LL_EXP_FOLDER, filename)
    for filename in listdir(ALL_VOWEL_FPE_LL_EXP_FOLDER)
]
all_csv_files

all_vowel_fpe_dfs_ll = [pd.read_csv(csv_path) for csv_path in all_csv_files]
all_vowel_fpe_dfs_ll

ALL_TIMIT_VOWELS_FPE_DF = pd.concat(all_vowel_fpe_dfs_ll)
ALL_TIMIT_VOWELS_FPE_DF = ALL_TIMIT_VOWELS_FPE_DF.rename(
    {
        "pitch_praat": "pitch_org_praat",
        "f1_mean_praat": "f1_mean_org_praat",
        "f2_mean_praat": "f2_mean_org_praat",
        "f3_mean_praat": "f3_mean_org_praat",
        "f1_median_praat": "f1_median_org_praat",
        "f2_median_praat": "f2_median_org_praat",
        "f3_median_praat": "f3_median_org_praat",
    }, axis=1
)
ALL_TIMIT_VOWELS_FPE_DF

Unnamed: 0,idx,person,vowel_type,pitch_org_praat,f1_mean_org_praat,f2_mean_org_praat,f3_mean_org_praat,f1_median_org_praat,f2_median_org_praat,f3_median_org_praat
0,19550,FCMH1,ay,208.260808,813.117818,1820.268364,2900.199636,854.1010,1838.5700,2967.329
1,55061,FPMY0,ay,188.090050,835.446000,1574.517000,2473.371625,841.9565,1533.3245,2460.367
2,29181,MDLB0,ax,129.971107,490.113167,1405.893167,2702.338500,488.5360,1431.9645,2756.669
3,59253,MPMB0,ax,163.372185,619.503667,1331.860000,2456.828500,603.0830,1328.4010,2421.293
4,61887,FBCH0,ax,211.857979,538.638200,1282.232200,3060.811000,542.3600,1291.5480,3061.724
...,...,...,...,...,...,...,...,...,...,...
5,72581,MKDB0,ah,128.330939,621.098182,1469.444545,2724.076000,623.8820,1446.1360,2715.015
6,66865,FJSK0,ah,195.772859,558.883444,1656.679000,2752.324000,581.7680,1642.2010,2791.100
7,66917,FJSK0,ah,218.448693,657.999125,1807.719750,2988.945250,670.5455,1795.2560,3058.976
8,72484,MKAG0,ey,122.697817,499.924308,1859.491154,2577.673231,496.0450,1854.6990,2515.064


Create and export pandas dataframe: Formant estimation - (Praat, )

In [152]:
# Use formants extract ll to run in parallel
ALL_TIMIT_VOWELS_FPE_ARR = vowel_formant_estimation(ALL_TIMIT_VOWELS_DF_FPE) 
ALL_TIMIT_VOWELS_FPE_DF = pd.DataFrame(ALL_TIMIT_VOWELS_FPE_ARR); ALL_TIMIT_VOWELS_FPE_DF

0it [00:00, ?it/s]


TypeError: extract_part(): incompatible function arguments. The following argument types are supported:
    1. (self: parselmouth.Sound, from_time: Optional[float] = None, to_time: Optional[float] = None, window_shape: parselmouth.WindowShape = <WindowShape.RECTANGULAR: 0>, relative_width: Positive[float] = 1.0, preserve_times: bool = False) -> parselmouth.Sound

Invoked with: <parselmouth.Sound object at 0x7f8f78c95eb0>, '/home/jeevan/datasets/TIMIT Acoustic-Phonetic Continuous Speech Corpus (LDC93S1)/TIMIT/TEST/DR1/FAKS0/SA1.WAV'; kwargs: rom_time=0.7025, to_time=0.7989375, window_shape=0, relative_width=1, preserve_times=False

In [130]:
# Use formants extract ll to run in parallel
# ALL_TIMIT_VOWELS_FPE_ARR = vowel_formant_estimation(ALL_TIMIT_VOWELS_DF_FPE) 
# ALL_TIMIT_VOWELS_FPE_DF = pd.DataFrame(ALL_TIMIT_VOWELS_FPE_ARR)

ALL_TIMIT_VOWELS_FPE_EXP_FILENAME = "all-timit_vowels_formant-estimation.csv"
ALL_TIMIT_VOWELS_FPE_EXP_FILEPATH = join(ALL_EXP_FOLDER, ALL_TIMIT_VOWELS_FPE_EXP_FILENAME)

# ALL_TIMIT_VOWELS_FPE_EXP_FILENAME_JSON = "all_timit_vowels_formant-estimation.json"
# ALL_TIMIT_VOWELS_FPE_EXP_FILEPATH_JSON = join(ALL_EXP_FOLDER, ALL_TIMIT_VOWELS_FPE_EXP_FILENAME_JSON)



ALL_TIMIT_VOWELS_FPE_DF.to_csv(ALL_TIMIT_VOWELS_FPE_EXP_FILEPATH, index=False)
# ALL_TIMIT_VOWELS_FPE_DF.to_json(ALL_TIMIT_VOWELS_FPE_EXP_FILEPATH_JSON, index=False, orient="table")

Merge dataframes

In [131]:
ALL_TIMIT_VOWELS_DF_ = pd.read_csv(ALL_TIMIT_VOWELS_EXP_FILEPATH); ALL_TIMIT_VOWELS_DF_
ALL_TIMIT_VOWELS_FPE_DF = pd.read_csv(ALL_TIMIT_VOWELS_FPE_EXP_FILEPATH); ALL_TIMIT_VOWELS_FPE_DF

ALL_TIMIT_VOWELS_FPE_MRG_DF = pd.merge(ALL_TIMIT_VOWELS_DF_, ALL_TIMIT_VOWELS_FPE_DF, how="inner", on=['idx', 'person', 'vowel_type']); ALL_TIMIT_VOWELS_FPE_MRG_DF


ALL_TIMIT_VOWELS_FPE_MRG_EXP_FILENAME = "all-timit_vowels_formant_estimation_merge.csv"
ALL_TIMIT_VOWELS_FPE_MRG_EXP_FILEPATH = join(ALL_EXP_FOLDER, ALL_TIMIT_VOWELS_FPE_MRG_EXP_FILENAME)

ALL_TIMIT_VOWELS_FPE_MRG_EXP_FILENAME_JSON = "all-timit_vowels_formant_estimation_merge.json"
ALL_TIMIT_VOWELS_FPE_MRG_EXP_FILEPATH_JSON = join(ALL_EXP_FOLDER, ALL_TIMIT_VOWELS_FPE_MRG_EXP_FILENAME_JSON)

ALL_TIMIT_VOWELS_FPE_MRG_DF.to_csv(ALL_TIMIT_VOWELS_FPE_MRG_EXP_FILEPATH, index=False)
# ALL_TIMIT_VOWELS_FPE_MRG_DF.to_json(ALL_TIMIT_VOWELS_FPE_MRG_EXP_FILEPATH_JSON, index=False, orient="table")

In [132]:
ALL_TIMIT_VOWELS_FPE_MRG_DF

Unnamed: 0,idx,audio_filepath,person,sex,start_sample,end_sample,start_sec,end_sec,duration_sec,vowel_type,pitch_org_praat,f1_mean_org_praat,f2_mean_org_praat,f3_mean_org_praat,f4_mean_org_praat,f1_median_org_praat,f2_median_org_praat,f3_median_org_praat,f4_median_org_praat
0,0,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FAKS0,F,11240,12783,0.702500,0.798937,0.096437,iy,241.221155,601.939945,1683.950885,2587.135519,3430.181358,550.583804,1684.561468,2632.051857,3411.683362
1,1,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FAKS0,F,14078,16157,0.879875,1.009813,0.129938,ae,241.221155,601.939945,1683.950885,2587.135519,3430.181358,550.583804,1684.561468,2632.051857,3411.683362
2,2,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FAKS0,F,17587,18760,1.099187,1.172500,0.073313,er,241.221155,601.939945,1683.950885,2587.135519,3430.181358,550.583804,1684.561468,2632.051857,3411.683362
3,3,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FAKS0,F,19962,21514,1.247625,1.344625,0.097000,aa,241.221155,601.939945,1683.950885,2587.135519,3430.181358,550.583804,1684.561468,2632.051857,3411.683362
4,4,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FAKS0,F,26280,28591,1.642500,1.786938,0.144437,uw,241.221155,601.939945,1683.950885,2587.135519,3430.181358,550.583804,1684.561468,2632.051857,3411.683362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78369,78369,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MTCS0,M,22751,23586,1.421938,1.474125,0.052187,ao,129.177040,573.020865,1623.424441,2387.667442,3453.730475,487.980723,1649.648554,2344.035140,3429.384694
78370,78370,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MTCS0,M,24520,25386,1.532500,1.586625,0.054125,ih,129.177040,573.020865,1623.424441,2387.667442,3453.730475,487.980723,1649.648554,2344.035140,3429.384694
78371,78371,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MTCS0,M,26843,28490,1.677687,1.780625,0.102938,ay,129.177040,573.020865,1623.424441,2387.667442,3453.730475,487.980723,1649.648554,2344.035140,3429.384694
78372,78372,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MTCS0,M,30310,31707,1.894375,1.981688,0.087313,ih,129.177040,573.020865,1623.424441,2387.667442,3453.730475,487.980723,1649.648554,2344.035140,3429.384694


Create subset of vowels for the experiment

In [136]:
main_df = ALL_TIMIT_VOWELS_FPE_MRG_DF
all_vowels = pd.unique(main_df["vowel_type"])
all_vowels
LIMIT = 200
dfs = []
for vowel in all_vowels:
    sub_df = main_df[main_df["vowel_type"] == vowel]

    sub_df_male = shuffle(sub_df[sub_df["sex"] == "M"])
    sub_df_female = shuffle(sub_df[sub_df["sex"] == "F"])

    size_m = LIMIT if len(sub_df_male) > LIMIT else len(sub_df_male)
    size_f = LIMIT if len(sub_df_female) > LIMIT else len(sub_df_female)

    sub_df_male = sub_df_male[:size_m]
    sub_df_female = sub_df_female[:size_f]

    df = pd.concat([sub_df_male, sub_df_female])

    dfs.append(df)

TIMIT_VOWEL_SUBSET_DF = pd.concat(dfs)
TIMIT_VOWEL_SUBSET_DF
TIMIT_VOWEL_SUBSET_DF = TIMIT_VOWEL_SUBSET_DF.reset_index()
TIMIT_VOWEL_SUBSET_DF = TIMIT_VOWEL_SUBSET_DF.drop(columns=["index"])
TIMIT_VOWEL_SUBSET_DF.index.name = "slno"

# Export timit vowel fpe subset
TIMIT_VOWELS_SUBSET_FPE_EXP_FILENAME = "timit-vowels_formant_estimation_subset_8k.csv"
TIMIT_VOWELS_SUBSET_FPE_EXP_FILEPATH = join(
    ALL_EXP_FOLDER, TIMIT_VOWELS_SUBSET_FPE_EXP_FILENAME
)

TIMIT_VOWELS_SUBSET_FPE_EXP_FILENAME_JSON = (
    "timit-vowels_formant_estimation_subset_8k.json"
)
TIMIT_VOWELS_SUBSET_FPE_EXP_FILEPATH_JSON = join(
    ALL_EXP_FOLDER, TIMIT_VOWELS_SUBSET_FPE_EXP_FILENAME_JSON
)

TIMIT_VOWEL_SUBSET_DF.to_csv(TIMIT_VOWELS_SUBSET_FPE_EXP_FILEPATH, index=True)

json_columns = [
    "idx",
    "person",
    "sex",
    "vowel_type",
    "pitch_org_praat",
    "f1_mean_org_praat",
    "f2_mean_org_praat",
    "f3_mean_org_praat",
    "f4_mean_org_praat",
]
TIMIT_VOWEL_SUBSET_DF_JSON = TIMIT_VOWEL_SUBSET_DF[json_columns]
TIMIT_VOWEL_SUBSET_DF_JSON.to_json(
    TIMIT_VOWELS_SUBSET_FPE_EXP_FILEPATH_JSON, index=True, orient="table"
)
TIMIT_VOWEL_SUBSET_DF


Unnamed: 0_level_0,idx,audio_filepath,person,sex,start_sample,end_sample,start_sec,end_sec,duration_sec,vowel_type,pitch_org_praat,f1_mean_org_praat,f2_mean_org_praat,f3_mean_org_praat,f4_mean_org_praat,f1_median_org_praat,f2_median_org_praat,f3_median_org_praat,f4_median_org_praat
slno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,63466,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MAJP0,M,20929,21700,1.308062,1.356250,0.048188,iy,115.975891,521.795449,1539.272068,2379.567771,3504.988911,521.351466,1589.922305,2448.463207,3470.430594
1,77016,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MEJS0,M,27480,28680,1.717500,1.792500,0.075000,iy,105.750725,540.313948,1531.663936,2384.025650,3462.560333,546.201585,1578.373426,2421.078731,3429.233848
2,34449,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MTBC0,M,9107,9960,0.569187,0.622500,0.053312,iy,120.633474,489.921579,1543.918976,2368.823186,3256.372115,482.823214,1533.343421,2449.876241,3242.355380
3,70407,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MDLR0,M,23227,25315,1.451688,1.582188,0.130500,iy,111.682914,562.677539,1320.319182,2501.931164,3460.706569,548.101083,1310.749785,2476.401228,3442.462452
4,75285,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,MTWH1,M,10943,12490,0.683937,0.780625,0.096687,iy,133.941708,556.371642,1622.177894,2203.357364,3479.152705,555.391943,1703.817444,2389.401927,3506.851933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7882,21144,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FDAW0,F,17787,18120,1.111687,1.132500,0.020813,ax-h,196.447362,579.739987,1733.742313,2517.004199,3443.241187,591.403411,1792.007927,2566.729804,3460.316977
7883,22569,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FVMH0,F,37180,37974,2.323750,2.373375,0.049625,ax-h,195.616210,651.641845,1715.511163,2643.680886,3783.917134,651.095457,1761.753903,2712.900205,3980.331394
7884,209,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FDAC1,F,51765,52309,3.235313,3.269312,0.034000,ax-h,169.087936,594.395233,1525.209616,2747.216501,3780.761725,560.331228,1558.781563,2731.276477,3802.235385
7885,9553,/home/jeevan/datasets/TIMIT Acoustic-Phonetic ...,FREW0,F,13643,13994,0.852688,0.874625,0.021937,ax-h,205.846217,562.668073,1614.329340,2454.408462,3770.241443,519.508982,1624.744318,2492.133440,3892.818254


# Generate Synthetic Vowels with Klaat