# Benchmark different data formats for audio

### Download files from the children song dataset

In [2]:
from dagshub.streaming import DagsHubFilesystem
fs = DagsHubFilesystem(".", repo_url="https://dagshub.com/kinkusuma/children-song-dataset")


In [3]:
for num in range(1, 51):
    for letter in ["a", "b"]:
        f = fs.open(f"CSD/english/wav/en{num:03}{letter}.wav", 'r')
        f.close()

### Load the wavfile as arrays

In [5]:
import soundfile as sf

sound_arrays = []

for num in range(1, 5):
    for letter in ["a", "b"]:
        # print(f"Loading en{num:03}{letter}")
        data, sample_rate = sf.read(f"CSD/english/wav/en{num:03}{letter}.wav")
        sound_arrays.append({
            "audio": data,
            "sample_rate": sample_rate,
            "filename": f"en{num:03}{letter}"
        })

In [6]:
import pandas as pd

df = pd.DataFrame(sound_arrays)
df.head()

Unnamed: 0,audio,sample_rate,filename
0,"[[-3.0517578125e-05, -6.103515625e-05], [0.0, ...",44100,en001a
1,"[[-3.0517578125e-05, 3.0517578125e-05], [6.103...",44100,en001b
2,"[[-3.0517578125e-05, 0.0], [-3.0517578125e-05,...",44100,en002a
3,"[[-3.0517578125e-05, 0.0], [0.0, 0.0], [3.0517...",44100,en002b
4,"[[-3.0517578125e-05, 0.0], [0.0, 0.0], [3.0517...",44100,en003a


In [3]:
len(df.iloc[0]["audio"])

2857680

In [4]:
import numpy as np

# Split the 2d array into 2 columns of 1d array to make it easier to save to different file formats
def split_2d_array(arr):
    np_arr = np.array(arr)
    return pd.Series([np_arr[:, 0], np_arr[:, 1]])

# Apply the function to the 'array_column'
df_split = df['audio'].apply(split_2d_array)

# Rename the columns and join with the original DataFrame (excluding 'array_column')
df_split.columns = ['audio_dim0', 'audio_dim1']
df_final = pd.concat([df.drop('audio', axis=1), df_split], axis=1)
df_final.head()

Unnamed: 0,sample_rate,filename,audio_dim0,audio_dim1
0,44100,en001a,"[-3.0517578125e-05, 0.0, 6.103515625e-05, -3.0...","[-6.103515625e-05, -3.0517578125e-05, -3.05175..."
1,44100,en001b,"[-3.0517578125e-05, 6.103515625e-05, 3.0517578...","[3.0517578125e-05, 0.0, 0.0, 6.103515625e-05, ..."
2,44100,en002a,"[-3.0517578125e-05, -3.0517578125e-05, 3.05175...","[0.0, 3.0517578125e-05, 0.0, 6.103515625e-05, ..."
3,44100,en002b,"[-3.0517578125e-05, 0.0, 3.0517578125e-05, -3....","[0.0, 0.0, 0.0, 3.0517578125e-05, 6.103515625e..."
4,44100,en003a,"[-3.0517578125e-05, 0.0, 3.0517578125e-05, -3....","[0.0, 0.0, 0.0, 3.0517578125e-05, 0.0, 0.0, 3...."


In [5]:
df = df_final

In [6]:
import time, os

# Dictionary to hold benchmark results
results = {}

def benchmark_save_load(format_name, save_func, load_func, file_ext):
    file_name = f'benchmark_test.{file_ext}'

    # Measure save time
    start_time = time.time()
    save_func(df, file_name)
    save_time = time.time() - start_time

    # Measure load time
    start_time = time.time()
    loaded_df = load_func(file_name)
    load_time = time.time() - start_time

    # Clean up the file after testing
    os.remove(file_name)

    # Store results
    results[format_name] = {'save_time': save_time, 'load_time': load_time}
                            
# CSV
benchmark_save_load(
    'CSV',
    lambda df, filename: df.to_csv(filename, index=False),
    lambda filename: pd.read_csv(filename),
    'csv'
)

# HDF5
benchmark_save_load(
    'HDF5',
    lambda df, filename: df.to_hdf(filename, key='df', mode='w'),
    lambda filename: pd.read_hdf(filename, 'df'),
    'h5'
)

# Feather
benchmark_save_load(
    'Feather',
    lambda df, filename: df.to_feather(filename),
    lambda filename: pd.read_feather(filename),
    'feather'
)

# Parquet
benchmark_save_load(
    'Parquet',
    lambda df, filename: df.to_parquet(filename, engine='pyarrow'),
    lambda filename: pd.read_parquet(filename),
    'parquet'
)

# Display the results
for fmt, times in results.items():
    print(f"{fmt}: Save time = {times['save_time']:.4f} s, Load time = {times['load_time']:.4f} s")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['filename', 'audio_dim0', 'audio_dim1'], dtype='object')]

  lambda df, filename: df.to_hdf(filename, key='df', mode='w'),


CSV: Save time = 0.0071 s, Load time = 0.0013 s
HDF5: Save time = 1.1481 s, Load time = 1.0334 s
Feather: Save time = 1.0951 s, Load time = 0.1601 s
Parquet: Save time = 1.6985 s, Load time = 0.3170 s
