# Creating the audio dataset


## Create dataset

<https://huggingface.co/docs/datasets/audio_dataset#create-an-audio-dataset>

<https://huggingface.co/datasets/autumnjohnson/ceti_audio>

- Directory names: “CodaType”
- Filenames: Tag|CodaNum2018.flac
- Recording format: 3 channels, 96 kHz, int16


In [13]:
import IPython.display as ipd
from collections import Counter
import io
import pandas as pd
from librosa.util import find_files
from datasets import Dataset, Value, Audio, Features, GeneratorBasedBuilder, BuilderConfig, Version, DatasetInfo, load_dataset
import pyarrow as pa 
import pathlib
from huggingface_hub import login
import soundfile as sf


In [14]:
def get_flac_bytes(files):
    """Apply ``numpy_to_bytes`` to every item of ``files`` and collect the results.

    Args:
        files: iterable of items accepted by ``numpy_to_bytes``.

    Returns:
        A list holding one ``numpy_to_bytes`` result per item, in input order.

    NOTE(review): ``numpy_to_bytes`` is not defined in any visible cell, so any
    non-empty input raises NameError on a fresh kernel — confirm where it lives.
    """
    return [numpy_to_bytes(item) for item in files]

In [16]:
# Authenticate this session against the Hugging Face Hub.
# NOTE(review): TOKEN is not defined in any visible cell (execution count 15 is
# missing) — presumably it was set in a removed cell; confirm it is loaded from an
# environment variable or a prompt rather than hardcoded in the notebook.
login(token = TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/autumn/.var/app/org.jupyter.JupyterLab/cache/huggingface/token
Login successful


### Get all the audio files


In [43]:
# Recursively finds and returns all .flac files in the given folder (and its subfolders)
def get_files(folder):
    """Return the FLAC files under ``folder`` as paths that start at ``data/``.

    Args:
        folder: directory handed to ``find_files`` and searched recursively.

    Returns:
        A list of strings, one per file found, each rebuilt as ``"data/"`` plus
        whatever followed ``'data/'`` in the path returned by ``find_files``.

    Raises:
        ValueError: if a path does not contain ``'data/'`` exactly once, because
            the split result is unpacked into exactly two names.
    """
    relative_paths = []
    for flac_path in find_files(directory=folder, ext='flac', recurse=True):
        # Discard everything before the 'data/' marker and keep the remainder.
        _prefix, tail = flac_path.split('data/')
        relative_paths.append(f"data/{tail}")
    return relative_paths

In [44]:
# Collect the 'data/'-relative path of every FLAC file under DATA_FOLDER.
# NOTE(review): DATA_FOLDER is not defined in any visible cell — it should live in a
# config cell near the top so the notebook survives Restart & Run All.
files = get_files(DATA_FOLDER)

In [45]:
# Wrap the path list in a single-column DataFrame purely for a readable preview.
files_df = pd.DataFrame(files)
files_df

Unnamed: 0,0
0,data/codas.ch1/1+1+3/sw061b4933.flac
1,data/codas.ch1/1+1+3/sw061b4934.flac
2,data/codas.ch1/1+1+3/sw061b4935.flac
3,data/codas.ch1/1+1+3/sw061b4936.flac
4,data/codas.ch1/1+1+3/sw061b4937.flac
...,...
3507,data/codas.ch1/9i/sw106a8171.flac
3508,data/codas.ch1/9i/sw106a8173.flac
3509,data/codas.ch1/9i/sw106a8174.flac
3510,data/codas.ch1/9i/sw106a8270.flac


In [46]:
# Derive one coda-type label per file, in the same order as `files`.
# Paths look like data/<subfolder>/<coda type>/<file>.flac, so the label is the
# third "/"-separated component.
# The previous version copied `files` into a variable named `all` (shadowing the
# builtin) and first split each path on DATA_FOLDER; that split only worked because
# DATA_FOLDER never occurs inside these relative paths, and would raise if it did
# (or if DATA_FOLDER were empty). Neither step is needed.
coda_types = [path.split("/")[2] for path in files]
coda_types

['1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',


In [47]:
# Column-oriented mapping for the dataset: one list per column, one entry per file.
# "path" gets its own copy of the list so it does not alias the "audio" column.
dataset_dict = {
    "audio": files,
    "coda_type": coda_types,
    "path": list(files),
    "sampling_rate": [16000 for _ in files],
}

In [48]:
# Number of examples per coda type, as [label, count] pairs.
# Counter tallies the column in a single pass (the earlier list.count-per-label
# form rescanned the whole list for every label) and yields labels in
# first-appearance order, so the displayed order no longer depends on set hashing.
[[coda_type, count] for coda_type, count in Counter(dataset_dict["coda_type"]).items()]

[['2+3', 23],
 ['7D1', 13],
 ['10i', 14],
 ['5R2', 155],
 ['3R', 4],
 ['9i', 44],
 ['6i', 44],
 ['1+1+3', 2139],
 ['10R', 8],
 ['6R', 4],
 ['7i', 30],
 ['3D', 38],
 ['8i', 54],
 ['8D', 18],
 ['5R3', 74],
 ['5R1', 511],
 ['4R2', 267],
 ['4R1', 16],
 ['9R', 7],
 ['7D2', 33],
 ['8R', 16]]

In [49]:
# Schema describing the four dataset columns.
# NOTE(review): `features` is not passed to any call in the visible cells — the
# audio column is cast separately below; confirm whether this is still needed.
features = Features(
    {
        'audio': Audio(sampling_rate=16000),
        'coda_type': Value('string'),
        'path': Value('string'),
        'sampling_rate': Value('int64'),
    }
)

In [50]:
# Build the Dataset from the column lists; at this point "audio" holds plain path strings.
dataset = Dataset.from_dict(dataset_dict)

In [51]:
# Reinterpret the "audio" path column as an Audio feature at 16 kHz, then show the
# resulting schema to confirm the cast took effect.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset.features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'coda_type': Value(dtype='string', id=None),
 'path': Value(dtype='string', id=None),
 'sampling_rate': Value(dtype='int64', id=None)}

In [52]:
# Spot-check the first example: audio should come back decoded alongside its label and path.
dataset[0]

{'audio': {'path': 'data/codas.ch1/1+1+3/sw061b4933.flac',
  'array': array([-0.03293778, -0.06520429, -0.05711398, ..., -0.06182573,
         -0.0577666 , -0.06570289]),
  'sampling_rate': 16000},
 'coda_type': '1+1+3',
 'path': 'data/codas.ch1/1+1+3/sw061b4933.flac',
 'sampling_rate': 16000}

In [53]:
# Materialise the Dataset as a pandas DataFrame.
# NOTE(review): the audio column is decode=True, so this presumably decodes every
# clip into memory — confirm the pandas round-trip is intentional.
new_dataset = pd.DataFrame(dataset)

In [54]:
# Convert the DataFrame back into a Dataset (the name is rebound from DataFrame to Dataset).
new_dataset = Dataset.from_pandas(new_dataset)


In [55]:
# Sanity check: the first ten labels after the pandas round-trip.
new_dataset['coda_type'][:10]

['1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3']

In [56]:
# Re-apply the 16 kHz Audio feature to the "audio" column of the rebuilt Dataset.
new_dataset = new_dataset.cast_column("audio", Audio(sampling_rate=16000, decode=True))

In [58]:
# Shuffle and split 70/30 into train/test.
# A fixed seed makes the shuffle — and therefore the split that gets published —
# reproducible across kernel restarts; without it every run yields a different split.
# Note: this rebinds new_dataset from a Dataset to the split result, so the cell
# is not meant to be run twice without re-running the cells above.
new_dataset = new_dataset.train_test_split(test_size=0.3, shuffle=True, seed=42)

In [None]:
# Upload the train/test splits to the Hub repository named by DATA_REPO.
# NOTE(review): DATA_REPO is not defined in any visible cell — define it in a config cell.
new_dataset.push_to_hub(DATA_REPO)