# Creating the audio dataset


## Create dataset

<https://huggingface.co/docs/datasets/audio_dataset#create-an-audio-dataset>

<https://huggingface.co/datasets/autumnjohnson/ceti_audio>

- Directory names: “CodaType”
- Filenames: Tag|CodaNum2018.flac
- Recording format: 3 channels, 96 kHz, int16


In [None]:
import io
import os
import pathlib
from collections import Counter

import IPython.display as ipd
import pandas as pd
import pyarrow as pa
import soundfile as sf
from datasets import Dataset, Value, Audio, Features, GeneratorBasedBuilder, BuilderConfig, Version, DatasetInfo, load_dataset
from huggingface_hub import login
from librosa.util import find_files


In [None]:
# Folder that is scanned (recursively) for .flac recordings.
DATA_FOLDER = './data/codas.ch1/'
# Hub repository id the finished dataset is pushed to.
DATA_REPO = 'autumnjohnson/ceti_audio'
# The access token is read from the HF_TOKEN environment variable instead of
# being stored in the notebook, so it cannot leak through version control.
# TOKEN is None when the variable is unset; per the huggingface_hub docs,
# login(token=None) then asks for the token interactively.
# SECURITY: a real token used to be hard-coded on this line. It must be
# treated as compromised and revoked in the Hugging Face account settings.
TOKEN = os.environ.get('HF_TOKEN')

In [None]:
def get_flac_bytes(files):
    """Convert every entry of *files* with ``numpy_to_bytes``.

    Args:
        files: Iterable of items accepted by ``numpy_to_bytes``.
            NOTE(review): ``numpy_to_bytes`` is not defined in the part of
            the notebook visible here, so the expected item type is unknown.

    Returns:
        A list holding one ``numpy_to_bytes`` result per entry, in input order.
    """
    return [numpy_to_bytes(entry) for entry in files]

In [None]:
# Authenticate this session with the Hugging Face Hub using TOKEN.
# Presumably required so the final push_to_hub call is allowed to upload.
login(token = TOKEN)

### Get all the audio files


In [None]:
def get_files(folder):
    """Return every ``.flac`` file under *folder* as a ``data/``-prefixed path.

    The folder and its subfolders are searched with
    ``librosa.util.find_files``. Each path found is cut at its last ``data/``
    occurrence, and everything before it is dropped.

    Args:
        folder: Directory to search recursively.

    Returns:
        A list of strings of the form ``'data/<rest of path>'``, in the order
        ``find_files`` produced them. Empty when no file matches.

    Raises:
        ValueError: If a discovered path does not contain ``'data/'``.
    """
    marker = 'data/'
    dataset = []
    for path in find_files(directory=folder, ext='flac', recurse=True):
        # find_files yields absolute paths (per the librosa docs), so a parent
        # directory may itself contain 'data/' (e.g. /mnt/data/... or
        # .../metadata/...). Cutting at the LAST occurrence keeps only the
        # project-relative tail instead of failing on a two-way unpack.
        _, found, tail = path.rpartition(marker)
        if not found:
            raise ValueError(f"expected 'data/' in audio file path: {path!r}")
        dataset.append(marker + tail)
    return dataset

In [None]:
# Collect the .flac paths found under DATA_FOLDER.
files = get_files(folder=DATA_FOLDER)

In [None]:
# Put the file list into a single-column DataFrame and display it,
# which makes the collected paths easy to eyeball in the notebook.
files_df = pd.DataFrame(data=files)
files_df

In [None]:
# Label every file with its coda type, i.e. the name of the directory that
# directly contains it (the dataset layout uses the coda type as directory
# name). Taking the parent directory avoids depending on how DATA_FOLDER is
# spelled or on how many path segments precede the coda-type directory.
# PurePosixPath is used because the paths in `files` are '/'-separated.
coda_types = [pathlib.PurePosixPath(path).parent.name for path in files]
coda_types

In [None]:
# Column-oriented table with one entry per audio file in every column.
# "audio" shares the `files` list itself, "path" gets its own copy of it,
# and "sampling_rate" repeats 16000 once per file.
dataset_dict = {
    "audio": files,
    "coda_type": coda_types,
    "path": list(files),
    "sampling_rate": [16000 for _ in files],
}

In [None]:
# Number of files per coda type, shown as [coda_type, count] pairs.
# Counter tallies all labels in a single pass instead of rescanning the whole
# list once per distinct label. Pairs appear in first-seen order (the
# previous set-based version had no defined order).
[[label, total] for label, total in Counter(dataset_dict["coda_type"]).items()]

In [None]:
# Schema with one entry per column of dataset_dict.
# NOTE(review): none of the cells visible below pass `features` to anything;
# confirm whether it is still needed.
features = Features(
    {
        'audio': Audio(sampling_rate=16000),
        'coda_type': Value('string'),
        'path': Value('string'),
        'sampling_rate': Value('int64'),
    }
)

In [None]:
# Build a Dataset from the column dict. No explicit schema is passed here;
# the "audio" column is given its Audio type by the cast in the next cell.
dataset = Dataset.from_dict(dataset_dict)

In [None]:
# Re-type the "audio" column as an Audio feature with sampling_rate=16000,
# then display the resulting schema. Per the datasets docs, audio is
# resampled to this rate when it is decoded.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset.features

In [None]:
# Display the first example as a sanity check.
dataset[0]

In [None]:
# Copy the dataset into a pandas DataFrame.
# NOTE(review): iterating the dataset this way presumably decodes every audio
# file into memory - confirm that this round trip through pandas is intended.
new_dataset = pd.DataFrame(dataset)

In [None]:
# Turn the DataFrame back into a Dataset (rebinding new_dataset).
new_dataset = Dataset.from_pandas(new_dataset)


In [None]:
# Show the first ten entries of the coda_type column.
new_dataset['coda_type'][:10]

In [None]:
# Cast the "audio" column to an Audio feature again (16 kHz, decoding
# enabled), this time on the dataset rebuilt from the DataFrame.
new_dataset = new_dataset.cast_column("audio", Audio(sampling_rate=16000, decode=True))

In [None]:
# Shuffle and split the data, holding out 30% as the test set.
# NOTE(review): no seed is passed, so the split is presumably different on
# every run - pass seed=... if the published split must be reproducible.
new_dataset = new_dataset.train_test_split(test_size=0.3, shuffle=True)

In [None]:
# Upload the split dataset to the DATA_REPO repository on the Hugging Face Hub.
new_dataset.push_to_hub(DATA_REPO)