# Creating the audio dataset


<https://huggingface.co/docs/datasets/audio_dataset#create-an-audio-dataset>

<https://huggingface.io/datasets/autumnjohnson/ceti_audio>

- Directory names: “CodaType”
- Filenames: Tag|CodaNum2018.flac
- Recording format: 3 channels, 96 kHz, int16


In [70]:
import IPython.display as ipd
from collections import Counter
import pandas as pd
from librosa.util import find_files
from datasets import Dataset, Value, Audio, Features, GeneratorBasedBuilder, BuilderConfig, Version, DatasetInfo, load_dataset
import pyarrow as pa 
import pathlib
from huggingface_hub import login


### Initialize variables


In [71]:
DATA_FOLDER = './data/codas.ch1/'
DATA_REPO = 'autumnjohnson/ceti_audio'
TOKEN = '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'

In [72]:
login(token = TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/autumn/.var/app/org.jupyter.JupyterLab/cache/huggingface/token
Login successful


### Get all the audio files


In [73]:
# Recursively finds and returns all.flac files in the given folder (and its subfolders)
def get_files(folder):
    files = find_files(directory=folder, ext='flac', recurse=True)
    dataset = []
    for path in files:
        start, end = path.split('data/')
        dataset.append("data/" + end)
    return dataset

In [74]:
files = get_files(DATA_FOLDER)

In [75]:
files_df = pd.DataFrame(files)
files_df

Unnamed: 0,0
0,data/codas.ch1/1+1+3/sw061b4933.flac
1,data/codas.ch1/1+1+3/sw061b4934.flac
2,data/codas.ch1/1+1+3/sw061b4935.flac
3,data/codas.ch1/1+1+3/sw061b4936.flac
4,data/codas.ch1/1+1+3/sw061b4937.flac
...,...
3524,data/codas.ch1/9i/sw106a8171.flac
3525,data/codas.ch1/9i/sw106a8173.flac
3526,data/codas.ch1/9i/sw106a8174.flac
3527,data/codas.ch1/9i/sw106a8270.flac


In [76]:
coda_types = []
all = files.copy()
for i in all:
    coda_type = i.split(DATA_FOLDER)[0]
    coda_type = coda_type.split("/")[2]
    coda_types.append(coda_type)
coda_types

['1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',


In [77]:
dataset_dict = {"audio": files, "coda_type": coda_types, "path": files.copy(), "sampling_rate": [16000] * len(files)}

In [78]:
Counter(dataset_dict["coda_type"])


Counter({'1+1+3': 2145,
         '1+31': 6,
         '10R': 11,
         '10i': 14,
         '2+3': 23,
         '3D': 38,
         '3R': 4,
         '4D': 2,
         '4R1': 16,
         '4R2': 267,
         '5R1': 511,
         '5R2': 155,
         '5R3': 74,
         '6R': 4,
         '6i': 44,
         '7D1': 13,
         '7D2': 33,
         '7i': 30,
         '8D': 18,
         '8R': 16,
         '8i': 54,
         '9R': 7,
         '9i': 44})

In [79]:
[[x,dataset_dict["coda_type"].count(x)] for x in set(dataset_dict["coda_type"])]

[['8i', 54],
 ['8R', 16],
 ['5R3', 74],
 ['1+31', 6],
 ['9i', 44],
 ['3D', 38],
 ['7i', 30],
 ['6i', 44],
 ['10R', 11],
 ['6R', 4],
 ['5R1', 511],
 ['3R', 4],
 ['2+3', 23],
 ['4D', 2],
 ['9R', 7],
 ['7D2', 33],
 ['8D', 18],
 ['1+1+3', 2145],
 ['10i', 14],
 ['5R2', 155],
 ['4R2', 267],
 ['4R1', 16],
 ['7D1', 13]]

In [80]:
features = Features({'audio': Audio(sampling_rate=16000), 'coda_type': Value('string'), 'path': Value('string'), 'sampling_rate': 
Value('int64')})

In [81]:
dataset = Dataset.from_dict(dataset_dict).cast_column("audio", Audio(sampling_rate=16000))
dataset.features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'coda_type': Value(dtype='string', id=None),
 'path': Value(dtype='string', id=None),
 'sampling_rate': Value(dtype='int64', id=None)}

In [82]:
dataset[0]

{'audio': {'path': 'data/codas.ch1/1+1+3/sw061b4933.flac',
  'array': array([-0.03293778, -0.06520429, -0.05711398, ..., -0.06182573,
         -0.0577666 , -0.06570289]),
  'sampling_rate': 16000},
 'coda_type': '1+1+3',
 'path': 'data/codas.ch1/1+1+3/sw061b4933.flac',
 'sampling_rate': 16000}

In [83]:
new_dataset

Dataset({
    features: ['train'],
    num_rows: 3529
})

In [84]:
new_dataset = pd.DataFrame(dataset)

In [85]:
new_dataset = Dataset.from_pandas(new_dataset)


In [86]:
new_dataset['coda_type'][:10]

['1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3',
 '1+1+3']

In [87]:
new_dataset = new_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [97]:
new_dataset = new_dataset.shuffle()

In [99]:
new_dataset['coda_type'][:10]

['5R1',
 '1+1+3',
 '1+1+3',
 '5R3',
 '1+1+3',
 '1+1+3',
 '5R1',
 '5R2',
 '1+1+3',
 '5R1']

In [100]:
new_dataset = new_dataset.train_test_split(test_size=0.1)

DatasetDict({
    train: Dataset({
        features: ['audio', 'coda_type', 'path', 'sampling_rate'],
        num_rows: 3176
    })
    test: Dataset({
        features: ['audio', 'coda_type', 'path', 'sampling_rate'],
        num_rows: 353
    })
})

### Push to hub


In [101]:
new_dataset.push_to_hub(DATA_REPO)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/3176 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/353 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

## Test


In [37]:
#dataset = load_dataset(DATA_REPO)

Downloading readme:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/163M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3529 [00:00<?, ? examples/s]

In [16]:
#dataset["train"][0]