# Creating the audio dataset


<https://huggingface.co/docs/datasets/audio_dataset#create-an-audio-dataset>

<https://huggingface.io/datasets/autumnjohnson/ceti_audio>

- Directory names: “CodaType”
- Filenames: Tag|CodaNum2018.flac
- Recording format: 3 channels, 96 kHz, int16


In [17]:
import IPython.display as ipd
import pandas as pd
from librosa.util import find_files
from datasets import Dataset, Value, Audio, Features, GeneratorBasedBuilder, BuilderConfig, Version, DatasetInfo, load_dataset
import pyarrow as pa 
import pathlib

### Initialize variables


In [18]:
DATA_FOLDER = './data/codas.ch1/'
DATA_REPO = 'autumnjohnson/ceti_audio'

### Get all the audio files


In [19]:
# Recursively finds and returns all.flac files in the given folder (and its subfolders)
def get_files(folder):
    files = find_files(directory=folder, ext='flac', recurse=True)
    dataset = []
    for path in files:
        start, end = path.split('data/')
        dataset.append("data/" + end)
    return dataset

In [20]:
files = get_files(DATA_FOLDER)

In [21]:
files_df = pd.DataFrame(files)
files_df

Unnamed: 0,0
0,data/codas.ch1/1+1+3/sw061b4933.flac
1,data/codas.ch1/1+1+3/sw061b4934.flac
2,data/codas.ch1/1+1+3/sw061b4935.flac
3,data/codas.ch1/1+1+3/sw061b4936.flac
4,data/codas.ch1/1+1+3/sw061b4937.flac
...,...
3524,data/codas.ch1/9i/sw106a8171.flac
3525,data/codas.ch1/9i/sw106a8173.flac
3526,data/codas.ch1/9i/sw106a8174.flac
3527,data/codas.ch1/9i/sw106a8270.flac


In [22]:
dataset_dict = {"audio": files, "coda_type": files.copy(), "path": files.copy(), "sampling_rate": [16000] * len(files)}

In [23]:
dataset_dict

{'audio': ['data/codas.ch1/1+1+3/sw061b4933.flac',
  'data/codas.ch1/1+1+3/sw061b4934.flac',
  'data/codas.ch1/1+1+3/sw061b4935.flac',
  'data/codas.ch1/1+1+3/sw061b4936.flac',
  'data/codas.ch1/1+1+3/sw061b4937.flac',
  'data/codas.ch1/1+1+3/sw061b4938.flac',
  'data/codas.ch1/1+1+3/sw061b4939.flac',
  'data/codas.ch1/1+1+3/sw061b4940.flac',
  'data/codas.ch1/1+1+3/sw061b4941.flac',
  'data/codas.ch1/1+1+3/sw061b4942.flac',
  'data/codas.ch1/1+1+3/sw061b4943.flac',
  'data/codas.ch1/1+1+3/sw061b4944.flac',
  'data/codas.ch1/1+1+3/sw061b4945.flac',
  'data/codas.ch1/1+1+3/sw061b4946.flac',
  'data/codas.ch1/1+1+3/sw061b4947.flac',
  'data/codas.ch1/1+1+3/sw061b4948.flac',
  'data/codas.ch1/1+1+3/sw061b4949.flac',
  'data/codas.ch1/1+1+3/sw061b4958.flac',
  'data/codas.ch1/1+1+3/sw061b4959.flac',
  'data/codas.ch1/1+1+3/sw061b5019.flac',
  'data/codas.ch1/1+1+3/sw061b5020.flac',
  'data/codas.ch1/1+1+3/sw061b5021.flac',
  'data/codas.ch1/1+1+3/sw061b5022.flac',
  'data/codas.ch1/1+1+3/s

In [24]:
features = Features({'audio': Audio(sampling_rate=16000), 'coda_type': Value('string'), 'path': Value('string'), 'sampling_rate': 
Value('int64')})

In [31]:
dataset = Dataset.from_dict(dataset_dict).cast_column("audio", Audio(sampling_rate=16000))
dataset.features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'coda_type': Value(dtype='string', id=None),
 'path': Value(dtype='string', id=None),
 'sampling_rate': Value(dtype='int64', id=None)}

In [32]:
dataset[0]

{'audio': {'path': 'data/codas.ch1/1+1+3/sw061b4933.flac',
  'array': array([-0.03293778, -0.06520429, -0.05711398, ..., -0.06182573,
         -0.0577666 , -0.06570289]),
  'sampling_rate': 16000},
 'coda_type': 'data/codas.ch1/1+1+3/sw061b4933.flac',
 'path': 'data/codas.ch1/1+1+3/sw061b4933.flac',
 'sampling_rate': 16000}

In [33]:
new_dataset = pd.DataFrame(dataset)

In [34]:
new_dataset = Dataset.from_pandas(new_dataset)

In [35]:
new_dataset = new_dataset.cast_column("audio", Audio(sampling_rate=16000))

### Push to hub


In [36]:
new_dataset.push_to_hub(DATA_REPO)

Map:   0%|          | 0/3529 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/36 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/3.31k [00:00<?, ?B/s]

## Test


In [15]:
#dataset = load_dataset(DATA_REPO)

In [16]:
#dataset["train"][0]