In [None]:
import os 

# Root folder of the raw audio; its immediate sub-directories are the classes.
audio_files = "./../data/raw"

# os.walk silently yields nothing for a missing/unreadable directory, which
# would otherwise surface here as a bare StopIteration.
root_entry = next(os.walk(audio_files, topdown=True), None)
if root_entry is None:
    raise FileNotFoundError(f"Raw audio directory not found or unreadable: {audio_files!r}")

# Entry layout is (dirpath, dirnames, filenames); only the directory names of
# the root are needed.
labels_raw = root_entry[1]
labels_raw

In [None]:
import numpy as np

# Split every "<index>_<name>" directory name into its numeric index and its
# label name, then order both by index.
label_names = []
labels_idx = []

# A dedicated loop name keeps this cell from overwriting the global `label`
# that create_samples reads later on.
for raw_label in labels_raw:
    idx, sep, name = raw_label.partition("_")
    try:
        if not sep:
            raise ValueError("no '_' separator")
        parsed_idx = int(idx)
    except ValueError:
        # Folders that do not follow the pattern (hidden tool directories such
        # as ".ipynb_checkpoints", for example) are not classes; report and
        # skip them instead of aborting the cell with an opaque ValueError.
        print(f"Ignoring directory that is not named '<index>_<name>': {raw_label!r}")
        continue
    label_names.append(name)
    labels_idx.append(parsed_idx)

# One shared, stable ordering keeps names and indices aligned.
order = np.argsort(np.array(labels_idx, dtype=int), kind="stable")
label_names = np.array(label_names, dtype=str)[order]
labels_idx = np.array(labels_idx, dtype=int)[order]

In [None]:
# Show the label names, which are ordered by their numeric directory prefix.
label_names

In [None]:
import pandas as pd

# Class-index table: one row per label, indexed by its numeric prefix. Both
# "mid" and "display_name" carry the same label name.
df = pd.DataFrame({
    "mid": label_names,
    "display_name": label_names
}, index=labels_idx)

# to_csv does not create missing parent folders, and nothing earlier in the
# notebook creates this one, so make sure it exists first.
os.makedirs("./../data/processed", exist_ok=True)

# NOTE(review): the index is written under an empty column header. If the
# consumer looks that column up by name (AudioSet-style label files call it
# "index"), pass index_label="index" here — confirm against the reader.
df.to_csv("./../data/processed/class_labels_indices.csv")

In [None]:
from os import makedirs
import re
import librosa
import soundfile as sf
from scipy.signal import resample_poly
import json

# Build the train / val / test data.
# Splitting happens per song so clips of one recording never leak across splits.

# Input formats accepted by the segmentation step.
supported_exts = ('.wav', '.flac', '.ogg', '.aiff', '.au', '.mp3', '.m4a', '.wma')

# Manifest entries, one per written clip.
data = list()

# Fresh walker over the raw-audio tree. Its first entry (the root directory
# itself) is pulled and discarded so later iteration over `dir` starts with
# the sub-directories.
dir = os.walk(audio_files, topdown=True)
next(dir)

def create_samples(samples, data_split="train", remove_intros_outros=True):
    """Cut every audio file in ``samples`` into 10-second, 16 kHz WAV clips.

    Relies on notebook-level state: reads the globals ``path`` (directory that
    contains the files), ``label`` (class name stored with every clip) and
    ``supported_exts``, and appends one manifest entry per written clip to the
    global list ``data``.

    Args:
        samples: Iterable of file names (not full paths) located in ``path``.
        data_split: Split name; clips are written to
            ``./../data/processed/<data_split>``.
        remove_intros_outros: When true, the first three and last three
            10-second segments of every file are dropped before saving.

    Returns:
        None. Results are the files on disk and the entries added to ``data``.
    """
    segment_seconds = 10
    target_sr = 16000

    # The output folder does not depend on the file, so create it once.
    save_path = str(f'./../data/processed/{data_split}')
    makedirs(save_path, exist_ok=True)

    for file in samples:
        if not file.lower().endswith(supported_exts):
            print("SKIP")
            continue

        # No `sr` argument is given, so librosa resamples to its default rate;
        # `sr` is whatever rate it reports back.
        wav, sr = librosa.load(os.path.join(path, str(file)))

        # Split the song into consecutive full-length segments. A trailing
        # partial segment is discarded, but a file whose length is an exact
        # multiple of the segment length keeps its last full segment.
        segment_len = segment_seconds * sr
        segments = [
            wav[start:start + segment_len]
            for start in range(0, wav.shape[0] - segment_len + 1, segment_len)
        ]

        if remove_intros_outros:
            # Remove first and last 3 segments due to intros and outros
            segments = segments[3:-3]

        # Sanitised, lower-cased file name without its extension. The clip is
        # always written as WAV: soundfile picks the container from the file
        # extension and cannot write several of the accepted input formats
        # (e.g. .m4a, .wma).
        clean_name = re.sub(r"[^A-Za-z\.\d]", "", str(file)).lower()
        stem = clean_name.rpartition(".")[0]

        # NOTE(review): every label shares one folder per split, so two source
        # files with the same sanitised name overwrite each other — confirm
        # that file names are unique across the class folders.
        for idx, sample in enumerate(segments):
            # skip intros
            # NOTE(review): these two segments are skipped regardless of
            # `remove_intros_outros`, in addition to the three removed above —
            # confirm this is intended.
            if idx < 2:
                continue

            output_path = save_path + '/' + str(idx) + '-' + stem + '.wav'

            resampled = resample_poly(sample, target_sr, sr)
            sf.write(output_path, resampled, target_sr)

            # Record the clip only after it was written successfully, so the
            # manifest never points at a file that does not exist.
            data.append({
                "wav": output_path,
                "labels": str(label),
                "split": data_split
            })


# Fixed seed so the train / val / test assignment is reproducible across runs.
SPLIT_SEED = 42
# Approx 10% val, 10% test
split_proportion = 0.1

split_rng = np.random.default_rng(SPLIT_SEED)
known_labels = set(map(str, label_names))

# os.walk yields directories in arbitrary order; sorting by path ties the
# seeded shuffles below to the same directories on every run.
# `path` and `label` are deliberately module-level names: create_samples
# reads them as globals.
for path, _subdirs, filenames in sorted(dir, key=lambda entry: entry[0]):
    # Resolve the class from the top-level "<index>_<name>" folder this
    # directory lives under. A substring search over the whole path would
    # confuse labels whose names contain one another.
    top_level = os.path.relpath(path, audio_files).split(os.sep)[0]
    label = top_level.partition("_")[2]
    if label not in known_labels:
        print(f"Skipping directory without a known label: {path}")
        continue

    # Only files that will actually be processed take part in the split, so
    # the proportions are not distorted by unsupported files. Sorting makes
    # the shuffle input independent of the directory listing order.
    audio = sorted(
        fname for fname in filenames if fname.lower().endswith(supported_exts)
    )
    if not audio:
        continue

    shuffled = split_rng.permutation(audio)
    split_int = int(np.round(len(audio) * split_proportion))

    # Whole songs are assigned to a split, so clips of one song never leak
    # into another split.
    test_split = shuffled[:split_int]
    val_split = shuffled[split_int:split_int * 2]
    train_split = shuffled[split_int * 2:]

    create_samples(test_split, "test")
    create_samples(val_split, "val")
    create_samples(train_split, "train")

traindata_json = {"data": data}

# The folder may not exist yet when no clip was written.
os.makedirs("./../data/processed", exist_ok=True)
with open("./../data/processed/train_test_val_data.json", "w") as f:
    json.dump(traindata_json, f)