In [1]:
import librosa 
from librosa.feature import melspectrogram
from fastai.data.all import *
from fastai.vision.all import *
from torch.nn.functional import pad

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the ESD data folder, given relative to the current working directory.
path = Path("..") / "data" / "ESD"

From the README, we see that the following folders contain the English speakers:

In [3]:
# Folder names '0011' through '0020', each a zero-padded four-digit id.
eng_actor = [f"{speaker_id:04d}" for speaker_id in range(11, 21)]

In [51]:
@Transform
def load_audio(path):
    """Load the audio file at `path` with `librosa.load` and return the waveform.

    `librosa.load` also returns a sample rate; it is discarded here.
    """
    samples, _ = librosa.load(path)
    return samples

@Transform
def mel_transform(y):
    """Return `librosa.feature.melspectrogram` of the signal `y`, using library defaults."""
    spectrogram = melspectrogram(y=y)
    return spectrogram

@Transform
def db_transform(mel):
    """Convert `mel` to decibels with `librosa.power_to_db`, using library defaults."""
    mel_db = librosa.power_to_db(mel)
    return mel_db

In [52]:
def save_mel(mel, save_path):
    """Write the array `mel` to `save_path` as an image via `plt.imsave`.

    `origin="lower"` is forwarded to `plt.imsave` unchanged.
    """
    plt.imsave(save_path, mel, origin="lower")

def extension_to_png(path):
    """Return `path` as a `Path` whose suffix has been replaced by `.png`."""
    source = Path(path)
    return source.with_suffix(".png")

In [61]:
@Transform
def create_png(path):
    """Return the `.png` path that sits next to the audio file at `path`.

    If that image does not exist yet, the audio is run through `mel_pipeline`
    and the result is written with `save_mel`. An existing image is reused
    as-is and not regenerated.
    """
    source = Path(path)
    target = extension_to_png(source)
    if target.exists():
        # Already rendered on a previous call; skip the audio processing.
        return target
    save_mel(mel_pipeline(source), target)
    return target

@Transform
def load_png(path):
    """Open the image file at `path` with `PILImage.create`."""
    image = PILImage.create(path)
    return image

@Transform
def to_float(tens):
    """Return `tens` converted to `torch.float`."""
    return tens.to(dtype=torch.float)

@Transform
def normalize(tens):
    """Cast `tens` to float and divide every element by 255."""
    as_float = tens.float()
    return as_float / 255

def Pad_mel(size):
    """Build a transform that pads the last dimension of each sample's tensor to `size`.

    The returned transform takes an iterable of `(tensor, label)` pairs and
    returns a list of `(padded_tensor, label)` pairs. Padding is added at the
    end of the last dimension with the constant value 0; labels pass through
    untouched.

    NOTE(review): when a tensor's last dimension is larger than `size` the pad
    amount is negative -- confirm that the resulting behaviour is intended.
    """
    @Transform
    def _inner(inputs):
        return [
            (pad(tens, (0, size - tens.size(-1)), value=0), label)
            for tens, label in inputs
        ]
    return _inner

In [62]:
# File getter restricted to `.wav` files inside the `eng_actor` folders.
get_audio = FileGetter(
    folders=eng_actor,
    extensions=".wav",
)

def grandparent_label(path):
    """Return the name of the directory two levels above the file at `path`."""
    containing_dir = Path(path).parent
    return containing_dir.parent.name

def _parent_idxs(files, name):
    """Return an `L` of the positions in `files` whose `parent_label` equals `name`."""
    matches = []
    for idx, file_path in enumerate(files):
        if parent_label(file_path) == name:
            matches.append(idx)
    return L(matches)

def ParentSplitter(train_name='train', valid_name='valid'):
    """Build a splitter that partitions items by their `parent_label`.

    The returned function takes a collection of paths and returns a
    `(train_idxs, valid_idxs)` pair: the indices whose parent label equals
    `train_name`, followed by those whose parent label equals `valid_name`.
    """
    def _inner(o):
        train_idxs = _parent_idxs(o, train_name)
        valid_idxs = _parent_idxs(o, valid_name)
        return train_idxs, valid_idxs
    return _inner

In [63]:
# Maps each label name to its integer index, numbered in listing order from 0.
categories = {
    name: index
    for index, name in enumerate(["Angry", "Happy", "Neutral", "Sad", "Surprise"])
}

In [64]:
@Transform
def categorize_dict(key):
    """Look `key` up in `categories` and return the index as a tensor.

    Raises `KeyError` if `key` is not one of the names in `categories`.
    """
    class_index = categories[key]
    return torch.tensor(class_index)

In [65]:
# Audio file -> waveform -> mel spectrogram -> decibel-scaled spectrogram.
mel_pipeline = Pipeline([
    load_audio,
    mel_transform,
    db_transform,
])

# Audio file -> PNG path (rendered if missing) -> image -> tensor -> values divided by 255.
mel_images = Pipeline([
    create_png,
    load_png,
    image2tensor,
    normalize,
])

# Audio file -> grandparent folder name -> index from `categories` -> float tensor.
# NOTE(review): labels end up as float; the commented-out learner further down
# uses F.cross_entropy -- confirm this dtype is what the loss expects.
label_pipeline = Pipeline([
    grandparent_label,
    categorize_dict,
    to_float,
])

In [11]:
# Collect the audio files found under `path`, limited to the `eng_actor` folders.
files = get_audio(path, folders=eng_actor)

# Split by parent label: 'train' items for training, 'test' items for validation.
splits = ParentSplitter(valid_name='test')(files)

# Each item yields a pair: (output of mel_images, output of label_pipeline).
datasets = Datasets(
    items=files,
    tfms=[mel_images, label_pipeline],
    splits=splits,
)

In [15]:
# Batch size 2; Pad_mel(256) is supplied through the `before_batch` hook.
dls = datasets.dataloaders(
    before_batch=[Pad_mel(256)],
    bs=2,
)

In [16]:
# learn = vision_learner(dls, resnet18, metrics=accuracy, 
#                        n_out=len(categories), loss_func=F.cross_entropy)

In [17]:
# learn.fine_tune(3)

In [18]:
from tqdm import tqdm

In [19]:
# Call create_png on every entry in `files`, with a tqdm progress bar.
# create_png only writes an image when the target .png does not already exist.
for file in tqdm(files):
    create_png(file)

100%|█████████████████████████████████████| 17500/17500 [08:16<00:00, 35.24it/s]
