In [2]:
import librosa
import numpy as np
from pathlib import Path
import tensorflow as tf
import tensorflow_hub as hub

## Preprocessing steps
* ### Load the data from raw audio files
* ### Max duration is 5 second
* ### Sampling rate of 16K Hz
* ### Create separate csv files for Cry, Scream and Normal voice

In [289]:
# def extract_features(basepath, category, sr=16000, duration=5):
#     """
#     Convert audio files to numpy array.
#     Arg:
#         basepath(pathlib): location for datasets
#         category(str): 'cry', 'scream', 'normal'
#         sr (int): sampling rate
#         duration(float): duration in seconds
#     Remarks:
#         Create a numpy array of size (N, timestep_samples)
#         Saves <category>.csv in the basepath. 
#     """
#     cat_path = basepath/category
#     data = list()
#     for file in cat_path.iterdir():
#         y, _ = librosa.load(file, sr=sr, duration=duration)
#         length = sr*duration
#         if y.shape[0] != length:
#             y = np.pad(y, pad_width=(0,length - y.shape[0]),mode='constant', constant_values = (0,0))
#         data.append(y)
#     data = np.array(data)
#     np.savetxt(basepath/f'{category}.csv', data, delimiter=',')
#     print(f"File {category}.csv saved successfully")
    

In [291]:
# extract_features(basepath, 'cry')
# extract_features(basepath, 'scream')
# extract_features(basepath, 'normal')

In [4]:
basepath = Path.cwd()/'datasets'

In [6]:
def compile_filenames(basepath, category_dict):
    """
    Make list of tuple (filename, category) in basepath
    Args:
        basepath(path): path of the dataset
        categorydict(dict): key - category, value - class label
    Returns:
        list[tuple]: list of tuple (filename, classlabel)
    """
    filenames = list()
    labels = list()
    for cat in category_dict.keys():
        path = basepath/cat
        label = category_dict[cat]
        for file in path.iterdir():
            filenames.append(str(file))
            labels.append(label)
    return filenames, labels

def audio_to_numpy(filename, duration=5, sr = 16000):
    """
    """
    filename = filename.numpy().decode('utf-8')
    y, _ = librosa.load(filename, sr=sr, duration=duration)
    length = sr*duration
    if y.shape[0] != length:
        y = np.pad(y, pad_width=(0,length - y.shape[0]),mode='constant', constant_values = (0,0))
    return y

def convert_audio(filename, label):
    """
    Reads file from disk and convert to time series data.
    Args:
        filename(path): path of audio file
        label(int): class label of audio file
    Returns:
        np.ndarray: numpy array of 1D having values (0,1)
    Remarks:
        The sr is 16000 and time 5 second max
        The shape of the output is (80000,)
        Zero padding will be done if audio is not 5 sec.
    """
    y = tf.py_function(audio_to_numpy, [filename], tf.float32)
    return y, label


In [8]:
category_dict = {'normal':0, 'cry':1, 'scream':2}
filenames, labels = compile_filenames(basepath, category_dict)

In [10]:
dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(convert_audio)

I0000 00:00:1740336683.286265   62456 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6982 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1


In [11]:
train_size = int(len(filenames)*0.7)
val_size = int(len(filenames)*0.15)
test_size = len(filenames) - train_size - val_size
train_data = dataset.take(train_size)
val_data = dataset.skip(train_size).take(val_size)
test_data = dataset.skip(train_size + val_size)

## Finetune YamNet model

In [157]:
# Model Building
class YamNetLayer(tf.keras.Layer):
    def __init__(self):
        super().__init__()
        yamnet_model_handle = 'https://www.kaggle.com/models/google/yamnet/TensorFlow2/yamnet/1'
        self.yamnet_model = hub.KerasLayer(yamnet_model_handle, trainable=False)
    def call(self, x):
        y = self.yamnet_model(x)[1]
        print(y.shape)
        return y

audio_ip = tf.keras.Input(shape=())
embeddings = YamNetLayer()(audio_ip)
y = tf.keras.layers.Dense(512, activation='relu')(embeddings)
y = tf.keras.layers.Dense(3, activation='softmax')(y)
model = tf.keras.Model(inputs = audio_ip, outputs = y)

(None, 1024)


In [158]:
model.summary()

In [169]:
with tf.device("/cpu:0"):
    # for X,y in train_data:
    #     y = model(X)
    #     print(y.shape)
    ip = np.random.random(size=(16000,))
    y = model(ip)
    print(y)

(2, 1024)
tf.Tensor(
[[0.40163845 0.21706104 0.3813005 ]
 [0.3763554  0.16822122 0.45542333]], shape=(2, 3), dtype=float32)
