In [None]:
# !pip install datasets
# !pip install librosa

In [None]:
import IPython.display as pds
import numpy as np
import pandas as pd
import librosa
import warnings
import random
import zipfile
import os
import re
import json
import tensorflow as tf
warnings.filterwarnings("ignore")

from tensorflow.keras import layers

In [None]:
# Clone the project repository; the archive path used in the next cell points inside this checkout.
# NOTE(review): that path starts with /content/IOH-Chat-App, so this presumably runs with
# /content as the working directory (e.g. Colab) — confirm. Re-running the cell after a
# successful clone makes git report that the destination already exists.
!git clone https://github.com/Bangkit-Capstone-Team/IOH-Chat-App.git

In [None]:
# Location of the zipped speech recordings inside the cloned repository.
SPEECH_DATA_DIR = "/content/IOH-Chat-App/Machine Learning/datasets/speech/audio.zip"

# Unpack everything under /tmp. The context manager closes the archive even
# when extraction raises, so no file handle is leaked on failure.
with zipfile.ZipFile(SPEECH_DATA_DIR) as zf:
  zf.extractall("/tmp")

In [None]:
# Output file for the extracted features, and the directory that is scanned
# for audio clips; both are passed to prepare_dataset() further down.
JSON_PATH = "/tmp/speech.json"
DATASET_PATH = "/tmp/audio/"

In [None]:
# Sampling rate (Hz) used for loading AND for feature extraction. 22050 is the
# default rate of librosa.feature.mfcc, so loading at the same rate keeps the
# "one second" window below and the MFCC time axis consistent.
SAMPLE_RATE = 22050
one_sec_signal = SAMPLE_RATE  # number of samples in one second at SAMPLE_RATE


def prepare_dataset(dataset_path, json_path, **kwargs):
  """Extract MFCC features from every usable clip and dump them to a JSON file.

  Walks `dataset_path` recursively. Empty files and clips shorter than one
  second are skipped; every other clip is truncated to exactly one second
  before its MFCCs are computed.

  The JSON written to `json_path` holds three parallel lists (one entry per
  kept clip):
    "mapping":  the text label taken from the leading word characters of the
                file name,
    "features": the MFCC matrix as nested lists (as returned by librosa,
                not transposed),
    "labels":   an integer class id; clips with the same text label share the
                same id, ids are handed out from 0 in order of first appearance.

  Args:
    dataset_path: root directory containing the audio files.
    json_path: destination of the JSON file (overwritten if it exists).
    **kwargs: forwarded to `librosa.feature.mfcc` (e.g. `n_mfcc`, `n_fft`,
      `hop_length`). `sr` defaults to SAMPLE_RATE; overriding it changes only
      the rate reported to the MFCC routine, not the rate used for loading.

  Returns:
    None. The result is only written to `json_path`.
  """
  json_data = {
      "mapping": [],
      "features": [],
      "labels": [],
  }
  label_ids = {}  # text label -> integer class id
  mfcc_kwargs = {"sr": SAMPLE_RATE, **kwargs}

  for dirpath, dirnames, files in os.walk(dataset_path):
    # Sorting in place makes os.walk descend in a reproducible order, so the
    # label ids do not depend on the file system's listing order.
    dirnames.sort()

    for audio in sorted(files):
      filename = os.path.join(dirpath, audio)
      if os.path.getsize(filename) == 0:
        continue  # nothing to decode

      signal, _ = librosa.load(filename, sr=SAMPLE_RATE)
      if len(signal) < one_sec_signal:
        continue  # shorter than the fixed one-second window

      string_label = re.findall(r"^\w*", audio)[0]
      label_id = label_ids.setdefault(string_label, len(label_ids))

      # `y=` must be passed by keyword: recent librosa releases no longer
      # accept the signal positionally.
      mfcc = librosa.feature.mfcc(y=signal[:one_sec_signal], **mfcc_kwargs)

      json_data["mapping"].append(string_label)
      # ndarray is not JSON serialisable; store plain nested lists instead.
      json_data["features"].append(mfcc.tolist())
      json_data["labels"].append(label_id)
      print(f"{filename}: {string_label}")

  with open(json_path, "w") as fp:
    json.dump(json_data, fp, indent=4)

In [None]:
# Build the feature file: read clips from DATASET_PATH, write JSON to JSON_PATH.
# No extra keyword arguments are given, so the MFCC defaults apply.
prepare_dataset(dataset_path=DATASET_PATH, json_path=JSON_PATH)

In [None]:
# Load the JSON file that prepare_dataset() wrote to JSON_PATH and preview it.
# (The previous name `json_data_path` is not defined anywhere in the notebook.)
df = pd.read_json(JSON_PATH)
df.head()

In [None]:
# Render an inline audio player for one entry of train_dataset['audio_array'].
# NOTE(review): `train_dataset` and `random_idx` are not defined in any cell visible
# above this one; on a fresh kernel this line raises NameError unless they are
# created elsewhere — confirm where they are supposed to come from.
pds.Audio(train_dataset['audio_array'][random_idx], rate=train_dataset['sample_rate'])

In [None]:
def create_model(n_class, input_shape=(22050, 1)):
  """Build the (uncompiled) 1-D convolutional classifier.

  Architecture: three Conv1D + BatchNormalization stages (32, 48, 120
  filters), max pooling and dropout, then two Dense + BatchNormalization +
  Dropout stages (128, 64 units) and a softmax output layer.

  Args:
    n_class: number of units in the softmax output layer (one per class).
    input_shape: shape of a single sample as `(steps, channels)`, which is
      what Conv1D requires. The default describes 22050 samples with one
      channel — NOTE(review): this assumes raw one-second mono audio is fed
      to the model; pass the real shape (e.g. of the MFCC matrix) if not.

  Returns:
    A `tf.keras.Sequential` model; the caller is expected to compile it.
  """
  model = tf.keras.Sequential()
  model.add(layers.Input(shape=input_shape))

  # Conv1D slides along a single axis, so its kernel size is one integer
  # (a 2-tuple such as (3, 3) is rejected when the layer is constructed).
  for n_filters in (32, 48, 120):
    model.add(layers.Conv1D(n_filters, kernel_size=3, activation='relu'))
    model.add(layers.BatchNormalization())

  model.add(layers.MaxPooling1D(2, 2))
  model.add(layers.Dropout(0.5))

  model.add(layers.Flatten())

  for n_units in (128, 64):
    model.add(layers.Dense(n_units, activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.5))

  model.add(layers.Dense(n_class, activation='softmax'))

  return model

In [None]:
lr = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# Sparse variant: targets are integer class ids, not one-hot vectors.
loss = tf.keras.losses.sparse_categorical_crossentropy

# The softmax layer needs one unit per class, not one per training example
# (which is what len(train_dataset) would give). Using max id + 1 guarantees
# every label is a valid output index for the sparse loss.
# NOTE(review): assumes train_dataset['labels'] holds non-negative integers — confirm.
n_classes = int(np.max(train_dataset['labels'])) + 1

model = create_model(n_classes)

model.compile(optimizer=optimizer,
              loss=loss,
              metrics=["accuracy"])

In [None]:
x_train = train_dataset['audio_array']
y_train = train_dataset['labels']

x_test = test_dataset['audio_array']
y_test = test_dataset['labels']

epoch = 15

# Validate on the held-out split: validating on (x_train, y_train) would only
# repeat the training metrics and hide over-fitting.
# Keeping the History object also stops its repr from being echoed as cell output.
history = model.fit(x_train,
                    y_train,
                    epochs=epoch,
                    validation_data=(x_test, y_test),
                    batch_size=128)