In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os, glob
import librosa
import random
import soundfile as sf
from tqdm import tqdm
# import librosa.display
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Dropout, MaxPooling2D, LSTM, Flatten, Input, Lambda

Make Directory to File Path

In [2]:
def load_dir(directory):
    audio_dir = []
    labels = []

    for folder in os.listdir(directory):
        folder_path = os.path.join(directory, folder)
        if os.path.isdir(folder_path):
            label_file = os.path.join(folder_path, 'new_label.txt')
            with open(label_file, 'r') as f:
                sds_score = float(f.read().strip())
                if sds_score < 40:
                    label = 'low'
                elif 41 >= sds_score <= 60:
                    label = 'medium'
                else:
                    label = 'high'
            for file in os.listdir(folder_path):
                if file.endswith('_out.wav'):
                    audio_path = os.path.join(folder_path, file)
                    audio_dir.append(audio_path)
                    labels.append(label)

    return audio_dir, labels

dir = '/content/drive/MyDrive/Capstone/data/EATD-Corpus/'
ds = load_dir(dir)
audio_dir, labels = ds

Split directory for train, validation, test set

In [3]:
# Split train dan test set (X:data, y:label)
X_train, X_val, y_train, y_val = train_test_split(audio_dir, labels, test_size=0.2, random_state=123)
# Split kedua untuk validation set
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

In [50]:
y_train

[0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 1,
 0,
 0,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 2,
 0,
 0,
 0,
 0,


In [4]:
print(len(X_train))
print(len(X_val))

388
98


PRE-PROCESS DATA

1. Resample
2. Mono to stereo
3. Padding
4. Melspectrogram
5. Melspectrogram augment

In [5]:
def open(file):
    y, sr = librosa.load(file)
    return (y, sr)

def rechannel(audio, channel = 2):
    y, sr = audio
    if (y.shape[0] == channel):
        return y
    if (channel == 1):
        y_rechan = y[:1, :]
    else:
        y_rechan = tf.convert_to_tensor(y, dtype=tf.float32)
        y_rechan = tf.stack([y_rechan, y_rechan], axis=-1)
        y_rechan = y_rechan.numpy()
    return ((y_rechan, sr))

def resample2(audio, nsr):
    y, sr = audio
    resamp = np.zeros((2, int(np.ceil(len(y) / sr * nsr))))
    for i in range(y.shape[1]):
        resamp_chan = librosa.resample(y[:, i], orig_sr = sr, target_sr=nsr)
        if len(resamp_chan) > resamp.shape[1]:
          resamp_chan = resamp_chan[:-1]
        elif len(resamp_chan) < resamp.shape[1]:
          resamp_chan = np.pad(resamp_chan, (0,1), mode='constant')
        resamp[i, :] = resamp_chan
    return((resamp, nsr))

def padding_trunc(audio, max_size):
    y, sr = audio
    rows, y_len = y.shape
    max_len = sr//1000 * max_size

    if (y_len > max_len):
        # lakukan truncating/potong size
        y = y[:, :max_len]

    elif (y_len < max_len):
        #lakukan padding/penambahan size
        len_begin = random.randint(0, max_len - y_len)
        len_end = max_len - y_len - len_begin
        # padding dengan zero
        pad_begin = tf.zeros((rows, len_begin))
        pad_end = tf.zeros((rows, len_end))

        y = tf.concat([pad_begin, y, pad_end], 1)

    return (y, sr)

def time_shift_augment(audio, shift_limit=0.2):
    y, sr = audio
    _, y_len = y.shape

    shift_amount = tf.random.uniform([], -shift_limit, shift_limit) * tf.cast(y_len, tf.float32)
    shift_amount = tf.cast(shift_amount, tf.int32)

    # Melakukan time shift
    shifted_y = tf.roll(y, shift=shift_amount, axis=0)

    return (shifted_y, sr)

def mel_features(audio, n_mels=64, n_fft= 1024, hop_length=None):
    y, sr = audio
    top_db = 80
    if hop_length is None:
        n_fft // 2

    if isinstance(y, tf.Tensor):
        y = y.numpy()

    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
    mels_db = librosa.power_to_db(mels, ref=np.max, top_db=top_db)
    # mels_db = tf.convert_to_tensor(mels_db, dtype=tf.float32)

    return mels_db

def mels_augment(mels_db, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = mels_db.shape
    mask_value = tf.reduce_mean(mels_db)
    augment_mels = tf.identity(mels_db)

    freq_mask_param = int(max_mask_pct * n_mels)
    for _ in range(n_freq_masks):
        # Membuat mask frekuensi
        f = tf.random.uniform(shape=(), minval=0, maxval=freq_mask_param, dtype=tf.int32)
        f0 = tf.random.uniform(shape=(), minval=0, maxval=n_mels - f, dtype=tf.int32)
        mask = tf.concat(
            [
                tf.ones(shape=(f0,), dtype=tf.bool),
                tf.zeros(shape=(f,), dtype=tf.bool),
                tf.ones(shape=(n_mels - f0 - f,), dtype=tf.bool),
            ],
            0,
        )
        mask = tf.reshape(mask, (1, n_mels, 1))  # Ubah shape agar sesuai dengan spectrogram
        augment_mels = tf.where(mask, mask_value, augment_mels)

    time_mask_param = max(1, int(max_mask_pct * n_steps))
    for _ in range(n_time_masks):
        # Membuat mask waktu
        t = tf.random.uniform(shape=(), minval=0, maxval=time_mask_param, dtype=tf.int32)
        t0 = tf.random.uniform(shape=(), minval=0, maxval=n_steps - t, dtype=tf.int32)
    mask = tf.concat(
            [
                tf.ones(shape=(t0,), dtype=tf.bool),
                tf.zeros(shape=(t,), dtype=tf.bool),
                tf.ones(shape=(n_steps - t0 - t,), dtype=tf.bool),
            ],
            0,
        )

    mask = tf.reshape(mask, (1, 1, n_steps))  # Ubah shape agar sesuai dengan spectrogram
    augment_mels = tf.where(mask, mask_value, augment_mels)

    return augment_mels

def preprocess_audio(filepath):
    nsr = 44000
    max_size = 1000
    audio  = open(filepath)
    audio_stereo = rechannel(audio)
    audio_resamp = resample2(audio_stereo, nsr)
    audio_padd = padding_trunc(audio_resamp, max_size)
    audio_mels = mel_features(audio_padd)
    audio_mels_augment = mels_augment(audio_mels)

    return audio_mels_augment

def load_audio(dir, label):
  features = []
  labels = []
  for file, label in zip(dir, label):
    x = preprocess_audio(file)
    features.append(x)
    labels.append(label)

  return features, labels

In [38]:
ii = ['/content/drive/MyDrive/Capstone/data/EATD-Corpus/t_1/negative_out.wav',
      '/content/drive/MyDrive/Capstone/data/EATD-Corpus/t_8/positive_out.wav']
il = [0, 0]

# i = '/content/drive/MyDrive/Capstone/data/EATD-Corpus/t_8/positive_out.wav'
# ii = ['/content/drive/MyDrive/Capstone/data/EATD-Corpus/t_8/positive_out.wav']
# il = [0]

# n5 = open(i)
# n5 = rechannel(n5)
# n5 = resample2(n5, nsr=44000)
# n5 = padding_trunc(n5, max_size=1000)
# y, sr = n5
# y.shape
a, b = load_audio(ii, il)
a

[<tf.Tensor: shape=(2, 64, 172), dtype=float64, numpy=
 array([[[-50.40157124, -50.40157124, -50.40157124, ..., -50.40157124,
          -50.40157124, -50.40157124],
         [-50.40157124, -50.40157124, -50.40157124, ..., -50.40157124,
          -50.40157124, -50.40157124],
         [-50.40157124, -50.40157124, -50.40157124, ..., -50.40157124,
          -50.40157124, -50.40157124],
         ...,
         [-50.40157124, -50.40157124, -50.40157124, ..., -50.40157124,
          -50.40157124, -50.40157124],
         [-50.40157124, -50.40157124, -50.40157124, ..., -50.40157124,
          -50.40157124, -50.40157124],
         [-50.40157124, -50.40157124, -50.40157124, ..., -50.40157124,
          -50.40157124, -50.40157124]],
 
        [[-50.40157124, -50.40157124, -50.40157124, ..., -50.40157124,
          -50.40157124, -50.40157124],
         [-50.40157124, -50.40157124, -50.40157124, ..., -50.40157124,
          -50.40157124, -50.40157124],
         [-50.40157124, -50.40157124, -50.401571

In [6]:
# Label Encoding
encod = LabelEncoder()
y_train = encod.fit_transform(y_train)
y_val = encod.fit_transform(y_val)

In [8]:
print(type(y_train))

<class 'numpy.ndarray'>


In [9]:
X_train = [preprocess_audio(file) for file in X_train]
X_val = [preprocess_audio(file) for file in X_val]

In [11]:
X_train = np.array([tensor.numpy().astype(np.float32) for tensor in X_train])
X_val = np.array([tensor.numpy().astype(np.float32) for tensor in X_val])

In [14]:
y_train = np.array(y_train).astype(np.int32)
y_val = np.array(y_val).astype(np.int32)

In [16]:
# Buat dataset
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

In [19]:
train_dataset

<_TensorSliceDataset element_spec=(TensorSpec(shape=(2, 64, 172), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [20]:
buffer_size = 310

train_dataset = train_dataset.shuffle(buffer_size=buffer_size).batch(32)
val_dataset = val_dataset.batch(32)

In [28]:
model = Sequential([
    # tf.keras.Input(shape=(train_padding.shape[1], train_padding.shape[2], 1)),
    Input(shape=(2, 64, 172)),
    Conv2D(2, 8, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    tf.keras.layers.ZeroPadding2D(padding=(1,1)),
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    tf.keras.layers.ZeroPadding2D(padding=(1,1)),
    Conv2D(64, (2, 2), activation='relu', padding='same'),
    MaxPooling2D((2, 2)),
    tf.keras.layers.ZeroPadding2D(padding=(1,1)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

In [29]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_12 (Conv2D)          (None, 2, 64, 32)         49568     
                                                                 
 max_pooling2d_12 (MaxPooli  (None, 1, 32, 32)         0         
 ng2D)                                                           
                                                                 
 zero_padding2d_3 (ZeroPadd  (None, 3, 34, 32)         0         
 ing2D)                                                          
                                                                 
 conv2d_13 (Conv2D)          (None, 3, 34, 64)         18496     
                                                                 
 max_pooling2d_13 (MaxPooli  (None, 1, 17, 64)         0         
 ng2D)                                                           
                                                      

In [31]:
model.fit(train_dataset, epochs = 10, validation_data=val_dataset)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e6bb03314b0>