<a href="https://www.kaggle.com/code/conweezy/cats-vs-dogs-audio-classication-88-test-acc?scriptVersionId=115764207" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<font size="5">**1. Import Libraries**</font>

In [None]:
import numpy as np
import librosa
import librosa.display
from pydub import AudioSegment
import math
import os
import sklearn
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

<font size="5">**2. Create a class to load WAV files.**</font>

In [None]:
class Loader:
    """Load WAV files from disk as audio signals.

    The constructor arguments are stored unchanged. ``duration`` is kept on
    the instance but is not forwarded to ``librosa.load`` by :meth:`load`.
    """

    def __init__(self, sample_rate, mono, duration):
        self.sample_rate, self.mono, self.duration = sample_rate, mono, duration

    def load(self, file_path):
        """Return the signal decoded from *file_path*.

        ``librosa.load`` yields a ``(signal, sample_rate)`` pair; only the
        signal is returned.
        """
        signal, _ = librosa.load(file_path, sr=self.sample_rate, mono=self.mono)
        return signal

In [None]:
# Smoke-test the Loader class on a single cat recording and play it back.
from IPython.display import Audio

SAMPLE_RATE = 22050
DURATION = 5
MONO = True
FILE_PATH_CAT = '../input/audio-cats-and-dogs/cats_dogs/train/cat/cat_1.wav'

loader = Loader(sample_rate=SAMPLE_RATE, mono=MONO, duration=DURATION)
signal_cat = loader.load(FILE_PATH_CAT)

# Last expression of the cell, so the notebook renders it as its output.
Audio(signal_cat, rate=SAMPLE_RATE)

In [None]:
# Load one dog recording and play it back, mirroring the cat example.
from IPython.display import Audio

FILE_PATH_DOG = '../input/audio-cats-and-dogs/cats_dogs/train/dog/dog_barking_1.wav'

# Loader's signature is (sample_rate, mono, duration). Keyword arguments keep
# MONO and DURATION from being passed in each other's position.
loader = Loader(sample_rate=SAMPLE_RATE, mono=MONO, duration=DURATION)
signal_dog = loader.load(FILE_PATH_DOG)

# `sr` is kept as a module-level name in case a later cell reads it; it is the
# same rate the signal was loaded with.
sr = SAMPLE_RATE

# Last expression of the cell, so the notebook renders it as its output.
Audio(signal_dog, rate=sr)

In [None]:
# Plot the waveform of the cat sample loaded above (`signal_cat`).
librosa.display.waveshow(signal_cat)

In [None]:
# Plot the waveform of the dog sample loaded above (`signal_dog`).
librosa.display.waveshow(signal_dog)

<font size="5">**2. Create helper classes.**</font>

Looking at the two samples above, as well as other samples, we see that the recordings are longer than we need for this classification task. Audio classifiers tend to work better with shorter samples of uniform duration. Additionally, there is not a lot of data for either class. To help solve both of these issues, we can split all the current recordings into 1-second samples. This creates more data, all of uniform length.

To improve upon this, it would also be possible to split the samples precisely on each cat meow or dog bark. For example, in the dog waveform plot above, you could split from 0–2 seconds, then from about 3.5–5 seconds.

More helper classes are created below, which convert the WAV audio to a spectrogram, pad any samples that end up shorter than 1 second, and then normalize all the samples using min-max normalization.

In [None]:
class SplitWavAudio():
    """Split one WAV file into consecutive segments of uniform duration.

    The file ``load_folder/filename`` is read once when the object is created.
    Each segment is written to ``save_folder`` as its own WAV file.
    """

    def __init__(self, load_folder, save_folder, filename):
        self.load_folder = load_folder
        self.filename = filename
        self.filepath = os.path.join(load_folder, filename)
        self.save_folder = save_folder
        self.audio = AudioSegment.from_wav(self.filepath)

    def get_duration(self):
        """Return the duration of the loaded audio in seconds."""
        return self.audio.duration_seconds

    def single_split(self, from_sec, to_sec, split_filename):
        """Write the slice from *from_sec* to *to_sec* as *split_filename*.

        The output folder is created if it does not exist yet.
        """
        # The audio object is sliced in milliseconds, hence the factor 1000.
        start_ms = from_sec * 1000
        end_ms = to_sec * 1000
        segment = self.audio[start_ms:end_ms]

        os.makedirs(self.save_folder, exist_ok=True)
        target = os.path.join(self.save_folder, split_filename)

        # export() returns the output file object still open; close it so that
        # splitting many files does not accumulate open handles.
        exported = segment.export(target, format="wav")
        if exported is not None:
            exported.close()

    def multiple_split(self, seconds_per_split):
        """Split the whole file into chunks of *seconds_per_split* seconds.

        Output files are named ``<start_second>_<original filename>``. The
        final chunk may be shorter than the others when the duration is not a
        multiple of *seconds_per_split*.
        """
        # Round up so a trailing partial chunk is still exported.
        total_seconds = math.ceil(self.get_duration())
        for start in range(0, total_seconds, seconds_per_split):
            split_fn = f'{start}_{self.filename}'
            self.single_split(start, start + seconds_per_split, split_fn)
        print('Done')

In [None]:
class spectrogram_extractor:
    """Turn an audio signal into a magnitude spectrogram via the STFT."""

    def __init__(self, frame_size, hop_length):
        self.frame_size, self.hop_length = frame_size, hop_length

    def spec_extract(self, signal):
        """Return the absolute value of the STFT of *signal*.

        The most recent input is also kept on the instance as ``self.signal``.
        ``frame_size`` is used as the FFT size (``n_fft``).
        """
        self.signal = signal
        return np.abs(
            librosa.stft(signal, n_fft=self.frame_size, hop_length=self.hop_length)
        )

In [None]:
# Display a sample spectrogram of the cat audio.
FRAME_SIZE, HOP_LENGTH = 512, 256

ext = spectrogram_extractor(frame_size=FRAME_SIZE, hop_length=HOP_LENGTH)
spec = ext.spec_extract(signal_cat)

librosa.display.specshow(spec)

In [None]:
# The raw WAV durations are not integer values, so some of the split samples
# are shorter than 1 second. Right padding is applied to fix this.

def pad_along_axis(array: np.ndarray, target_length: int, axis: int = 0) -> np.ndarray:
    """Right-pad *array* with zeros along *axis* up to *target_length*.

    Args:
        array: Input array.
        target_length: Desired size of *array* along *axis*.
        axis: Axis to pad; only the end of this axis receives padding.

    Returns:
        The padded array, or *array* itself (not a copy) when it is already at
        least *target_length* long along *axis*. Longer arrays are not cut.
    """
    missing = target_length - array.shape[axis]
    if missing > 0:
        # No padding on any axis except the requested one, and only at its end.
        widths = [(0, 0) for _ in range(array.ndim)]
        widths[axis] = (0, missing)
        return np.pad(array, pad_width=widths, mode='constant', constant_values=0)
    return array

In [None]:
class MinMaxNormaliser:
    """MinMaxNormaliser applies min max normalisation to an array.

    Values are rescaled linearly so that the array's own minimum maps to
    ``min_val`` and its own maximum maps to ``max_val``.
    """

    def __init__(self, min_val, max_val):
        self.min = min_val
        self.max = max_val

    def normalise(self, array):
        """Return *array* rescaled to the range ``[self.min, self.max]``.

        A constant array (maximum equal to minimum, e.g. a silent clip) has no
        spread to rescale. It is mapped to ``self.min`` everywhere instead of
        producing NaN from a 0/0 division.

        Raises:
            ValueError: If *array* is empty (raised by ``array.min()``).
        """
        lowest = array.min()
        spread = array.max() - lowest
        if spread == 0:
            # (array - lowest) is all zeros here; dividing by 1 keeps it so.
            spread = 1
        norm_array = (array - lowest) / spread
        norm_array = norm_array * (self.max - self.min) + self.min
        return norm_array


In [None]:
def create_dataset(path, target_frames=87):
    """Append a normalised, padded spectrogram and a label for each file in *path*.

    This function works on module-level objects that must exist before it is
    called: ``loader``, ``ext`` and ``min_max_normaliser`` do the processing,
    and the lists ``X_data`` / ``y_data`` are appended to in place. Nothing is
    returned.

    Args:
        path: Directory whose entries are loaded one by one.
        target_frames: Number of spectrogram columns (axis 1) every sample is
            padded or cut to, so that all samples share one shape.

    Labels are 0 when the file name contains "cat" and 1 otherwise. Files that
    cannot be loaded or transformed are reported and skipped.
    """
    # Sorted so the sample order is reproducible across runs.
    for name in sorted(os.listdir(path)):
        file_path = os.path.join(path, name)
        try:
            signal = loader.load(file_path)
            spectrogram = ext.spec_extract(signal)
        except Exception as err:
            # Best effort: skip unreadable entries, but say so. Unlike a bare
            # `except:`, this does not swallow KeyboardInterrupt.
            print(f'Skipping {file_path}: {err}')
            continue
        spec_norm = min_max_normaliser.normalise(spectrogram)
        # Pad short clips, and cut long ones, to exactly `target_frames` columns.
        spec_pad = pad_along_axis(spec_norm, target_frames, 1)[:, :target_frames]
        X_data.append(spec_pad)
        # Use the file name only: a directory such as "audio-cats-and-dogs"
        # would otherwise make every sample match "cat".
        y_data.append(0 if 'cat' in name.lower() else 1)


<font size="5">**3. Create the Training and Testing Datasets.**</font>



In [None]:
# First run the step that splits the WAV files into 1-second samples.
# This was run on all Train and Test folders (4 runs in total, changing
# SOURCE_DIR each time), with the outputs saved in a new Data folder.
# A fresh train/test split is created later.
SOURCE_DIR = '../input/audio-cats-and-dogs/cats_dogs/test/test'
OUTPUT_DIR = '/kaggle/working/Data'

# The export fails if the output folder does not exist, so create it up front.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for f in os.listdir(SOURCE_DIR):
    split = SplitWavAudio(SOURCE_DIR, OUTPUT_DIR, f)
    split.multiple_split(1)

In [None]:
# Create the full dataset from the 1-second clips.
PATH = '/kaggle/working/Data'

DURATION = 5
MONO = True
FRAME_SIZE = 512
HOP_LENGTH = 256

# Loader's signature is (sample_rate, mono, duration). Keyword arguments keep
# MONO and DURATION from being passed in each other's position.
loader = Loader(sample_rate=SAMPLE_RATE, mono=MONO, duration=DURATION)
ext = spectrogram_extractor(FRAME_SIZE, HOP_LENGTH)
min_max_normaliser = MinMaxNormaliser(0, 1)

# create_dataset() appends to these module-level lists in place.
X_data = []
y_data = []

create_dataset(PATH)

In [None]:
# Make sure all the files got added to the dataset.
# Cats are labeled 0 in y_data and dogs 1, so the non-zero count is the number
# of dog samples. It shows that there is still much more cat data than dog data.
# To improve overall performance, the model could be adjusted to account for
# this uneven amount of data.
X_data, y_data = np.array(X_data), np.array(y_data)

print(X_data.shape, y_data.shape, sep='\n')
np.count_nonzero(y_data)

In [None]:
# Use scikit-learn to shuffle the data and create an 80/20 train/test split.
# `import sklearn` alone does not guarantee that the `model_selection`
# submodule is loaded, so import the function explicitly.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    shuffle=True,
    # The classes are imbalanced; keep the cat/dog ratio equal in both splits.
    stratify=y_data,
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# The convolutional model expects a channel dimension. The audio is mono, so
# there is just 1 channel.
X_train = X_train.reshape(-1, 257, 87, 1)
X_test = X_test.reshape(-1, 257, 87, 1)
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Finally, convert the NumPy arrays into TensorFlow dataset objects.

BATCH_SIZE = 15

# Shuffle BEFORE batching so individual samples are mixed. Shuffling after
# batch() would only reorder whole batches whose contents never change. A
# buffer as large as the training set gives a full shuffle.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=len(X_train))
    .batch(BATCH_SIZE, drop_remainder=False)
)

# Evaluation metrics do not depend on sample order, so the test set is only
# batched, not shuffled.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(BATCH_SIZE, drop_remainder=False)
)

print(train_dataset)
print(test_dataset)

<font size="5">**4. Create the Model.**</font>

The model used is a convolutional neural network with five convolutional layers. Other architectures commonly used for time series, such as LSTMs, might provide good results as well.

In [None]:
# Create the model: five convolutional layers followed by a dense classifier.
cnn = tf.keras.models.Sequential([

    # The first convolution; the input is one (257, 87) spectrogram with 1 channel
    tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=(257, 87, 1)),
    tf.keras.layers.MaxPooling2D(2, 2),

    # The second convolution
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),

    # The third convolution
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),

    # The fourth convolution
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),

    # The fifth convolution. No pooling follows it: with a (257, 87) input the
    # feature map is already down to 12 x 1 here, too small to pool again.
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),

    # Flatten the results to feed into a DNN
    tf.keras.layers.Flatten(),
    # 512 neuron hidden layer
    tf.keras.layers.Dense(512, activation='relu'),
    # Only 1 output neuron. It holds a value from 0-1, where 0 stands for one
    # class ('cats') and 1 for the other ('dogs')
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
cnn.summary()

In [None]:
# Training configuration: number of epochs, optimiser, loss and metric.
EPOCHS = 50

cnn.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

<font size="5">**5. Train and Evaluate the Model.**</font>

In [None]:
# Fit the CNN on the training batches for EPOCHS epochs; the training log is
# kept in `history`.
history = cnn.fit(train_dataset, epochs=EPOCHS, verbose=1)

In [None]:
# Evaluate the trained model on the held-out test batches. The result is kept
# in `validate` (presumably [loss, accuracy], given the metrics passed to
# compile() — confirm against the Keras version in use).
validate = cnn.evaluate(test_dataset)