# Welcome to the Speech Recognition Challenge!

In [None]:
##################################################
# Imports
##################################################

import numpy as np
import cv2
import os
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as ipd


##################################################
# Params
##################################################

# Root folder of the challenge data; every CSV / .npy file below is read from here.
DATA_BASE_FOLDER = '/kaggle/input/ml-project-speech-recognition-challenge'
# Audio sampling rate in Hz; used for audio playback and to build the time axis of the plots.
SAMPLE_RATE = 16000
# Step (in samples) between consecutive points of the plotted time axis.
# NOTE(review): presumably the hop length used when the features were precomputed - confirm.
HOP_LEN = 512

# Dataset

The dataset is a reduced version of the [`TensorFlow Speech Commands Dataset`](https://www.tensorflow.org/datasets/catalog/speech_commands) and contains audio waveforms of the words:
- `down`, 
- `go`, 
- `left`, 
- `off`, 
- `on`, 
- `right`, 
- `stop`, 
- `up`.


Train / Validation Split
- 1600 train samples, 200 for each class
- 109 validation samples

In [None]:
##################################################
# Load dataset
##################################################

# Annotation tables: one row per clip, the spoken word is in the 'word' column
train_csv = os.path.join(DATA_BASE_FOLDER, 'train.csv')
validation_csv = os.path.join(DATA_BASE_FOLDER, 'validation.csv')
df_train = pd.read_csv(train_csv)
df_validation = pd.read_csv(validation_csv)

# Class index = position of the word in the alphabetically sorted vocabulary
# of the training set
labels = sorted(set(df_train['word'].values))
y_train = df_train['word'].map(labels.index).values
y_validation = df_validation['word'].map(labels.index).values


# Raw training waveforms (used for playback)
train_audio_path = os.path.join(DATA_BASE_FOLDER, 'train_audio.npy')
audio_train = np.load(train_audio_path)

# Feature Extraction

Speech is a time-series signal, and a well-known strategy for extracting a good representation of the raw audio is to mimic the processing of the human auditory system. A well-established feature representation for speech is the so-called "log mel-spectrum". This feature takes into account that humans perceive both the frequency and the amplitude of sound on a logarithmic scale. If you want to dig deeper into this topic, you can find some details [here](https://medium.com/@jonathan_hui/speech-recognition-feature-extraction-mfcc-plp-5455f5a69dd9).

![auditory-system](https://www.researchgate.net/profile/Morteza_Khaleghi_Meybodi/publication/322343133/figure/fig1/AS:581011472093184@1515535337239/Figure-31-Schematic-of-the-auditory-system-with-its-primary-components-including.png)


For this project these features are precomputed: for each audio waveform of 1 sec duration, the log mel-spectrum is a bi-dimensional representation (frequency vs time) of shape [128, 32]. Here, we first resize the "image" into a [32, 32] matrix and then we flatten the representation into a 32x32 = 1024 vector.

In [None]:
# Load the precomputed log mel-spectrogram features
x_train_raw = np.load(os.path.join(DATA_BASE_FOLDER, 'train_feat.npy'))
x_validation_raw = np.load(os.path.join(DATA_BASE_FOLDER, 'validation_feat.npy'))

# Plot audio feature
idx = 1205
# Time stamps in seconds, spaced HOP_LEN samples apart; used to label the x axis
time = np.arange(1, SAMPLE_RATE + 1, HOP_LEN) / SAMPLE_RATE
plt.figure(figsize=(10, 5))
plt.title(f'Mel-Spectrogram of audio: {df_train["word"][idx]}', fontweight='bold')
# origin must be 'upper' or 'lower'; 'lower' draws row 0 at the bottom of the image.
# (The abbreviation 'low' is rejected with a ValueError by current Matplotlib.)
plt.imshow(x_train_raw[idx], aspect='auto', origin='lower', cmap='inferno')
# Re-label the automatic column ticks with milliseconds, dropping the first
# and last tick
xticks = plt.xticks()[0].astype(np.int32)
plt.xticks(xticks[1:-1], [f'{1000 * t:.0f}' for t in time[xticks[1:-1]]])
plt.xlabel('Time [ms]', fontweight='bold')
plt.ylabel('Log Mel-Spectogram', fontweight='bold')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()

# Play audio
ipd.Audio(audio_train[idx], rate=SAMPLE_RATE)

In [None]:
# Test set: load the precomputed features from the data folder
test_feat_path = os.path.join(DATA_BASE_FOLDER, 'test_feat.npy')
x_test_raw = np.load(test_feat_path)


In [None]:
# Resize every feature map to a 32x32 "image"
x_train = np.array([cv2.resize(x_i, (32, 32)) for x_i in x_train_raw])
x_validation = np.array([cv2.resize(x_i, (32, 32)) for x_i in x_validation_raw])

# Plot audio feature
idx = 1205
plt.figure(figsize=(5, 3))
plt.title(f'Mel-Spectrogram of audio: {df_train["word"][idx]}', fontweight='bold')
# origin must be 'upper' or 'lower'; 'lower' draws row 0 at the bottom of the image.
# (The abbreviation 'low' is rejected with a ValueError by current Matplotlib.)
plt.imshow(x_train[idx], aspect='auto', origin='lower', cmap='inferno')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()

# Play audio
ipd.Audio(audio_train[idx], rate=SAMPLE_RATE)

In [None]:
# Test set: bring every feature map to the same 32x32 size as train/validation
x_test = np.array([cv2.resize(feat, (32, 32)) for feat in x_test_raw])


# 1) Convolutional Neural Network

In [None]:
import tensorflow as tf
from os import listdir
from os.path import isdir, join
from tensorflow.keras import layers, models
from tensorflow.keras.layers import BatchNormalization
import numpy as np

**Reshaping the feature matrix**

In [None]:
# Append a trailing channel axis of size 1 so each sample has the
# (height, width, channels) layout expected by the Conv2D layers.
print("Shape of x_train before reshaping: ", x_train.shape)
x_train = x_train.reshape(*x_train.shape[:3], 1)
x_validation = x_validation.reshape(*x_validation.shape[:3], 1)
# Same reshaping for the test features
x_test = x_test.reshape(*x_test.shape[:3], 1)

# One clearly labelled line per split so the shapes cannot be confused
print("After reshaping x_train", x_train.shape)
print("After reshaping x_validation", x_validation.shape)
print("After reshaping x_test", x_test.shape)

**Input Shape for CNN**

In [None]:
# Shape of a single sample (every axis of x_train except the leading sample axis);
# passed as `input_shape` to the first layer of the CNN.
sample_shape = x_train.shape[1:]

# Training phase

> As specified in the report, in this section we applied different techniques, such as dropout and batch normalization, to two CNN architectures (2 layers and 3 layers).

The following is the best-performing architecture: 3 convolutional layers and 1 dense layer (plus the softmax output layer), with neither dropout nor batch normalization, which reaches an accuracy of 93% on the validation set.

In [None]:
# Fix TensorFlow's global random seed before building the CNN so that
# re-running the notebook gives the same results.
tf.random.set_seed(1234)

In [None]:
# CNN: three Conv2D + MaxPooling2D stages, then a small dense classifier
model = models.Sequential([
    # Convolutional feature extractor
    layers.Conv2D(32, (2, 2), activation='relu', input_shape=sample_shape),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(32, (2, 2), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(64, (2, 2), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Classifier: flatten, one hidden dense layer, softmax over the 8 outputs
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(8, activation='softmax'),
])

In [None]:
# Training setup: Adam optimizer, cross-entropy on integer class labels;
# accuracy is tracked under the metric name 'acc'
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Print the layer-by-layer summary of the CNN defined above
model.summary()

In [None]:
# Train the CNN; the validation pair is passed to fit so that the val_* entries
# are recorded in `history`
validation_pair = (x_validation, y_validation)
history = model.fit(
    x_train,
    y_train,
    batch_size=50,
    epochs=30,
    validation_data=validation_pair,
)

In [None]:
##################################################
# Evaluate the model here
##################################################

# Use this function to evaluate your model
def accuracy(y_pred, y_true):
    '''
    Fraction of predictions that are equal to the true labels.

    input y_pred: array-like of shape (N,) with the predicted labels
    input y_true: array-like of shape (N,) with the true labels
    output: float, the mean of the element-wise equality
    raises ValueError: if y_pred and y_true do not have the same shape
    '''
    # Coerce to ndarrays so plain Python lists work as well; comparing two
    # lists with == would yield a single bool rather than an element-wise mask.
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    # Reject mismatched shapes explicitly: broadcasting (e.g. (N, 1) against
    # (N,)) would otherwise silently compare every prediction with every label.
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f'y_pred and y_true must have the same shape, '
            f'got {y_pred.shape} and {y_true.shape}'
        )
    return (1.0 * (y_pred == y_true)).mean()


In [None]:
# Report the accuracy in the train and validation sets.
# The class index is the arg-max over the softmax outputs returned by
# `model.predict`; this replaces `Sequential.predict_classes`, which is no
# longer available in current TensorFlow releases.
pred_cnn_train = np.argmax(model.predict(x_train), axis=-1)
acc_cnn_train = accuracy(pred_cnn_train, y_train)
print("the train accuracy of CNN is : ", acc_cnn_train)

pred_cnn = np.argmax(model.predict(x_validation), axis=-1)
acc = accuracy(pred_cnn, y_validation)
print("the accuracy of CNN is : ", acc)
# This is the result for the model without dropout and without normalization

In [None]:
# Accuracy per epoch as recorded by `model.fit`. The second curve comes from
# the validation data passed to fit, so it is labelled 'validation' (not 'test').
fig, ax = plt.subplots()
ax.plot(history.history['acc'], label='train')
ax.plot(history.history['val_acc'], label='validation')
ax.set_title('model accuracy')
ax.set_ylabel('acc')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()


In [None]:
# Loss per epoch as recorded by `model.fit`. The second curve comes from
# the validation data passed to fit, so it is labelled 'validation' (not 'test').
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Import plot_model from the same tf.keras package used for the rest of the
# notebook; the standalone `keras.utils.vis_utils` module path is not available
# in current Keras releases.
from tensorflow.keras.utils import plot_model

# Write a diagram of the model (with layer names and tensor shapes) to a PNG file
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

# Neural Network in Keras

In [None]:
# Collapse every sample into a single 1-D feature vector for the dense network
n_train, n_validation, n_test = len(x_train), len(x_validation), len(x_test)
x_train = x_train.reshape(n_train, -1)
x_validation = x_validation.reshape(n_validation, -1)
x_test = x_test.reshape(n_test, -1)

print(f'Features dimension size: {x_train.shape}')

In [None]:
# Fix TensorFlow's global random seed before building the dense network so that
# re-running the notebook gives the same results.
tf.random.set_seed(1223)

In [None]:
# Height and width of one resized feature map; their product is the length of
# the flattened input vector of the dense network.
NUM_ROWS = 32
NUM_COLS = 32
# Number of word classes.
NUM_CLASSES = 8
# Training hyper-parameters passed to model.fit below.
BATCH_SIZE = 128
EPOCHS = 100

The following is the optimal ANN architecture with 2 hidden layers. 

In [None]:
# Build neural network: two hidden dense layers followed by a softmax output
model = models.Sequential()
model.add(layers.Dense(690, activation='relu', input_shape=(NUM_ROWS * NUM_COLS,)))
# Optional third hidden layer - uncomment to try the three-layer variant
# (that variant was run with seed 123; the two-layer network uses seed 1223):
# model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
# Output width comes from NUM_CLASSES instead of a repeated literal 8
model.add(layers.Dense(NUM_CLASSES, activation='softmax'))


In [None]:
# Compile model: Adam optimizer, cross-entropy on integer class labels;
# accuracy is tracked under the metric name 'accuracy'
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
# Train model; the validation pair is passed to fit so that the val_* entries
# are recorded in `history`
validation_pair = (x_validation, y_validation)
history = model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_data=validation_pair,
)

In [None]:
# Validation accuracy of the dense network. The class index is the arg-max over
# the softmax outputs returned by `model.predict`; this replaces
# `Sequential.predict_classes`, which is no longer available in current
# TensorFlow releases.
pred_nn = np.argmax(model.predict(x_validation), axis=-1)
accnn = accuracy(pred_nn, y_validation)
print("the accuracy of NN  is : ", accnn)

In [None]:
# Accuracy per epoch as recorded by `model.fit`. The second curve comes from
# the validation data passed to fit, so it is labelled 'validation' (not 'test').
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='validation')
ax.set_title('model accuracy')
ax.set_ylabel('acc')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Loss per epoch as recorded by `model.fit`. The second curve comes from
# the validation data passed to fit, so it is labelled 'validation' (not 'test').
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

# Send the submission for the challenge

# Write the most accurate model here and send the submission

In [None]:
# Save your test prediction in y_test_pred
##################################################

# Predict the class index of every test sample as the arg-max over the softmax
# outputs of `model`.
# NOTE(review): when the notebook is run top to bottom, `model` is the dense
# network trained last and `x_test` holds the flattened features it expects.
# To submit the CNN instead, keep a separate reference to that model and feed
# it the 4-D test features.
y_test_pred = np.argmax(model.predict(x_test), axis=-1)

# Create submission: replace the 'word' column of the sample file with the
# predicted words and write the result to disk
submission = pd.read_csv(os.path.join(DATA_BASE_FOLDER, 'sample_submission.csv'))
submission.pop('word')
if y_test_pred is not None:
    submission['word'] = [labels[int(y_i)] for y_i in y_test_pred]
submission.to_csv('trial3.csv', index=False)