<a href="https://colab.research.google.com/github/bilalProgTech/mtech-nmims/blob/master/speech-recognition/Lab-Work/20220821-Lab-5-MTech-AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Tell the Kaggle CLI to look in /content for its configuration
# (assumes kaggle.json has been uploaded there — TODO confirm).
import os
os.environ['KAGGLE_CONFIG_DIR']='/content'
# Download the competition archive into the current working directory.
!kaggle competitions download -c tensorflow-speech-recognition-challenge
# Unpack every zip in the working directory, then extract train.7z
# (presumably one of the files the zip contains — verify after download).
!unzip *.zip
!7za x 'train.7z'

In [2]:
import librosa
import librosa.display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import IPython.display as ipd
import plotly as py
import plotly.graph_objs as go

In [None]:
def list_labelled_clips(audio_root, excluded_labels=('_background_noise_',)):
    """Pair every file under ``audio_root`` with the name of its parent folder.

    Parameters
    ----------
    audio_root : str
        Directory that is walked recursively with ``os.walk``.
    excluded_labels : collection of str
        Folder names to skip. Must be a tuple/list/set, NOT a bare string:
        the comparison is an exact match on the folder name. (A substring
        test against '_background_noise_' would also drop the 'no' folder,
        because "no" occurs inside "noise".)

    Returns
    -------
    (labels, paths) : tuple of two lists of equal length
        ``labels[i]`` is the parent-folder name of ``paths[i]``.
    """
    labels, paths = [], []
    for dirname, _, filenames in os.walk(audio_root):
        # The folder holding the file is the class label.
        label = os.path.basename(os.path.normpath(dirname))
        if label in excluded_labels:
            continue
        for filename in filenames:
            labels.append(label)
            paths.append(os.path.join(dirname, filename))
    return labels, paths


# Index the training audio: one row per file with its label and full path.
target_series, files = list_labelled_clips('/content/train/audio/')
data = pd.DataFrame({'target': target_series, 'filename': files})
# Shuffle the rows and renumber the index from 0.
data = data.sample(frac=1).reset_index(drop=True)
data.head()

In [None]:
# Build a class-balanced subset: 150 randomly chosen clips for every label.
# The per-class samples are collected in a list and concatenated once;
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on
# every iteration.
per_class_samples = [
    data[data['target'] == target].sample(150)
    for target in data['target'].unique()
]
# pd.concat rejects an empty list, so fall back to an empty slice of `data`.
df = pd.concat(per_class_samples) if per_class_samples else data.iloc[0:0]
# Shuffle so the classes are interleaved, then renumber the index from 0.
df = df.sample(frac=1).reset_index(drop=True)
df.head()

In [None]:
# Show (rows, columns) of the balanced subset `df`.
df.shape

In [6]:
def create_mfcc_features(path, n_mfcc=128):
    """Load an audio file and summarise it as one time-averaged MFCC vector.

    Parameters
    ----------
    path : str
        Audio file to read with ``librosa.load``.
    n_mfcc : int, default 128
        Number of MFCC coefficients to compute.

    Returns
    -------
    numpy.ndarray or list
        The mean of each coefficient over all frames (one value per
        coefficient). If the file cannot be read or processed, the failure
        is printed with the path and the error, and an empty list is
        returned, so one bad file does not abort a whole batch.
    """
    try:
        x, sr = librosa.load(path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=n_mfcc)
        # Transpose to (frames, coefficients) and average over the frames.
        return np.mean(mfccs.T, axis=0)
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so a long feature-extraction run can still be stopped.
        print(f'Error reading audio {path!r}: {exc!r}')
        return []

In [7]:
# Extract one MFCC vector per file and stack them: one row per clip.
mfcc_rows = df['filename'].apply(create_mfcc_features).tolist()
X_df = pd.DataFrame(mfcc_rows)

In [None]:
# Peek at the first rows of the MFCC feature table.
X_df.head()

# MODELS

In [None]:
# Dimensions of the MFCC feature table (rows, columns).
X_df.shape

In [11]:
# Learn the label -> integer id mapping from the subset's targets, then
# encode those same targets as the training labels `y`.
label_column = df['target']
encoder = LabelEncoder().fit(label_column)
y = encoder.transform(label_column)

In [12]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix

In [14]:
# Hold out 10% of the clips for validation; `stratify=y` keeps the class
# proportions the same in both parts.
feature_matrix = np.array(X_df)
x_train, x_val, y_train, y_val = train_test_split(
    feature_matrix,
    y,
    test_size=0.10,
    stratify=y,
)

In [15]:
# View each feature vector as a 16x8 single-channel grid (16 * 8 = 128 values)
# so it can be fed to the Conv2D layers.
grid_shape = (16, 8, 1)
x_train = x_train.reshape((x_train.shape[0], *grid_shape))
x_val = x_val.reshape((x_val.shape[0], *grid_shape))

In [None]:
# Confirm the training array's shape after the reshape.
x_train.shape

In [None]:
# Reset Keras' global state first, so re-running this cell starts clean.
tf.keras.backend.clear_session()

# One softmax output per distinct label in the full file index.
n_classes = len(data['target'].unique())

# Two conv/pool stages over the 16x8x1 input, then a small dense head.
cnn_layers = [
    tf.keras.layers.InputLayer(input_shape=(16, 8, 1)),
    tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu', padding="same"),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), activation='relu', padding="same"),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(n_classes, activation='softmax'),
]
model = tf.keras.Sequential(cnn_layers)
model.summary()

In [21]:
# Targets are integer class ids (from the label encoder), hence the
# sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the training split for 200 epochs; the object returned by fit
# is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=200,
)

In [23]:
# Class probabilities -> index of the most likely class -> original string label.
prob_val = model.predict(x_val)
pred_val = encoder.inverse_transform(prob_val.argmax(axis=1))

In [None]:
# Confusion matrix on the validation split, with an explicit class order and
# the class names on both axes. Without them the heatmap ticks are bare
# indices and cannot be read.
class_names = encoder.classes_
val_confusion = confusion_matrix(
    encoder.inverse_transform(y_val),  # true labels -> rows
    pred_val,                          # predicted labels -> columns
    labels=class_names,
)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(val_confusion, xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set(xlabel='Predicted label', ylabel='True label', title='Validation confusion matrix')
plt.show()

In [None]:
# Load the user's own recording and plot its full waveform.
x, sr = librosa.load('/content/my_audio.mp4')
# librosa 0.10 removed `waveplot` in favour of `waveshow`; prefer the new
# function and fall back only on older installs that lack it.
plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plt.figure(figsize=(20, 5))
plt.title('Waveplot')
plot_wave(x, sr=sr)
plt.show()

In [None]:
# Number of samples in the loaded recording.
x.shape

In [None]:
n_samples = x.shape[0]
# Recording length in seconds: sample count divided by the sample rate.
total_time = n_samples / sr
# Samples per second of audio (numerically this works out to ~sr); used
# later to convert a time in seconds into a sample index.
per_window = n_samples / total_time
total_time, per_window

In [None]:
# Embed an audio player for the recording so it can be listened to inline.
ipd.Audio('/content/my_audio.mp4')

In [None]:
# Reload the recording and plot only the slice between 2.3 and 2.6 times
# `per_window` (i.e. roughly seconds 2.3-2.6 of the clip).
x, sr = librosa.load('/content/my_audio.mp4')
seg_start, seg_stop = int(2.3*per_window), int(2.6*per_window)
# librosa 0.10 removed `waveplot` in favour of `waveshow`; prefer the new
# function and fall back only on older installs that lack it.
plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plt.figure(figsize=(20, 5))
plt.title('Waveplot')
plot_wave(x[seg_start:seg_stop], sr=sr)
plt.show()

In [75]:
# Cut out the same slice that was plotted and describe it with 128
# time-averaged MFCCs, the same representation used for the training clips.
segment = x[int(2.3*per_window):int(2.6*per_window)]
mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=128).T.mean(axis=0)

In [None]:
# Length of the averaged MFCC vector for the selected slice.
mfccs.shape

In [None]:
# Add a leading batch axis of size 1 and a trailing channel axis, so the
# single vector matches the model's (16, 8, 1) input.
single_input_shape = (1, 16, 8, 1)
x_test = mfccs.reshape(single_input_shape)
x_test

In [78]:
# Predict the word for the recorded slice.
# Note: this reuses (and overwrites) `prob_val` / `pred_val` from the
# validation step.
prob_val = model.predict(x_test)
top_class = np.argmax(prob_val, axis=1)
pred_val = encoder.inverse_transform(top_class)

In [None]:
# Show the predicted label for the recorded slice.
pred_val