<a href="https://colab.research.google.com/github/bilalProgTech/mtech-nmims/blob/master/speech-recognition/Lab-Work/20220807-Lab-3-MTech-AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content/'
!kaggle datasets download -d ejlok1/toronto-emotional-speech-set-tess
!unzip *.zip

In [None]:
import tensorflow as tf
import librosa
import librosa.display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
import IPython.display as ipd

In [None]:
# Print the full path of every file found under the extracted dataset folder.
dataset_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/content/emotion_speech/')
    for name in names
)
for wav_path in dataset_paths:
    print(wav_path)

In [None]:
# Two independent accumulators: file paths and the parsed file-name parts.
files, class_series = [], []

In [None]:
# Start from empty lists inside this cell so re-running it cannot append the
# same files a second time.
files = []
class_series = []
for dirname, dirnames, filenames in os.walk('/content/emotion_speech/'):
    dirnames.sort()  # fix the traversal order so the seeded shuffle below is repeatable
    for filename in sorted(filenames):
        stem, ext = os.path.splitext(filename)
        parts = stem.split('_')
        # The frame below needs exactly three name parts (actress, word, class);
        # any other file would make the DataFrame construction fail.
        if ext.lower() != '.wav' or len(parts) != 3:
            print('Skipping unexpected file:', os.path.join(dirname, filename))
            continue
        class_series.append(parts)
        files.append(os.path.join(dirname, filename))
data = pd.DataFrame(class_series, columns=['actress', 'word', 'class'])
data['filename'] = files
# Shuffle the rows with a fixed seed so every run sees the same order.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

In [None]:
# (rows, columns) of the metadata frame.
data.shape

    ss = '/content/emotion_speech/YAF_neutral/YAF_germ_neutral.wav'
    ss.split('/')[-1].replace('.wav','').split('_')

In [None]:
# Number of rows (one per audio file) for each emotion class.
data['class'].value_counts()

In [None]:
def get_audio_feature_plots(path, class_, word, actress):
    """Plot the waveform, MFCCs and mel spectrogram of one audio file.

    Args:
        path: Path of the audio file, passed to ``librosa.load``.
        class_: Emotion label; used only in the figure titles.
        word: Spoken word; used only in the figure titles.
        actress: Speaker identifier; used only in the figure titles.

    Side effects:
        Shows three matplotlib figures and prints the shape of the MFCC matrix.
    """
    x, sr = librosa.load(path)

    # ``waveplot`` was deprecated in librosa 0.9 and removed in 0.10. Prefer its
    # replacement ``waveshow`` and only fall back on installs that predate it.
    plot_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

    plt.figure(figsize=(20, 5))
    plt.title('Waveplot of '+class_+' '+actress+' for word '+word)
    plot_wave(x, sr=sr)
    plt.show()

    plt.figure(figsize=(20, 6))
    plt.title('MFCC Spectral of '+class_+' '+actress+' for word '+word)
    mfccs = librosa.feature.mfcc(y=x, sr=sr)  # librosa's default number of coefficients
    librosa.display.specshow(mfccs, sr=sr, x_axis='time')
    print(mfccs.shape)
    plt.show()

    plt.figure(figsize=(20, 6))
    plt.title('Mel Spectrogram of '+class_+' '+actress+' for word '+word)
    mel_spec = librosa.feature.melspectrogram(y=x, sr=sr)
    librosa.display.specshow(mel_spec, sr=sr, x_axis='time')
    plt.show()

In [None]:
# Peek at the first rows of the metadata frame.
data.head()

In [None]:
# Number of rows (one per audio file) for each actress identifier.
data['actress'].value_counts()

In [None]:
# Class 'angry', word 'calm', actress 'OAF': plot the features, then embed a player.
is_match = (data['class'] == 'angry') & (data['word'] == 'calm') & (data['actress'] == 'OAF')
sample = data[is_match]
get_audio_feature_plots(
    path=sample['filename'].values[0],
    class_=sample['class'].values[0],
    word=sample['word'].values[0],
    actress=sample['actress'].values[0],
)
ipd.Audio(sample['filename'].values[0])

In [None]:
# Class 'angry', word 'calm', actress 'YAF': plot the features, then embed a player.
is_match = (data['class'] == 'angry') & (data['word'] == 'calm') & (data['actress'] == 'YAF')
sample = data[is_match]
get_audio_feature_plots(
    path=sample['filename'].values[0],
    class_=sample['class'].values[0],
    word=sample['word'].values[0],
    actress=sample['actress'].values[0],
)
ipd.Audio(sample['filename'].values[0])

In [None]:
# Class 'fear', word 'calm', actress 'YAF': plot the features, then embed a player.
is_match = (data['class'] == 'fear') & (data['word'] == 'calm') & (data['actress'] == 'YAF')
sample = data[is_match]
get_audio_feature_plots(
    path=sample['filename'].values[0],
    class_=sample['class'].values[0],
    word=sample['word'].values[0],
    actress=sample['actress'].values[0],
)
ipd.Audio(sample['filename'].values[0])

In [None]:
# Class 'fear', word 'calm', actress 'OAF': plot the features, then embed a player.
is_match = (data['class'] == 'fear') & (data['word'] == 'calm') & (data['actress'] == 'OAF')
sample = data[is_match]
get_audio_feature_plots(
    path=sample['filename'].values[0],
    class_=sample['class'].values[0],
    word=sample['word'].values[0],
    actress=sample['actress'].values[0],
)
ipd.Audio(sample['filename'].values[0])

# Modelling

## Feature Extraction

In [None]:
def create_mfcc_features(path, n_mfcc=128):
    """Return the time-averaged MFCC vector of one audio file.

    Args:
        path: Path of the audio file, passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to compute (default 128).

    Returns:
        The MFCC matrix averaged over its second axis, i.e. one value per
        coefficient. If loading or feature extraction fails, an empty list is
        returned and the failing path is printed, so one bad file does not
        abort a whole batch.
    """
    try:
        x, sr = librosa.load(path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfccs.T, axis=0)
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop a long-running extraction.
        print(f'Error reading audio {path}: {err}')
        return []

In [None]:
def create_melspec_features(path, n_mels=128):
    """Return the time-averaged mel-spectrogram vector of one audio file.

    Args:
        path: Path of the audio file, passed to ``librosa.load``.
        n_mels: Number of mel bands to compute (default 128).

    Returns:
        The mel spectrogram averaged over its second axis, i.e. one value per
        mel band. If loading or feature extraction fails, an empty list is
        returned and the failing path is printed, so one bad file does not
        abort a whole batch.
    """
    try:
        x, sr = librosa.load(path, res_type='kaiser_fast')
        melspec = librosa.feature.melspectrogram(y=x, sr=sr, n_mels=n_mels)
        return np.mean(melspec.T, axis=0)
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop a long-running extraction.
        print(f'Error reading audio {path}: {err}')
        return []

In [None]:
%%time
# One MFCC feature vector per audio file, stacked into a table (one row per file).
mfcc_rows = data['filename'].apply(create_mfcc_features)
X_df = pd.DataFrame(mfcc_rows.tolist())
X_df.head()

In [None]:
# (rows, columns) of the MFCC feature table.
X_df.shape

In [None]:
# Turn the emotion names into integer ids; `encoder` is kept to map ids back later.
encoder = LabelEncoder()
y = encoder.fit_transform(data['class'])

In [None]:
# (rows, columns) of the metadata frame, for comparison with the feature table.
data.shape

In [None]:
# Hold out 10% of the rows for validation. The fixed seed makes the split
# repeatable; stratifying on y keeps class proportions similar in both parts.
x_train, x_val, y_train, y_val = train_test_split(
    np.array(X_df), y, test_size=0.10, random_state=42, stratify=y
)

In [None]:
# Shapes of the training and validation feature arrays.
x_train.shape, x_val.shape

In [None]:
# Lay each feature vector out as a 16x8 grid with one channel (16 * 8 = 128 values)
# so it matches the Conv2D input shape.
x_train = np.reshape(x_train, (x_train.shape[0], 16, 8, 1))
x_val = np.reshape(x_val, (x_val.shape[0], 16, 8, 1))

## CNN Model

In [None]:
# Reset Keras' global state before building the model.
tf.keras.backend.clear_session()

# Two conv/pool stages followed by a small dense head; the output layer has one
# softmax unit per distinct value in data['class'].
cnn_layers = [
    tf.keras.layers.InputLayer(input_shape=(16, 8, 1)),
    tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu', padding="same"),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), activation='relu', padding="same"),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(len(data['class'].unique()), activation='softmax'),
]
model = tf.keras.Sequential(cnn_layers)
model.summary()

In [None]:
# The sparse loss takes the integer class ids in y directly (no one-hot encoding).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the training split for 20 epochs; keep fit's return value in `history`.
history = model.fit(x_train, y_train, epochs=20)

In [None]:
# Score the trained model on the held-out validation split.
model.evaluate(x_val, y_val)

In [None]:
# Class probabilities for the validation split -> most likely class id -> class name.
prob_val = model.predict(x_val)
pred_val = encoder.inverse_transform(prob_val.argmax(axis=1))

In [None]:
# Rows are true classes, columns are predicted classes. Passing `labels` pins the
# order and keeps a row/column for a class even if it is absent from this split;
# wrapping in a DataFrame attaches the class names to the displayed matrix.
pd.DataFrame(
    confusion_matrix(encoder.inverse_transform(y_val), pred_val, labels=encoder.classes_),
    index=pd.Index(encoder.classes_, name='true'),
    columns=pd.Index(encoder.classes_, name='predicted'),
)