<a href="https://colab.research.google.com/github/bilalProgTech/mtech-nmims/blob/master/speech-recognition/Lab-Work/20221009-Lab-8-2-MTech-AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Tell the Kaggle CLI to use /content as its configuration directory
# (presumably where the kaggle.json credentials file was uploaded - confirm).
os.environ['KAGGLE_CONFIG_DIR']='/content'
# Download the dataset archive, then extract it into the current directory.
!kaggle datasets download -d sourabhy/hindi-speech-recognition
!unzip hindi-speech-recognition.zip

In [None]:
import librosa
import librosa.display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
def preprocessing_num(filenum):
    """Return ``filenum`` as a string left-padded with zeros to four characters.

    The result is used to rebuild audio file names, so the id must have its
    full width again (presumably the leading zeros were lost when the id
    column was parsed as a number - confirm against the transcription file).

    Parameters
    ----------
    filenum : int or str
        Utterance id, with or without its leading zeros.

    Returns
    -------
    str
        The id padded to at least four characters; ids that are already four
        characters or longer are returned unchanged.
    """
    # zfill pads ids of any width (1, 2 or 3 digits), not only 3-digit ones.
    return str(filenum).zfill(4)

In [None]:
# Read the transcription file without a header row, splitting every line at '_'
# into two columns: the part before the underscore ('file') and the rest
# ('transcription').
actual_trans = pd.read_csv('/content/test/transcription.txt', sep='_', header=None)
actual_trans.columns = ['file', 'transcription']
actual_trans.head()

In [None]:
# Convert every id in the 'file' column to its zero-padded string form,
# then preview the table.
actual_trans['file'] = actual_trans['file'].map(preprocessing_num)
actual_trans.head()

In [None]:
# Turn the id into a full audio path:
#   /content/test/audio/<id>_<first three characters of 'transcription'>.wav
# The 'transcription' column still starts with that three-character prefix here.
actual_trans['file'] = '/content/test/audio/' + actual_trans['file'] + '_' + actual_trans['transcription'].str[:3] + '.wav'
actual_trans.head()

In [None]:
# Drop the first three characters of 'transcription' (the prefix that was used
# to build the file name), keeping only the remaining text.
# Note: this overwrites the column in place, so re-running the cell strips
# another three characters.
actual_trans['transcription'] = actual_trans['transcription'].str[3:]
actual_trans.head()

In [None]:
def create_mfcc_features(path, n_mfcc=128):
    """Load an audio file and summarise it as one time-averaged MFCC vector.

    Parameters
    ----------
    path : str
        Location of the audio file; passed straight to ``librosa.load``.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 128, which is the
        size the rest of the notebook reshapes to ``16 x 8``.

    Returns
    -------
    numpy.ndarray or list
        Array of shape ``(n_mfcc,)`` with each coefficient averaged over all
        frames, or an empty list when the file could not be processed.
    """
    try:
        signal, sample_rate = librosa.load(path, res_type='kaiser_fast')
        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
        # Transpose so axis 0 runs over frames, then average the frames away.
        return np.mean(coefficients.T, axis=0)
    except Exception as exc:
        # Best-effort: one unreadable file must not abort a long extraction
        # run. Catching Exception (not a bare except) still lets
        # KeyboardInterrupt / SystemExit stop the loop, and the message names
        # the offending file and the reason.
        print(f'Error reading audio {path}: {exc}')
        return []

In [None]:
# (rows, columns) of the transcription table.
actual_trans.shape

In [None]:
# Pre-compute one MFCC vector per audio file, keyed by the bare file name
# (the last '/'-separated component of the path).
mfcc_features = {
    wav_path.split('/')[-1]: create_mfcc_features(wav_path)
    for wav_path in actual_trans['file'].values
}

In [None]:
# Wrap every transcription in 'startseq' / 'endseq' markers: decoding is seeded
# with 'startseq' and stops once 'endseq' is produced.
# Note: the column is overwritten in place, so re-running this cell adds the
# markers a second time.
actual_trans['transcription'] = 'startseq ' + actual_trans['transcription'] + ' endseq'

In [None]:
# Array of the marked-up transcriptions; used to fit the tokenizer.
sentences = actual_trans['transcription'].values

In [None]:
# Learn the word -> index vocabulary from every transcription.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
# One more than the number of known words, so that index 0 stays free
# (the Embedding layer masks 0 as padding).
vocab_size = 1 + len(tokenizer.word_index)
# Longest transcription, counted in whitespace-separated words.
max_length = max(map(lambda text: len(text.split()), sentences))

In [None]:
def get_model_input(df):
    """Expand each transcription into next-word training samples.

    The tokenised transcription ``[w1, ..., wn]`` of every row is unrolled
    into ``n - 1`` samples: the prefix ``w1..wi`` (padded to ``max_length``)
    is the input and ``w(i+1)`` (one-hot over ``vocab_size``) is the target.
    Each sample is paired with the row's MFCC vector reshaped to
    ``(16, 8, 1)``.

    Relies on the notebook-level ``mfcc_features``, ``tokenizer``,
    ``max_length`` and ``vocab_size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'file'`` column (audio paths whose last path component
        is a key of ``mfcc_features``) and a ``'transcription'`` column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_feature, X_seq, y)``: audio features, padded token prefixes and
        one-hot next-word targets, aligned along the first axis.
    """
    X_feature, prefixes, targets = [], [], []

    # Walk the rows directly instead of re-filtering the frame for every file:
    # that lookup was quadratic and multiplied the samples whenever a file
    # name occurred more than once.
    for audio_file, caption in zip(df['file'], df['transcription']):
        name = audio_file.split('/')[-1]
        feature = np.asarray(mfcc_features[name])
        if feature.size == 0:
            # create_mfcc_features returns an empty result for unreadable
            # audio; skip such rows rather than crash on reshape.
            print(f'Skipping {audio_file}: no MFCC features available')
            continue
        feature = feature.reshape(16, 8, 1)

        sequence = tokenizer.texts_to_sequences([caption])[0]
        for i in range(1, len(sequence)):
            X_feature.append(feature)
            prefixes.append(sequence[:i])
            targets.append(sequence[i])

    # Pad and one-hot encode once for the whole batch rather than per sample.
    X_seq = tf.keras.preprocessing.sequence.pad_sequences(prefixes, maxlen=max_length)
    y = tf.keras.utils.to_categorical(targets, num_classes=vocab_size)
    return np.array(X_feature), X_seq, y

In [None]:
# Hold out 20% of the utterances for validation. The fixed random_state makes
# the split (and every number derived from it) identical on each run.
train, test = train_test_split(actual_trans, test_size=0.2, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

train.shape, test.shape

In [None]:
# Number of tokenizer words plus one.
vocab_size

In [None]:
# Word count of the longest transcription; prefixes are padded to this length.
max_length

In [None]:
# Unroll both splits into (audio feature, padded prefix, one-hot next word)
# arrays, then show the shapes of the training arrays.
train_feature, train_seq, train_y = get_model_input(train)
test_feature, test_seq, test_y = get_model_input(test)

train_feature.shape, train_seq.shape, train_y.shape

In [None]:
from keras.layers import Dropout, Flatten, Dense, Input, Layer
from keras.layers import Embedding, LSTM, add, Concatenate, Conv2D, MaxPooling2D
from keras.models import Model

In [None]:
# Reset Keras global state so re-running this cell builds a fresh model.
tf.keras.backend.clear_session()
# Audio branch: the 128 MFCC means arrive reshaped to a 16 x 8 x 1 grid and
# pass through two Conv2D + MaxPooling stages, then a 256-unit dense layer.
input_model_1 = Input(shape=(16, 8, 1))
feature_model = Conv2D(filters=16, kernel_size=(3, 3), activation='relu', padding = "same")(input_model_1)
feature_model = MaxPooling2D(2, 2)(feature_model)
feature_model = Conv2D(filters=8, kernel_size=(3, 3), activation='relu', padding = "same")(feature_model)
feature_model = MaxPooling2D(2, 2)(feature_model)
feature_model = Flatten()(feature_model)
feature_model = Dense(256, activation='relu')(feature_model)

# Text branch: the padded token prefix is embedded (index 0 is masked as
# padding) and encoded by a 256-unit LSTM.
input_model_2 = Input(shape=(max_length,))
seq_model = Embedding(vocab_size, 128, mask_zero=True)(input_model_2)
seq_model = Dropout(0.5)(seq_model)
seq_model = LSTM(256, activation='relu')(seq_model)

# Merge: both branches output 256 units, so they are summed element-wise and
# decoded into a softmax distribution over the vocabulary.
decoder = add([feature_model, seq_model])
decoder = Dense(256, activation='relu')(decoder)
output_model = Dense(vocab_size, activation='softmax')(decoder)

# inputs: [audio feature, token prefix] -> output: [next word]
model = Model(inputs=[input_model_1, input_model_2], outputs=output_model)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
from tensorflow.keras.utils import plot_model
# Draw the layer graph of the two-input model.
plot_model(model)

In [None]:
# Train for 20 epochs on the training arrays, passing the held-out arrays as
# validation data; the returned History object is kept in `history`.
history = model.fit((train_feature, train_seq), train_y, epochs=20,
                    validation_data=((test_feature, test_seq), test_y))

In [None]:
def inverse_tokenizer(index):
    """Return the vocabulary word for a token index.

    Parameters
    ----------
    index : int
        Token index as assigned by the notebook-level ``tokenizer``.

    Returns
    -------
    str or None
        The matching word, or ``None`` when no word has that index
        (for example the padding index 0).
    """
    # index_word is the tokenizer's own reverse mapping of word_index, so a
    # dict lookup replaces a scan of the whole vocabulary on every call.
    return tokenizer.index_word.get(index)

In [None]:
def speech_to_text(audio_path):
    """Greedily decode a transcription for one audio file.

    Starting from ``'startseq'``, the model is asked for the most probable
    next word given the audio features and the text produced so far. Decoding
    stops at ``'endseq'``, at an index with no word, or after ``max_length``
    steps.

    Relies on the notebook-level ``mfcc_features``, ``tokenizer``,
    ``max_length``, ``model``, ``inverse_tokenizer`` and
    ``create_mfcc_features``.

    Parameters
    ----------
    audio_path : str
        Path of the audio file. Its last path component is looked up in
        ``mfcc_features``; when absent, features are extracted from the file.

    Returns
    -------
    str
        The decoded text, including the leading ``'startseq'`` and, when it
        was produced, the trailing ``'endseq'``.

    Raises
    ------
    ValueError
        If no MFCC features could be obtained for ``audio_path``.
    """
    name = audio_path.split('/')[-1]
    feature = mfcc_features.get(name)
    if feature is None:
        # Not pre-computed: extract now so files outside the cached set can
        # be transcribed too.
        feature = create_mfcc_features(audio_path)
    feature = np.asarray(feature)
    if feature.size == 0:
        # Feature extraction yields an empty result for unreadable audio.
        raise ValueError(f'No MFCC features available for {audio_path}')
    feature = feature.reshape(-1, 16, 8, 1)

    pred_text = "startseq"
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([pred_text])[0]
        sequence = tf.keras.preprocessing.sequence.pad_sequences([sequence], maxlen=max_length)

        # predict() runs once per generated word; verbose=0 keeps it from
        # printing a progress bar on every step.
        probabilities = model.predict([feature, sequence], verbose=0)
        word = inverse_tokenizer(np.argmax(probabilities))
        if word is None:
            break
        pred_text += " " + word
        if word == 'endseq':
            break
    return pred_text

In [None]:
# Draw 12 random held-out rows, keep their reference transcriptions and
# decode each audio file with the trained model.
sample_df = test.sample(12)
files = sample_df['file'].tolist()
actual_transcription = sample_df['transcription'].tolist()
pred_transcription = [speech_to_text(wav_path) for wav_path in files]

In [None]:
# Decoded transcriptions for the sampled files.
pred_transcription

In [None]:
# Reference transcriptions of the same sampled rows, for comparison.
sample_df['transcription'].tolist()