# Creating Keyword Spotting Model For Voice Assistant "Numa"

(1) Importing Necessary Libraries

In [1]:
# Important Library
import os
import json
from pprint import pprint
import random
import wave 
import struct

# Audio Data Analysis Library
import librosa
import numpy as np
import pandas as pd

# Data Visualization Library
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import seaborn as sns


(2) Reading and Exploring Data 

In [2]:
# Build a DataFrame view of the JSON dataset.
#
# orginal_df (DataFrame) : the JSON file as read with orient='index'.
# df (DataFrame)         : transposed version of orginal_df; this is the frame
#                          explored in the following cells.

json_Path_English = "/home/atomyongya/Documents/Herald/Final Year Project/VoiceAssistant(Numa)/VoiceAssistant/_system_Model/2_English_KM/2_English_Json_Output_File/English_Data_JSON.json"

orginal_df = pd.read_json(json_Path_English, orient='index')

# transpose() swaps rows and columns, so the entries that were rows in
# orginal_df ("mappings", "labels", "MFCCs", "files") become the columns of df.
df = orginal_df.transpose()
df.head(20)

Unnamed: 0,mappings,labels,MFCCs,files
0,file,0,"[[-757.322509765625, -2.326998710632324, -0.86...",/home/atomyongya/Documents/Herald/Final Year P...
1,open,0,"[[-354.143798828125, 94.80502319335938, -2.048...",/home/atomyongya/Documents/Herald/Final Year P...
2,chrome,0,"[[-820.5107421875, 19.644493103027344, 4.52003...",/home/atomyongya/Documents/Herald/Final Year P...
3,close,0,"[[2.370381116867065, 55.97502899169922, 5.4160...",/home/atomyongya/Documents/Herald/Final Year P...
4,background_noise,0,"[[34.62870407104492, 96.85321807861328, 6.8009...",/home/atomyongya/Documents/Herald/Final Year P...
5,folder,0,"[[-36.8433837890625, 129.1125030517578, -35.90...",/home/atomyongya/Documents/Herald/Final Year P...
6,google,0,"[[163.28610229492188, 71.62339782714844, -21.8...",/home/atomyongya/Documents/Herald/Final Year P...
7,numa,0,"[[-768.443603515625, 0.0, 0.0, 0.0, 0.0, 0.0, ...",/home/atomyongya/Documents/Herald/Final Year P...
8,shutdown,0,"[[-855.1250610351562, 0.0, 0.0, 0.0, 0.0, 0.0,...",/home/atomyongya/Documents/Herald/Final Year P...
9,computer_noise,0,"[[-413.906005859375, 172.31414794921875, -4.17...",/home/atomyongya/Documents/Herald/Final Year P...


In [3]:
# Count the missing values in every column.
# NOTE(review): only "mappings" has gaps (1572 of 1584 rows) - presumably the JSON
# stores one mapping entry per keyword class rather than one per sample; confirm
# against the script that generated the JSON.
df.isna().sum()

mappings    1572
labels         0
MFCCs          0
files          0
dtype: int64

In [4]:
# Dimensions of the dataframe as (number of rows, number of columns)
df.shape

(1584, 4)

In [5]:
# Distinct values of the "mappings" column (the keyword names).
# The result also contains None, coming from the rows where the column is empty.
mapping_Unique_Class = df["mappings"].unique()
mapping_Unique_Class

array(['file', 'open', 'chrome', 'close', 'background_noise', 'folder',
       'google', 'numa', 'shutdown', 'computer_noise', 'play', 'youtube',
       None], dtype=object)

In [6]:
# Distinct values of the "labels" column (the numeric class ids)
labels_Unique_Class = df["labels"].unique()
labels_Unique_Class

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype=object)

In [7]:
# Number of distinct audio file paths in the "files" column
files_Unique_Class = df["files"].unique()
len(files_Unique_Class)

1584

In [8]:
# Summary statistics (count / unique / top / freq) for every column
df.describe()

Unnamed: 0,mappings,labels,MFCCs,files
count,12,1584,1584,1584
unique,12,12,1525,1584
top,file,7,"[[-676.4949951171875, 85.03929138183594, 5.804...",/home/atomyongya/Documents/Herald/Final Year P...
freq,1,201,2,1


In [9]:
# Column dtypes, non-null counts and memory usage of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1584 entries, 0 to 1583
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   mappings  12 non-null     object
 1   labels    1584 non-null   object
 2   MFCCs     1584 non-null   object
 3   files     1584 non-null   object
dtypes: object(4)
memory usage: 49.6+ KB


(3) Data Visualization

In [10]:
# Load the JSON file.
#
# data (dict) : dictionary that stores the parsed JSON data.

with open(json_Path_English) as json_Data:
    # Read the whole file and parse it in one go.
    data = json.loads(json_Data.read())

# pprint(data)

In [11]:
# Analyzing audio data before preparing dataset to get more information of our audio data.

class Meta_Data():
    """
    Helper for inspecting one audio file: playback, MFCC extraction and
    MFCC plotting.
    """

    def __init__(self, audio_Data_Path):
        """
        :param audio_Data_Path : path of the audio file to inspect.
        """
        self.path = audio_Data_Path

    def play_Audio(self):
        """
        Create a playable audio widget, so the input used for MFCC extraction
        can be checked by ear.

        :return : ``ipd.Audio`` object built from ``self.path``.
        """
        return ipd.Audio(self.path)

    def extracting_MFCCs(self):
        """
        Load the audio file and compute its MFCCs (Mel Frequency Cepstral
        Coefficients), printing the shape of the resulting array.

        :return : tuple ``(signal, sample_Rate, mfccs)`` - the values returned
                  by ``librosa.load`` followed by the 13-coefficient MFCC array.
        """
        # Loading the audio file
        waveform, rate = librosa.load(self.path)

        # Extracting the MFCC features
        coefficients = librosa.feature.mfcc(y=waveform, sr=rate, n_mfcc=13)
        print("Shape of an audio: ", coefficients.shape)

        return waveform, rate, coefficients

    def visualising_MFCCs(self):
        """
        Plot the MFCCs of the audio file against time, with a colour bar.
        """
        # The raw waveform is not needed for the plot.
        _, rate, coefficients = self.extracting_MFCCs()

        plt.figure(figsize=(25, 10))
        librosa.display.specshow(coefficients, sr=rate, x_axis="time")
        plt.title("Mel Frequency Cepstrum Coefficent (MFCCs)")
        plt.colorbar(format="%+2.f")
        plt.show()

In [12]:
# Creating class for data visualization from dataframe
class Graph_Plot():
    """
    Small plotting helper; every plot method reads the module-level
    dataframe ``df``.

    :param x_Axis : input feature, passed as ``x`` to the plotting functions.
    :param y_Axis : output, passed as ``y`` to the plotting functions.
    """

    def __init__(self, x_Axis=None, y_Axis=None):
        # Both arguments are optional so that code which built the object
        # without arguments (the only way possible while the initialiser was
        # misspelt) keeps working.
        self.x_Axis = x_Axis
        self.y_Axis = y_Axis

    def it__(self, x_Axis, y_Axis):
        """
        Misspelt former initialiser, kept only for backward compatibility.
        Prefer ``Graph_Plot(x_Axis, y_Axis)``.
        """
        self.__init__(x_Axis, y_Axis)

    def figure_Size(self):
        """Open a new 15x7 inch figure for the next plot."""
        plt.figure(figsize=(15, 7))

    def bar_Plot(self):
        """Bar plot of ``y_Axis`` against ``x_Axis``."""
        self.figure_Size()
        sns.barplot(x=self.x_Axis, y=self.y_Axis, data=df)
        # Render explicitly, like the other plot methods do.
        plt.show()

    def box_Plot(self):
        """Box plot of ``x_Axis``."""
        self.figure_Size()
        plt.boxplot(x=self.x_Axis, data=df)
        plt.show()

    def swarmp_Plot(self):
        """Swarm plot of ``y_Axis`` against ``x_Axis``."""
        self.figure_Size()
        sns.swarmplot(x=self.x_Axis, y=self.y_Axis, data=df)
        plt.show()

In [13]:
# # Creating first object of class Meta_Data randomly
# audio_Path = random.choice(data["files"])
# random_Audio_Object = Meta_Data(audio_Path)

# # Calling play_Audio() method from class Meta_Data for first object
# random_Audio_Object.play_Audio()

In [14]:
# """
# :param name_Of_File : Split the directory and store in list. [-2] Return the second last index value.
# """
# name_of_File = audio_Path.split("/")[-2]
# print(name_of_File + " Audio\n")

# random_Audio_Object.visualising_MFCCs()

In [15]:
# # Creating second object of Meta_Data 
# audio_Path2 = random.choice(data["files"])
# random_Audio_Object2 = Meta_Data(audio_Path2)

# # Calling play_Audio() method from class Meta_Data for first object
# random_Audio_Object2.play_Audio()

In [16]:
# file_Path_Name = audio_Path2.split("/")[-2]
# print(file_Path_Name + " Audio\n")
# random_Audio_Object2.visualising_MFCCs()

In [17]:
# mfccs = df["MFCCs"]
# mapping = df["mappings"]
# labels = df["labels"]

# mfccs.hist()

In [18]:
# mapping.hist()

In [19]:
# labels.hist()

(4) Creating Model

In [20]:
# Importing Library to create model
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
import tensorflow as tf

In [21]:
"""
Creating keyword spotting model for speech recognition system.

"""

class Create_Model():
    """
    Builds, trains, evaluates and saves an LSTM keyword spotting model from a
    JSON dataset containing "MFCCs" and "labels" entries.
    """

    def __init__(self, data_Path, save_Model_Path, batch_size, epochs, learning_Rate):
        """
        Store the training configuration.

        :param data_Path : Path of the JSON file.
        :param save_Model_Path : Path where the trained model will be saved.
        :param batch_size : Number of samples processed before the model is updated.
        :param epochs : Number of complete passes over the training data.
        :param learning_Rate : Learning rate handed to the Adam optimiser.
        """
        self.data_Path = data_Path
        self.save_Model_Path = save_Model_Path
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_Rate = learning_Rate

    def load_Dataset(self, data_Path):
        """
        Load the JSON dataset.

        :param data_Path : Path of the JSON file to read.

        :return : tuple ``(X, y)`` where X holds the "MFCCs" entry (input
                  features) and y the "labels" entry (targets), both as NumPy
                  object arrays.
        """
        # Loading JSON file
        with open(data_Path, "r") as json_Data:
            data = json.load(json_Data)

        # Extract inputs and targets. The arrays stay as object dtype here;
        # get_Data_Splits converts them to float32 tensors.
        X = np.asarray(data["MFCCs"], dtype=object)
        y = np.asarray(data["labels"], dtype=object)

        return X, y

    def get_Data_Splits(self, data_Path, test_size=0.1, test_validation=0.1, random_state=None):
        """
        Split the data into train, validation and test sets.

        :param data_Path : Path of the JSON file to load.
        :param test_size : Share of the whole dataset used for testing.
        :param test_validation : Share of the remaining training data used for validation.
        :param random_state : Optional seed forwarded to both ``train_test_split``
                              calls; the default ``None`` keeps the split random.

        :return : tuple ``(X_train, X_validation, X_test, y_train, y_validation,
                  y_test)`` of float32 tensors.
        """
        # Load the dataset from the path that was passed in (previously this
        # argument was ignored in favour of self.data_Path).
        X, y = self.load_Dataset(data_Path)

        # Split train / validation / test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        X_train, X_validation, y_train, y_validation = train_test_split(
            X_train, y_train, test_size=test_validation, random_state=random_state
        )

        X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
        X_validation = tf.convert_to_tensor(X_validation, dtype=tf.float32)
        X_test = tf.convert_to_tensor(X_test, dtype=tf.float32)
        y_train = tf.convert_to_tensor(y_train, dtype=tf.float32)
        y_validation = tf.convert_to_tensor(y_validation, dtype=tf.float32)
        y_test = tf.convert_to_tensor(y_test, dtype=tf.float32)

        return X_train, X_validation, X_test, y_train, y_validation, y_test

    def build_Model(self, input_shape, learning_Rate, error="sparse_categorical_crossentropy", number_Keywords=12):
        """
        Build and compile the keyword spotting model.

        :param input_shape : Shape of one input sample, given to the first LSTM layer.
        :param learning_Rate : Learning rate for the Adam optimiser.
        :param error : Loss function used to train the model.
        :param number_Keywords : Number of output classes (keywords).

        :return : The compiled ``keras.Sequential`` model.
        """
        # Build network
        model = keras.Sequential()

        # 2 LSTM layers; the first returns the full sequence so that the
        # second LSTM receives one vector per time step.
        model.add(keras.layers.LSTM(64, input_shape=input_shape, return_sequences=True))
        model.add(keras.layers.LSTM(64))

        # Dense layer
        model.add(keras.layers.Dense(64, activation="relu"))
        model.add(keras.layers.Dropout(0.3))

        # Softmax classifier (output layer), e.g. [0.1, 0.7, 0.2] -> class 1
        model.add(keras.layers.Dense(number_Keywords, activation="softmax"))

        # Compile the model with the learning rate that was passed in
        # (previously this argument was ignored in favour of self.learning_Rate).
        optimiser = keras.optimizers.Adam(learning_rate=learning_Rate)
        model.compile(optimizer=optimiser, loss=error, metrics=["accuracy"])

        # Print model overview
        model.summary()

        return model

    def main(self):
        """
        Run the whole pipeline: split the data, build and train the model,
        print the test loss and accuracy, and save the model to
        ``self.save_Model_Path``.
        """
        # Train / validation / test data splits
        X_train, X_validation, X_test, y_train, y_validation, y_test = self.get_Data_Splits(self.data_Path)

        # Building LSTM model; one sample has shape (X_train.shape[1], X_train.shape[2])
        input_shape = (X_train.shape[1], X_train.shape[2])
        model = self.build_Model(input_shape, self.learning_Rate)

        # Train the model
        model.fit(X_train, y_train, epochs=self.epochs, batch_size=self.batch_size, validation_data=(X_validation, y_validation))

        # Evaluating the model
        test_error, test_accuracy = model.evaluate(X_test, y_test)
        print(f"Test error: {test_error}, Test accuracy: {test_accuracy}")

        # Saving model
        model.save(self.save_Model_Path)
        

In [22]:
# Training configuration and Create_Model object for the English language.
#
# json_Path_English  : Path of the JSON dataset for the English language.
# english_Model_Path : Path where the English keyword spotting model gets saved.

# Data Path
json_Path_English = "/home/atomyongya/Documents/Herald/Final Year Project/VoiceAssistant(Numa)/VoiceAssistant/_system_Model/2_English_KM/2_English_Json_Output_File/English_Data_JSON.json"

# Model Path
english_Model_Path = "/home/atomyongya/Documents/Herald/Final Year Project/VoiceAssistant(Numa)/VoiceAssistant/_system_Model/2_English_KM/3_English_Model_File/english_Model.h5"

# Hyperparameters
batch_size = 32
epochs = 100
learning_Rate = 0.0001

# Creating model object for English
english_Model_Object = Create_Model(
    data_Path=json_Path_English,
    save_Model_Path=english_Model_Path,
    batch_size=batch_size,
    epochs=epochs,
    learning_Rate=learning_Rate,
)

In [23]:
"""
Creating object of class create_Model for Nepali Language.
"""





'\nCreating object of class create_Model for Nepali Language.\n'

In [24]:
# Run the full pipeline for the English model: split the data, build and train the
# network, print the test loss/accuracy and save the model to english_Model_Path.
english_Model_Object.main()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 44, 64)            19968     
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 12)                780       
                                                                 
Total params: 57,932
Trainable params: 57,932
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoc

Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Test error: 0.38580402731895447, Test accuracy: 0.8742138147354126


In [81]:
# Calling main function of object nepali_Model_Object



(5) Making Predictions Using Our Model

In [None]:
"""
Importing Library necessary to predict our model
"""
import sounddevice as sd
import scipy.io.wavfile as sw
from scipy.io.wavfile import write

from tensorflow.keras.models import load_model
from scipy import fftpack
import noisereduce as nr
import soundfile as sf
import io
import subprocess
import wavefile

from scipy.io import wavfile as wav
from scipy.io.wavfile import write
import sounddevice as sd
from playsound import playsound
import noisereduce as nr

In [87]:
"""
Making predictions with the English model we created.

:param fps : sampling rate of the recording, in samples per second.
:param duration : recording time in seconds.
:param filename : path of the WAV file the recording is written to and read back from.
:param mapping_Data : "mappings" entry of the JSON data, used to turn the
                      predicted class index into a keyword.
"""

fps = 44100
duration = 1
filename = "prediction.wav"
mapping_Data = data["mappings"]

# English keyword spotting model
model = load_model("/home/atomyongya/Documents/Herald/Final Year Project/VoiceAssistant(Numa)/VoiceAssistant/_system_Model/2_English_KM/3_English_Model_File/english_Model.h5")

print("Prediction Started: ")
while True:
    # myrecording       : audio recorded from the user in real time.
    # prediction        : model output for the recorded audio.
    # predicted_index   : index of the largest value in the prediction.
    # predicted_keyword : keyword found at predicted_index in mapping_Data.
    try:
        # Real time audio recording.
        print("Say Now: ")
        myrecording = sd.rec(int(duration * fps), samplerate=fps, channels=2)
        sd.wait()
        write(filename, fps, myrecording)

        # Removing noise. The WAV data is read as (frames, channels) while
        # reduce_noise expects (channels, frames), so transpose instead of
        # reshaping (a reshape would mix the samples of the two channels).
        rate, reduced_data = sw.read(filename)
        noise_Reduce = nr.reduce_noise(y=reduced_data.T, sr=rate)

        # Save the denoised signal itself (previously the raw data was saved).
        write("noise_Reduce.wav", rate, noise_Reduce.T)

        # Loading the recorded file using librosa.
        # NOTE(review): as before, the model is fed the raw recording and not
        # the denoised file - confirm which one matches the training data.
        signal, sample_rate = librosa.load(filename)

        # Extracting the MFCC feature of the audio. The audio arguments are
        # passed by keyword, as recent librosa versions require.
        mfcc = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=13, hop_length=512, n_fft=2048)

        # The model expects a batch dimension in front of (time steps, coefficients).
        prediction = model.predict(tf.expand_dims(mfcc.T, axis=0))

        # Finding the max prediction value and mapping it to a keyword.
        predicted_index = np.argmax(prediction)
        predicted_keyword = mapping_Data[predicted_index]
        print(predicted_keyword)

        # To stop the audio record.
        stop = input("Enter S or s to stop: ")
        if stop in ("s", "S"):
            break

    except Exception as error:
        # Report the problem and leave the loop instead of retrying forever.
        print(error)
        break
    

Prediction Started: 
Say Now: 
numa
Enter S or s to stop: 
Say Now: 
shutdown
Enter S or s to stop: 
Say Now: 
shutdown
Enter S or s to stop: 
Say Now: 
numa
Enter S or s to stop: 
Say Now: 
google
Enter S or s to stop: 
Say Now: 
open
Enter S or s to stop: 
Say Now: 
google
Enter S or s to stop: 
Say Now: 
shutdown
Enter S or s to stop: 
Say Now: 
close
Enter S or s to stop: 
Say Now: 
numa
Enter S or s to stop: 
Say Now: 
close
Enter S or s to stop: 
Say Now: 
shutdown
Enter S or s to stop: 
Say Now: 
close
Enter S or s to stop: 
Say Now: 
numa
Enter S or s to stop: 
Say Now: 
shutdown
Enter S or s to stop: s


In [55]:
from scipy.io import wavfile as wav
from scipy.io.wavfile import write
import sounddevice as sd
from playsound import playsound
import noisereduce as nr

fs = 44100  # Sample rate
seconds = 2   # Duration of recording
print("Say Now: ")
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
sd.wait()  # Wait until recording is finished
write('prediction.wav', fs, myrecording)

# Read the recording back with the `wav` alias imported in this cell.
# A dedicated name is used on purpose: `data` is the JSON dictionary that the
# prediction cell reads, so it must not be overwritten here.
rate, recorded_audio = wav.read('prediction.wav')

# The WAV data is (frames, channels) while reduce_noise expects
# (channels, frames), so transpose instead of reshaping.
reduced_noise = nr.reduce_noise(y=recorded_audio.T, sr=rate)

# Save and play the denoised signal (previously the raw data was saved).
write("reduced_noise.wav", rate, reduced_noise.T)
playsound("reduced_noise.wav")

Say Now: 
