In [1]:
# modules used
import os 
import pandas as pd 
import librosa 
import numpy as np
import speech_recognition as sr 
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten,Conv2D,MaxPooling2D
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import StandardScaler
wav, sample_rate = librosa.load('3 speakers/audio_class_1.wav')
wav2, sample_rate = librosa.load('3 speakers//audio_class_2.wav')
wav3, sample_rate = librosa.load('3 speakers/audio_class_3.wav')
print('sr:', sample_rate)
# print('wav shape:', wav.shape)
# print('length:', wav.shape[0]/sample_rate, 'secs')


sr: 22050


In [2]:
# neural network with 2048 features
def NN():
    model = Sequential()
    model.add(Dense(2048, input_shape=(2048,), activation = 'relu')) 
    model.add(Dense(1028, activation = 'relu'))
    model.add(Dropout(0.25))
    model.add(Dense(248, activation = 'relu'))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation = 'sigmoid')) 
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=100, verbose=1, mode='auto')
    return model
hop_size = 15 #ms
FFT_size = 2048

In [3]:
# extraction of features and data preparation
def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=22050):
    # hop_size in ms
    
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num,FFT_size))
    
    for n in range(frame_num):
        frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
    
    return frames
audio_framed1 = frame_audio(wav, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
print("Framed audio shape: {0}".format(audio_framed1.shape))

y_train1 = np.full((audio_framed1.shape[0]), 0)
audio_framed2 = frame_audio(wav2, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
print("Framed audio shape: {0}".format(audio_framed2.shape))
y_train2 = np.full((audio_framed1.shape[0]), 1)
audio_framed3 = frame_audio(wav3, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
print("Framed audio shape: {0}".format(audio_framed3.shape))
y_train3 = np.full((audio_framed2.shape[0]), 2)
X_train=np.concatenate([audio_framed1,audio_framed2,audio_framed3],axis=0)
y_train=np.concatenate([y_train1,y_train2,y_train3])
y_train=y_train.flatten()
y_train=pd.DataFrame(y_train)
y_train=pd.get_dummies(y_train[0:][0])
y_train

Framed audio shape: (11990, 2048)
Framed audio shape: (11990, 2048)
Framed audio shape: (11990, 2048)


Unnamed: 0,0,1,2
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
35965,0,0,1
35966,0,0,1
35967,0,0,1
35968,0,0,1


In [4]:
# training the model
model=NN()
model.fit(X_train, y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x26c7c60c748>

In [5]:
# extraction of labels
wav4, sample_rate = librosa.load('3 speakers/test_audio.wav')
X_test = frame_audio(wav3, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)
y_predicted=model.predict(X_test)
y_predicted_labels = [np.argmax(i) for i in y_predicted]
y_predicted_labels

[2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 1,
 0,
 1,
 2,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,


In [None]:
# Speech to text using google api
r = sr.Recognizer()
filename=' 3 speakers/test_audio.wav'
frame_length=11990/sample_rate
j=0
i=0
for j in range(len(y_predicted_labels)):
    if y_predicted_labels[j]==0:
        print('Speaker 1:\t')
        k=0
        while y_predicted_labels[j]==0:
            k=k+1
            if j+k> len(y_predicted_labels):
                break
        with sr.AudioFile(filename) as source:
            audio_data=r.record(source,duration=frame_length*k,offset=frame_length*j)
            text=r.recognize_google(audio_data)
            print(text)
            j=j+k
    elif y_predicted_labels[j]==1:
        print('Speaker 2:\t')
        k=0
        while y_predicted_labels[j]==1 :
            k=k+1
            if j+k> len(y_predicted_labels):
                break
        try:
            with sr.AudioFile(filename) as source:
                audio_data=r.record(source,duration=frame_length*k,offset=frame_length*j)
                text=r.recognize_google(audio_data)
                print(text)
                j=j+k
        except:
            j=j+k
    elif y_predicted_labels[j]==2:
        print('Speaker 3:\t')
        k=0
        while y_predicted_labels[j]==2 :
            k=k+1
            if j+k> len(y_predicted_labels):
                break
        try:
            with sr.AudioFile(filename) as source:
                audio_data=r.record(source,duration=frame_length*k,offset=frame_length*j)
                text=r.recognize_google(audio_data)
                print(text)
                j=j+k

        except:
            j=j+k
    

    i=j
        
        



Speaker 3:	
Maharashtra Military bases and with this India stallion covid-19 Dairy induced infection has caused Holi restrictions meaning of religious and political reasons
Speaker 3:	
one day trip places and with this India Alia with covid-19 virus induced infection is caused Holi
Speaker 3:	
Maharashtra cases and with this India stallion covid-19 deri induced infection has caused Holi restrictions meaning of religious and political reasons
Speaker 3:	
Maharashtra cases and with this India stallion covid-19 virus induced infection has caused Holi restrictions for religious and political reasons
Speaker 3:	
one day trip places and with this India Alia with covid-19 virus induced infection is caused Holi
Speaker 3:	
cases and with this India Aliya with covid-19 virus induced infection has caused Holi images
Speaker 3:	
crime cases and with this India covid-19 virus induced infection has cost quality
Speaker 3:	
cases and with this India Aliya with covid-19 virus induced infection is cau