In [1]:
import os
import shutil
import numpy as np
import tensorflow as tf
from tensorflow import keras
from pathlib import Path

# Settings shared by the cells below.
SAMPLING_RATE = 16000  # handed to decode_wav in path_to_audio as the sample count to read
num_classes = 6
SCALE = 0.5
BATCH_SIZE = 128
SHUFFLE_SEED = 43

# Restore the saved network from disk and print its layer table.
model = keras.models.load_model('sample.h5')
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 16000, 1)]   0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 16000, 16)    64          input[0][0]                      
__________________________________________________________________________________________________
activation (Activation)         (None, 16000, 16)    0           conv1d_1[0][0]                   
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 16000, 16)    784         activation[0][0]                 
_______________________________________________________________________________________

In [2]:
def paths_and_labels_to_dataset(audio_paths):
    """Build a dataset of decoded audio clips from a sequence of WAV paths.

    Despite the name (kept so existing callers keep working), no labels are
    involved: the function takes only paths and yields only audio.

    Args:
        audio_paths: Sequence of WAV file paths, in any form accepted by
            ``tf.data.Dataset.from_tensor_slices``.

    Returns:
        A ``tf.data.Dataset`` whose elements are the results of
        ``path_to_audio`` applied to each path, in input order.
    """
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    # Previously the mapped dataset was passed through
    # ``Dataset.zip((audio_ds))``; ``(audio_ds)`` is not a tuple, so the zip
    # was a pass-through. Returning the mapped dataset yields the same elements.
    return path_ds.map(path_to_audio)


def path_to_audio(path):
    """Read the WAV file at ``path`` and return its decoded audio tensor.

    ``tf.audio.decode_wav`` is called with ``desired_channels=1`` and
    ``desired_samples=SAMPLING_RATE``. The sample rate that ``decode_wav``
    also returns is discarded.
    """
    wav_bytes = tf.io.read_file(path)  # raw file contents
    waveform, _sample_rate = tf.audio.decode_wav(
        wav_bytes,
        desired_channels=1,
        desired_samples=SAMPLING_RATE,
    )
    return waveform


def add_noise(audio, noises=None, scale=0.5):
    """Mix a randomly selected noise clip into each entry of ``audio``.

    Args:
        audio: Batch of clips; axis 0 indexes the clips and axis 1 is the
            axis that maxima are taken over (presumably samples - TODO
            confirm the full shape against the caller).
        noises: Stack of noise clips indexed along axis 0, or ``None`` to
            leave ``audio`` untouched.
        scale: Multiplier applied to the amplitude-matched noise.

    Returns:
        ``audio`` itself when ``noises`` is ``None``; otherwise ``audio``
        plus the rescaled noise.
    """
    if noises is None:
        return audio

    # One random noise index per clip in the batch.
    clip_count = tf.shape(audio)[0]
    picks = tf.random.uniform((clip_count,), 0, noises.shape[0], dtype=tf.int32)
    chosen = tf.gather(noises, picks, axis=0)

    # Ratio of each clip's maximum to its noise's maximum, repeated along
    # axis 1 so it can multiply the noise element-wise.
    # NOTE(review): a noise clip whose maximum is 0 would divide by zero here.
    ratio = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(chosen, axis=1)
    ratio = tf.repeat(tf.expand_dims(ratio, axis=1), tf.shape(audio)[1], axis=1)

    return audio + chosen * ratio * scale


def audio_to_fft(audio):
    """Return the FFT magnitude of each clip in ``audio``.

    The last axis of ``audio`` is squeezed away before the transform and a
    length-1 axis is appended again afterwards. The final slice is bounded by
    the squeezed input's own axis-1 length, so it does not drop any bins.
    """
    samples = tf.squeeze(audio, axis=-1)

    # Promote the real signal to complex64 with a zero imaginary part.
    as_complex = tf.complex(real=samples, imag=tf.zeros_like(samples))
    spectrum = tf.signal.fft(tf.cast(as_complex, tf.complex64))
    spectrum = tf.expand_dims(spectrum, axis=-1)

    bin_count = samples.shape[1]
    return tf.math.abs(spectrum[:, :bin_count, :])

In [3]:
import pyaudio
import wave

def record(filename, seconds=2, fs=16000, channels=2, chunk=1024):
    """Record audio from the default input device and save it as a WAV file.

    Args:
        filename: Destination passed to ``wave.open`` in ``'wb'`` mode.
        seconds: Recording length in seconds.
        fs: Sampling rate in Hz.
        channels: Number of input channels to capture.
        chunk: Frames read from the stream per ``read`` call.

    The stream and the PyAudio instance are released even if opening or
    reading the stream raises, and the WAV file is closed even if writing
    fails. Only whole chunks are captured, so the recording holds
    ``int(fs / chunk * seconds) * chunk`` frames.
    """
    sample_format = pyaudio.paInt16

    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(sample_format)

        print("-------------------------------------------------------------------------------------------")
        print('Recording')

        stream = p.open(format=sample_format,
                        channels=channels,
                        rate=fs,
                        frames_per_buffer=chunk,
                        input=True)
        try:
            frames = [stream.read(chunk)
                      for _ in range(int(fs / chunk * seconds))]
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    print('Finished recording')
    print("-------------------------------------------------------------------------------------------")

    # Save
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(fs)
        wf.writeframes(b''.join(frames))

In [13]:
# Speaker names indexed by the model's output position.
class_names = ['Benjamin_Netanyau', 'Jens_Stoltenberg', 'Julia_Gillard', 'Magaret_Tarcher', 'Nelson_Mandela', 'yasasi']

# Capture a fresh clip from the microphone.
test_file_name = 'test_voice.wav'
record(test_file_name)

path = [test_file_name]
print(path)

# Inference data is not shuffled: predictions must stay in the same order as
# `path`, otherwise the [0] indexing below would not refer to path[0] once
# more than one file is listed.
test_ds = paths_and_labels_to_dataset(path).batch(BATCH_SIZE)

y_pred = model.predict(test_ds)

# Index and value of the largest model output for the first (only) clip.
label = np.argmax(y_pred, axis=1)[0]
acc = np.max(y_pred, axis=1)[0]
speaker = class_names[label]

# NOTE: `acc` is the model's top output value for this clip, not an accuracy
# measured against ground truth. The printed wording is kept unchanged.
print('predicted Speaker:',speaker,' label:',label,' accuracy:',acc)

-------------------------------------------------------------------------------------------
Recording
Finished recording
-------------------------------------------------------------------------------------------
['test_voice.wav']
predicted Speaker: yasasi  label: 5  accuracy: 0.84825927
