<a href="https://colab.research.google.com/github/azizbekb/NLP/blob/main/Spoken_Digit_Recognition_Youtube.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the free-spoken-digit-dataset archive (master branch) from GitHub.
# NOTE(review): the visible cells below read recordings from audio_uz/ rather than
# from this archive - confirm this download is still needed.
!wget https://github.com/Jakobovski/free-spoken-digit-dataset/archive/refs/heads/master.zip
# Unpack the downloaded archive into the current working directory.
! unzip master.zip


In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-3.44.1-py3-none-any.whl (20.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.103.1-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.5.0 (from gradio)
  Downloading gradio_client-0.5.0-py3-none-any.whl (298 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.2/298.2 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.25.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Extract the recordings; the archive unpacks into audio_uz/ with files named
# <digit>_<speaker>_<sample>.wav (see the listing below).
# NOTE(review): no visible cell downloads audio_uz.zip - presumably it is uploaded
# to the runtime by hand before this cell runs; confirm.
!unzip audio_uz.zip

Archive:  audio_uz.zip
   creating: audio_uz/
  inflating: audio_uz/0_AkmalAshirmatov_0.wav  
  inflating: audio_uz/0_AkmalAshirmatov_1.wav  
  inflating: audio_uz/0_AkmalAshirmatov_10.wav  
  inflating: audio_uz/0_AkmalAshirmatov_11.wav  
  inflating: audio_uz/0_AkmalAshirmatov_12.wav  
  inflating: audio_uz/0_AkmalAshirmatov_13.wav  
  inflating: audio_uz/0_AkmalAshirmatov_14.wav  
  inflating: audio_uz/0_AkmalAshirmatov_15.wav  
  inflating: audio_uz/0_AkmalAshirmatov_16.wav  
  inflating: audio_uz/0_AkmalAshirmatov_17.wav  
  inflating: audio_uz/0_AkmalAshirmatov_18.wav  
  inflating: audio_uz/0_AkmalAshirmatov_19.wav  
  inflating: audio_uz/0_AkmalAshirmatov_2.wav  
  inflating: audio_uz/0_AkmalAshirmatov_20.wav  
  inflating: audio_uz/0_AkmalAshirmatov_21.wav  
  inflating: audio_uz/0_AkmalAshirmatov_22.wav  
  inflating: audio_uz/0_AkmalAshirmatov_3.wav  
  inflating: audio_uz/0_AkmalAshirmatov_4.wav  
  inflating: audio_uz/0_AkmalAshirmatov_5.wav  
  inflating: audio_uz/0_Akmal

In [None]:
#
import os
from os import listdir
from os.path import isfile, join
from matplotlib import pyplot as plt
import scipy.io.wavfile as wav
import numpy as np
from keras.preprocessing import image
from keras.utils import to_categorical


In [None]:
def wav_to_spectrogram(audio_path, save_path, spectrogram_dimensions=(64, 64), noverlap=16, cmap='gray_r'):
  """Render the spectrogram of a WAV file and save it as a borderless image.

  Args:
    audio_path: Path of the WAV file to read.
    save_path: Destination path handed to `Figure.savefig`.
    spectrogram_dimensions: Target figure size in pixels, converted to inches
      with the figure's DPI.
    noverlap: Overlap between segments, forwarded to `Axes.specgram`.
    cmap: Colormap name forwarded to `Axes.specgram`.

  The figure is closed before returning (also on error), so converting many
  files in a loop does not accumulate open matplotlib figures.
  """
  # NOTE: sample_rate is read but not forwarded; Fs stays hard-coded to 2 so the
  # generated images match the ones this notebook has always produced.
  sample_rate, samples = wav.read(audio_path)
  if samples.ndim > 1:
    # Multi-channel recording: average the channels, since specgram works on a 1-D signal.
    samples = samples.mean(axis=1)
  fig = plt.figure()
  try:
    fig.set_size_inches((spectrogram_dimensions[0]/fig.get_dpi(), spectrogram_dimensions[1]/fig.get_dpi()))
    # A single axes filling the whole figure, with no frame, ticks or labels.
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    ax.specgram(samples, cmap=cmap, Fs=2, noverlap=noverlap)
    ax.xaxis.set_major_locator(plt.NullLocator())
    ax.yaxis.set_major_locator(plt.NullLocator())
    fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
  finally:
    plt.close(fig)


In [None]:
#
def dir_to_spectrogram(audio_dir, spectrogram_dir, spectrogram_dimensions=(64, 64), noverlap=16, cmap='gray_r'):
  """Convert every .wav file in a directory into a spectrogram PNG.

  Args:
    audio_dir: Directory containing the WAV recordings.
    spectrogram_dir: Directory receiving one `<name>.png` per `<name>.wav`;
      created if it does not exist yet.
    spectrogram_dimensions: Forwarded to `wav_to_spectrogram`.
    noverlap: Forwarded to `wav_to_spectrogram`.
    cmap: Forwarded to `wav_to_spectrogram`.
  """
  if spectrogram_dir:
    # Saving into a missing directory fails, so create the output folder up front.
    os.makedirs(spectrogram_dir, exist_ok=True)
  # Match on the extension (not on '.wav' appearing anywhere in the name).
  file_names = [f for f in listdir(audio_dir) if f.endswith('.wav') and isfile(join(audio_dir, f))]
  for file_name in file_names:
    print(file_name)
    # join() works whether or not the directory arguments end with a separator.
    audio_path = join(audio_dir, file_name)
    spectogram_path = join(spectrogram_dir, os.path.splitext(file_name)[0] + '.png')
    wav_to_spectrogram(audio_path, spectogram_path, spectrogram_dimensions=spectrogram_dimensions, noverlap=noverlap, cmap=cmap)


In [None]:
# Convert every recording under audio_uz/ into a spectrogram image under
# spectrograms/, using the default image size, overlap and colormap.
audio_folder = "audio_uz/"
spectrogram_folder = "spectrograms/"
dir_to_spectrogram(audio_folder, spectrogram_folder)


In [None]:
# Build [image_array, label] pairs from spectrograms named
# "<digit>_<speaker>_<sample>.png". The digit prefix is the label.
imagesDir = "spectrograms/"
trainset = []
testset = []
for file in os.listdir(imagesDir):
  name_parts = file.split('_')
  # Ignore entries that are not spectrograms following the naming scheme
  # (stray files, checkpoint folders, ...) instead of failing with an IndexError.
  if not file.endswith('.png') or len(name_parts) < 3:
    continue
  label = name_parts[0]
  sample_number = name_parts[2]
  img = image.load_img(os.path.join(imagesDir, file))
  # Samples numbered 0, 1 and 2 are held out for testing; everything else is training data.
  if sample_number in ['0.png','1.png','2.png']:
    testset.append([image.img_to_array(img), label])
  else:
    trainset.append([image.img_to_array(img), label])


In [None]:
#
# Each dataset entry is an [image, label] pair: unpack the pairs into
# parallel lists of images (X_*) and labels (y_*).
X_train = [pixels for pixels, _ in trainset]
y_train = [digit for _, digit in trainset]
X_test = [pixels for pixels, _ in testset]
y_test = [digit for _, digit in testset]


In [None]:
# Number of training samples.
len(X_train)

854

In [None]:
# Turn the four Python lists into numpy arrays so that their shapes can be
# inspected (the model's input shape is later read from X_train.shape).
X_train, y_train, X_test, y_test = [
    np.asanyarray(sequence) for sequence in (X_train, y_train, X_test, y_test)
]


In [None]:
# Convert the labels to a one-hot representation.
# The width is fixed to the 10 digit classes so that both label matrices always
# match the model's 10-unit softmax output, even if a split happens to contain
# no sample of the highest digit.
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

# Normalize the images by dividing the pixel values by 255.
# NOTE: this rescales the arrays in place, so run this cell only once per
# freshly built X_train / X_test.
X_train /= 255
X_test /= 255








In [None]:
# CNN MODEL DESIGN

In [None]:
from keras.models import Sequential
from keras import optimizers
from keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D, Dropout, BatchNormalization
from keras import models


In [None]:
# Shape of a single sample: dimensions 1-3 of X_train (everything but the sample axis).
data_shape = tuple(X_train.shape[axis] for axis in (1, 2, 3))

def basic_cnn():
  """Build and compile the digit classifier.

  Two convolution / batch-norm / max-pool stages feed two dense layers with
  batch normalisation and dropout, ending in a 10-way softmax. The input shape
  is taken from the module-level `data_shape`.

  Returns:
    The Sequential model, compiled with categorical cross-entropy loss, the
    Adam optimizer and the accuracy metric.
  """
  stack = [
    # Convolutional feature extractor.
    Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=data_shape),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(16, kernel_size=(2, 2), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(3, 3)),
    Dropout(0.2),
    # Dense classifier head.
    Flatten(),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.35),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(10, activation='softmax'),
  ]
  model = Sequential(stack)
  model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model


In [None]:
#
# Instantiate the CNN and print its layer-by-layer summary.
model0 = basic_cnn()
model0.summary()


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_18 (Conv2D)          (None, 63, 63, 32)        416       
                                                                 
 batch_normalization_31 (Ba  (None, 63, 63, 32)        128       
 tchNormalization)                                               
                                                                 
 max_pooling2d_7 (MaxPoolin  (None, 31, 31, 32)        0         
 g2D)                                                            
                                                                 
 conv2d_19 (Conv2D)          (None, 30, 30, 16)        2064      
                                                                 
 batch_normalization_32 (Ba  (None, 30, 30, 16)        64        
 tchNormalization)                                               
                                                      

In [None]:
# Train for 100 epochs in batches of 150 samples, with 20% of the training
# data set aside for validation (validation_split=0.2).
model0.fit(X_train, y_train, batch_size = 150, validation_split=0.2, epochs = 100, verbose = 1)


In [None]:
# Score the model on the held-out test set. The output below shows two values,
# presumably [loss, accuracy] given the loss and metric the model was compiled with.
model0.evaluate(X_test, y_test)



[2.1609044075012207, 0.3294117748737335]

In [None]:
# Save the trained model as an HDF5 file in the working directory.
# NOTE(review): the demo code further down passes
# 'models/spoken_digit_recognition_12.h5' as the model path, which differs from
# this filename; that path is not loaded anywhere in the visible code - confirm.
!pip install pyyaml h5py  # Required to save models in HDF5 format
model0.save("spoken_digit_recognition_.h5")

In [None]:
# Spot-check one test sample: compare its true label with the model's prediction.
index = 23
# The label is one-hot encoded, so argmax recovers the class index.
true_digit = np.argmax(y_test[index])
print('ground Truth', true_digit)
# The single image is reshaped into a batch of one before calling predict().
class_scores = model0.predict(X_test[index].reshape(1,64,64,3))
print('Prediction', np.argmax(class_scores))

ground Truth 5
Prediction 5


In [None]:
import gradio as gr

import numpy as np

from pydub import AudioSegment
import os
import scipy.io.wavfile as wav
import keras.utils as image
from keras.models import load_model
import matplotlib.pyplot as plt

import keras.utils as image

In [None]:
def wav_to_spectrogram(audio_path, save_path, spectrogram_dimensions=(64, 64), noverlap=16, cmap='gray_r'):
  """Render the spectrogram of a WAV file and save it as a borderless image.

  Args:
    audio_path: Path of the WAV file to read.
    save_path: Destination path handed to `Figure.savefig`.
    spectrogram_dimensions: Target figure size in pixels, converted to inches
      with the figure's DPI.
    noverlap: Overlap between segments, forwarded to `Axes.specgram`.
    cmap: Colormap name forwarded to `Axes.specgram`.

  The figure is closed before returning (also on error), so repeated
  predictions do not accumulate open matplotlib figures.
  """
  # Log which file is converted and where the image is written.
  print(save_path, audio_path)
  # NOTE: sample_rate is read but not forwarded; Fs stays hard-coded to 2 so the
  # generated images match the ones this notebook has always produced.
  sample_rate, samples = wav.read(audio_path)
  if samples.ndim > 1:
    # Multi-channel upload: average the channels, since specgram works on a 1-D signal.
    samples = samples.mean(axis=1)
  fig = plt.figure()
  try:
    fig.set_size_inches((spectrogram_dimensions[0]/fig.get_dpi(), spectrogram_dimensions[1]/fig.get_dpi()))
    # A single axes filling the whole figure, with no frame, ticks or labels.
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    ax.specgram(samples, cmap=cmap, Fs=2, noverlap=noverlap)
    ax.xaxis.set_major_locator(plt.NullLocator())
    ax.yaxis.set_major_locator(plt.NullLocator())
    fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
  finally:
    plt.close(fig)

def predict_digit(audio_path,path_for_model):
    """Predict the spoken digit in a WAV file with the in-memory model.

    The recording is rendered to a spectrogram PNG under ``predictions/``,
    loaded back as an image, divided by 255 (the same scaling applied to the
    training images) and passed to the global ``model0``.

    Args:
        audio_path: Path of the WAV file to classify.
        path_for_model: Path of a saved model. Currently unused - the model is
            not reloaded from disk; the parameter is kept so callers keep working.

    Returns:
        The index of the highest-scoring class (``np.argmax`` of the model output).
    """
    # Saving into a missing directory fails, so make sure the output folder exists.
    os.makedirs("predictions", exist_ok=True)

    # Process id plus a random number keep the image names of separate requests apart.
    pid = os.getpid()
    rand = np.random.randint(1000000)

    spectrogram_path = f"predictions/{pid}_testing_{rand}.png"
    wav_to_spectrogram(audio_path, spectrogram_path)

    img = image.load_img(spectrogram_path)
    X = image.img_to_array(img)
    X = np.asanyarray(X)
    X /= 255

    # predict() expects a batch, hence the reshape to a batch of one 64x64x3 image.
    return np.argmax(model0.predict(X.reshape(1,64,64,3)))


In [None]:
def gradio_interface(audio_file):
    """Gradio callback: classify the uploaded audio file.

    Args:
        audio_file: File path of the uploaded audio; may be None or empty when
            the form is submitted without a file.

    Returns:
        The digit predicted by ``predict_digit``, or a short message when no
        file was provided.
    """
    print(audio_file)

    if not audio_file:
        # Nothing to classify: answer with a message instead of letting the
        # WAV reader fail on a missing path.
        return "No audio file provided."

    return predict_digit(audio_path = audio_file,path_for_model='models/spoken_digit_recognition_12.h5')



In [None]:
import gradio as gr

# Use the top-level gr.Audio component (gradio 3.x): the gr.inputs namespace and
# its `optional` argument are deprecated and only produce warnings.
# type="filepath" makes gradio hand the callback a path to the uploaded file.
gr.Interface(
    fn=gradio_interface,
    inputs=gr.Audio(source="upload", type="filepath", label="Record"),
    outputs="text",
).launch(debug=True,share= True)

  gr.Interface(fn=gradio_interface, inputs=gr.inputs.Audio(
  gr.Interface(fn=gradio_interface, inputs=gr.inputs.Audio(


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://75933f2ee616c9bc4d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


/tmp/gradio/f457ed304e775872b2bcecf523d0c918520963ed/0_AkmalAshirmatov_8-0-100.wav
predictions/662_testing_866832.png /tmp/gradio/f457ed304e775872b2bcecf523d0c918520963ed/0_AkmalAshirmatov_8-0-100.wav




/tmp/gradio/72b2edc35293956478d8a950a34079d80ff4fb26/1_DiyoraSayfiddinova_12-0-100.wav
predictions/662_testing_643833.png /tmp/gradio/72b2edc35293956478d8a950a34079d80ff4fb26/1_DiyoraSayfiddinova_12-0-100.wav
/tmp/gradio/c2be4de4899d11ba934afe27a5766317dcbdb3b2/1_KamilaSaydullaeva_5-0-100.wav
predictions/662_testing_728271.png /tmp/gradio/c2be4de4899d11ba934afe27a5766317dcbdb3b2/1_KamilaSaydullaeva_5-0-100.wav
/tmp/gradio/7dfaf66bef08f5c60fb9f0b2758b548c83a94134/2_AmirSayfiddinov_2-0-100.wav
predictions/662_testing_737226.png /tmp/gradio/7dfaf66bef08f5c60fb9f0b2758b548c83a94134/2_AmirSayfiddinov_2-0-100.wav
/tmp/gradio/60e75429498ffe822f89d7ea8865781c89e703e9/2_AmirSayfiddinov_5-0-100.wav
predictions/662_testing_891141.png /tmp/gradio/60e75429498ffe822f89d7ea8865781c89e703e9/2_AmirSayfiddinov_5-0-100.wav
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://75933f2ee616c9bc4d.gradio.live


