In [1]:
# Fetch the course repository and make its `project` folder the working
# directory; the relative paths used further down ('music_samples',
# 'speech_samples') are resolved against it.
!git clone https://github.com/shivigup/EE603-Machine-Learning-for-Signal-Processing
%cd EE603-Machine-Learning-for-Signal-Processing/project

Cloning into 'EE603-Machine-Learning-for-Signal-Processing'...
remote: Enumerating objects: 345, done.[K
remote: Counting objects: 100% (345/345), done.[K
remote: Compressing objects: 100% (331/331), done.[K
remote: Total 345 (delta 103), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (345/345), 448.10 MiB | 19.02 MiB/s, done.
Resolving deltas: 100% (103/103), done.
Checking out files: 100% (192/192), done.
/content/EE603-Machine-Learning-for-Signal-Processing/project


In [None]:
## This notebook needs two directories in the current working directory:
##   music_samples  - contains the music recordings (.wav)
##   speech_samples - contains the speech recordings (.wav)

In [2]:
import glob
import numpy as np
import pandas as pd
import librosa
import random as rd
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
def readAudio(filename, sr=16000):
    """Load an audio file with librosa at the requested sampling rate.

    Parameters
    ----------
    filename : str or path-like
        Audio file to read.
    sr : int or None, optional
        Target sampling rate handed to ``librosa.load``. Defaults to 16000,
        the rate the rest of this notebook assumes (``Fs``).

    Returns
    -------
    tuple
        ``(x, sr)`` exactly as returned by ``librosa.load``: the sample array
        and the sampling rate of that array.
    """
    x, sr = librosa.load(filename, sr=sr)
    return x, sr

#calculate spectrogram
def calc_spec(x, n_fft=1024, hop_length=512, win_length=1024):
    """Compute a dB-scaled power spectrogram of a 1-D signal.

    Parameters
    ----------
    x : np.ndarray
        Audio samples.
    n_fft, hop_length, win_length : int, optional
        STFT parameters forwarded to ``librosa.stft`` (Hann window). The
        defaults reproduce the values that used to be hard-coded here.

    Returns
    -------
    np.ndarray
        ``librosa.power_to_db`` of the squared STFT magnitude, referenced to
        the maximum power in the signal (``ref=np.max``), so the largest
        value is 0 dB.
    """
    # np.complex256 is only defined on platforms whose long double is wide
    # enough; elsewhere referencing it raises AttributeError. Keep it where
    # available and fall back to complex128 otherwise.
    stft_dtype = getattr(np, 'complex256', np.complex128)
    magnitude = np.abs(
        librosa.stft(
            x,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window='hann',
            dtype=stft_dtype,
        )
    )
    return librosa.power_to_db(magnitude**2, ref=np.max)

def saveSpectrogram(X, outfilename):
    """Store the array ``X`` on disk with ``np.save``.

    Parameters
    ----------
    X : np.ndarray
        Array (e.g. a spectrogram) to store.
    outfilename : str or path-like
        Destination file; its name must end in ``.npy``.

    Raises
    ------
    AssertionError
        If ``outfilename`` does not end in ``.npy``.
    """
    # The explanatory text used to live in a comment, so a failing check gave
    # an empty AssertionError; attach it to the assertion instead.
    assert str(outfilename).endswith('.npy'), 'outfilename extension should be .npy'
    np.save(outfilename, X)
    return

def readSpectrogram(infilename):
    """Return whatever ``np.load`` reads from ``infilename``."""
    return np.load(infilename)

In [4]:
## creating a dataset from music files
## Each accepted music segment (random length, 5-10 s) is followed by a block
## of silence (zeros, random length, 5-10 s). Labels are per sample:
## 1 = music, 0 = silence.

Fs = 16000
# Gather the pieces in lists and concatenate once at the end. np.append
# copies the entire array on every call, which made the loop quadratic in the
# dataset length. The leading empty float64 array keeps the result 1-D
# float64 (as before) even when no file is found.
music_chunks = [np.array([])]
music_label_chunks = [np.array([])]
for i in glob.glob('music_samples'+'/*.wav'):
  print(i)
  x, sr = readAudio(i)
  n_samples = x.shape[0]
  j = 0
  while(j<n_samples):
    # Candidate segment x[j:k] with a random duration between 5 and 10 s.
    k = j + int((5 + 5*rd.uniform(0,1))*Fs)
    # Only segments that fit completely inside the file are used; a
    # candidate running past the end is dropped.
    if(k<=n_samples):
      # astype copies the slice, so the full file array is not kept alive.
      music_chunks.append(x[j:k].astype(np.float64))
      music_label_chunks.append(np.ones((k-j)))
      z = np.zeros((int((5 + 5*rd.uniform(0,1))*Fs)))
      music_chunks.append(z)
      music_label_chunks.append(z)
    # NOTE(review): k already contains j, so this jumps to 2*j + segment
    # length and skips audio after the first segment of each file; `j = k`
    # would step contiguously. Left as is because it changes the dataset
    # size that the later cells were run with - confirm intent.
    j = j+k

music_dataset = np.concatenate(music_chunks)
music_labels = np.concatenate(music_label_chunks)

music_samples/12.wav
music_samples/1.wav
music_samples/8.wav
music_samples/5.wav
music_samples/3.wav
music_samples/7.wav
music_samples/16.wav
music_samples/13.wav
music_samples/11.wav
music_samples/4.wav
music_samples/18.wav
music_samples/15.wav
music_samples/2.wav
music_samples/17.wav
music_samples/9.wav
music_samples/14.wav
music_samples/10.wav
music_samples/6.wav


In [5]:
## Creating a dataset from speech files
## Each accepted speech segment (random length, 5-10 s) is followed by a block
## of silence (zeros, random length, 5-10 s). Labels are per sample:
## 2 = speech, 0 = silence.

Fs = 16000
# Gather the pieces in lists and concatenate once at the end. np.append
# copies the entire array on every call, which made the loop quadratic in the
# dataset length. The leading empty float64 array keeps the result 1-D
# float64 (as before) even when no file is found.
speech_chunks = [np.array([])]
speech_label_chunks = [np.array([])]
for i in glob.glob('speech_samples'+'/*.wav'):
  print(i)
  x, sr = readAudio(i)
  n_samples = x.shape[0]
  j = 0
  while(j<n_samples):
    # Candidate segment x[j:k] with a random duration between 5 and 10 s.
    k = j + int((5 + 5*rd.uniform(0,1))*Fs)
    # Only segments that fit completely inside the file are used; a
    # candidate running past the end is dropped.
    if(k<=n_samples):
      # astype copies the slice, so the full file array is not kept alive.
      speech_chunks.append(x[j:k].astype(np.float64))
      speech_label_chunks.append(np.full(k-j, 2.0))
      z = np.zeros((int((5 + 5*rd.uniform(0,1))*Fs)))
      speech_chunks.append(z)
      speech_label_chunks.append(z)
    # NOTE(review): k already contains j, so this jumps to 2*j + segment
    # length and skips audio after the first segment of each file; `j = k`
    # would step contiguously. Left as is because it changes the dataset
    # size that the later cells were run with - confirm intent.
    j = j+k

speech_dataset = np.concatenate(speech_chunks)
speech_labels = np.concatenate(speech_label_chunks)

speech_samples/27.wav
speech_samples/12.wav
speech_samples/1.wav
speech_samples/8.wav
speech_samples/5.wav
speech_samples/23.wav
speech_samples/3.wav
speech_samples/29.wav
speech_samples/7.wav
speech_samples/21.wav
speech_samples/16.wav
speech_samples/13.wav
speech_samples/11.wav
speech_samples/20.wav
speech_samples/4.wav
speech_samples/18.wav
speech_samples/15.wav
speech_samples/24.wav
speech_samples/25.wav
speech_samples/26.wav
speech_samples/28.wav
speech_samples/2.wav
speech_samples/17.wav
speech_samples/9.wav
speech_samples/19.wav
speech_samples/22.wav
speech_samples/30.wav
speech_samples/14.wav
speech_samples/10.wav
speech_samples/6.wav


In [6]:
# Sample counts of each signal and of its per-sample label track
# (music first, then speech).
for arr in (music_dataset, music_labels, speech_dataset, speech_labels):
  print(arr.shape)

(16583553,)
(16583553,)
(16926697,)
(16926697,)


In [7]:
## to remove skewness: trim both classes to the same number of samples.
# Segment lengths are drawn at random, so either set can end up longer.
# Truncating only the speech arrays did nothing when music was the longer
# one, and the later cells size both label vectors from the speech
# spectrogram, which would then leave features and labels mismatched.
n_common = min(music_dataset.shape[0], speech_dataset.shape[0])
speech_dataset = speech_dataset[:n_common]
speech_labels = speech_labels[:n_common]
music_dataset = music_dataset[:n_common]
music_labels = music_labels[:n_common]

In [8]:
# Re-check the four lengths after balancing (music first, then speech).
for signal in (music_dataset, music_labels, speech_dataset, speech_labels):
  print(signal.shape)

(16583553,)
(16583553,)
(16583553,)
(16583553,)


In [9]:
## adding noise
STD_n= 0.01
speech_dataset = speech_dataset + np.random.normal(0, STD_n, speech_dataset.shape[0])
music_dataset = music_dataset + np.random.normal(0, STD_n, music_dataset.shape[0])

In [10]:
# Peek at the first ten (now noisy) speech samples.
speech_dataset[:10]

array([ 4.89400388e-05,  2.69982439e-03, -9.02215279e-03,  6.85640440e-03,
        2.05212724e-03,  1.21720528e-02, -1.54846461e-03, -1.33195980e-02,
       -1.61984223e-02, -8.61528982e-03])

In [11]:
# dB spectrograms of the two noisy signals (speech computed first).
spec_speech, spec_music = calc_spec(speech_dataset), calc_spec(music_dataset)
for spec in (spec_speech, spec_music):
  print(spec.shape)

(513, 32390)
(513, 32390)


In [12]:
n = spec_speech.shape[1]


def _frame_labels(sample_labels, n_frames, n_fft=1024, hop=512):
  """Turn per-sample labels into one label per spectrogram frame.

  Vectorised equivalent of looping over every sample index ``i >= n_fft``
  and writing each non-zero ``sample_labels[i]`` to frame
  ``(i - n_fft)//hop + 1``: a frame gets the last non-zero label that maps
  to it and stays 0 if there is none. Frame 0 is never written.

  n_fft / hop default to the values hard-coded in the original loop
  (1024 and 512), which are also the STFT settings used by calc_spec.
  """
  frame_labels = np.zeros((n_frames))
  # Offsets (relative to n_fft) of all samples carrying a non-zero label,
  # in ascending order.
  offsets = np.flatnonzero(sample_labels[n_fft:])
  if offsets.size:
    frames = offsets//hop + 1
    # frames is non-decreasing; keep only the last sample of every run so
    # the result matches the sequential "later sample overwrites" rule.
    is_last = np.append(frames[1:] != frames[:-1], True)
    frame_labels[frames[is_last]] = sample_labels[offsets[is_last] + n_fft]
  return frame_labels


# The original loop only visited indices below the speech length for both
# label tracks, so the music track is limited to the same range.
n_labelled = speech_labels.shape[0]
labels_spec_speech = _frame_labels(speech_labels, n)
labels_spec_music = _frame_labels(music_labels[:n_labelled], n)

In [13]:
# One row per frame, one column per frequency bin; all speech frames come
# first, followed by all music frames. y is ordered the same way.
X = np.concatenate((spec_speech.T, spec_music.T), axis=0)
y = np.concatenate((labels_spec_speech, labels_spec_music))

for arr in (X, y):
  print(arr.shape)

#music 1, speech 2 (0 marks the inserted silence)

(64780, 513)
(64780,)


In [14]:
## train-test split
# Shuffle the frame indices and use the first 80% for training and the
# remainder for testing.
N = X.shape[0]
train_n = int(0.8*N)
indices = np.random.permutation(N)
training_idx, test_idx = np.split(indices, [train_n])
training_X, training_Y = X[training_idx], y[training_idx]
test_X, test_Y = X[test_idx], y[test_idx]

In [15]:
# Fully connected classifier: three ReLU layers (513, 64, 32 units) and a
# 3-way softmax output, one unit per label value (0, 1, 2).
dense_stack = [tf.keras.layers.Flatten()]
for units in (513, 64, 32):
  dense_stack.append(tf.keras.layers.Dense(units, activation=tf.nn.relu))
dense_stack.append(tf.keras.layers.Dense(3, activation=tf.nn.softmax))
model = tf.keras.models.Sequential(dense_stack)

In [16]:
# Sparse categorical cross-entropy is used because the labels are integer
# class ids; Adam optimiser, accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Train on the training frames for 25 epochs.
model.fit(x=training_X, y=training_Y, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f9f5d9d9c50>

In [18]:
# Evaluate on the held-out frames; the cell displays the returned values
# (loss followed by the compiled metric, accuracy).
model.evaluate(x=test_X, y=test_Y)



[0.3400569260120392, 0.8925594091415405]

In [27]:
# Persist the trained model under the relative path "model".
model.save(filepath="model")

INFO:tensorflow:Assets written to: model/assets
