In [None]:
import numpy as np
import librosa 
import glob
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from scipy.stats import multivariate_normal

In [None]:
def readAudio(filename,Fs):
    """Load an audio file through ``librosa.load`` at the requested rate.

    Parameters
    ----------
    filename : path of the audio file, forwarded to ``librosa.load``.
    Fs : target sampling rate, forwarded as ``sr``.

    Returns
    -------
    (samples, rate) : the two values produced by ``librosa.load``, in order.
    """
    samples, rate = librosa.load(filename, sr=Fs)
    return samples, rate

#calculate spectrogram
def calc_spec(x, n_fft=1024, hop_length=512, win_length=1024):
    """Return the log-power (dB) spectrogram of an audio signal.

    The signal is transformed with ``librosa.stft`` using a Hann window, the
    magnitude is squared to obtain power, and the power is converted to
    decibels with ``librosa.power_to_db`` using the maximum as reference.

    Parameters
    ----------
    x : audio samples, passed straight to ``librosa.stft``.
    n_fft : FFT size (default 1024).
    hop_length : hop between successive frames, in samples (default 512).
    win_length : analysis window length, in samples (default 1024).

    Returns
    -------
    The dB-scaled spectrogram returned by ``librosa.power_to_db``
    (librosa documents the STFT layout as frequency bins x frames).
    """
    # np.complex256 only exists on platforms whose long double is wider than
    # 64 bits; referencing it elsewhere raises AttributeError. Keep it where
    # available (unchanged behaviour) and fall back to complex128 otherwise.
    stft_dtype = getattr(np, 'complex256', np.complex128)
    magnitude = np.abs(
        librosa.stft(
            x,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window='hann',
            dtype=stft_dtype,
        )
    )
    return librosa.power_to_db(magnitude**2, ref=np.max)

def saveSpectrogram(X, outfilename):
    """Write spectrogram ``X`` to ``outfilename`` with ``np.save``.

    Parameters
    ----------
    X : array to store.
    outfilename : destination path (``str`` or path-like); must end in
        ``.npy``.

    Raises
    ------
    AssertionError : if ``outfilename`` does not end with ``.npy``.
    """
    # Checking through str() lets pathlib.Path objects pass the same test
    # that plain strings do; the message makes a failure self-explanatory.
    assert str(outfilename).endswith('.npy'), 'outfilename extension should be .npy'
    np.save(outfilename, X)

def readSpectrogram(infilename):
    """Load and return the array stored at ``infilename`` via ``np.load``."""
    return np.load(infilename)


In [None]:
# Default sample ids for each folder. Id '13' appears in neither list of the
# original code -- presumably those files are missing; verify against the data.
_MUSIC_IDS = ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
              '14', '15', '16', '17', '18')
_SPEECH_IDS = ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
               '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
               '24', '25', '26', '27', '28', '29', '30')


def _load_and_join(folder, ids, Fs):
  """Load ``<folder>/<id>.wav`` for every id and join the waveforms end to end."""
  if len(ids) == 0:
    raise ValueError('no sample ids given for folder ' + repr(folder))
  clips = []
  for i in ids:
    name = f'{folder}/{i}.wav'
    print(name)
    audio, _ = readAudio(name, Fs)
    clips.append(audio)
  # Join once at the end instead of re-copying the growing array per file.
  return np.hstack(clips)


def read_and_combine_speech_music_spec(music_dir='music_samples',
                                       speech_dir='speech_samples',
                                       music_ids=_MUSIC_IDS,
                                       speech_ids=_SPEECH_IDS,
                                       Fs=16000):
  """Build spectrograms and frame labels for the speech/music samples.

  Every music file is loaded and concatenated into one waveform, then the
  same is done for the speech files; each waveform is turned into a
  spectrogram with ``calc_spec``. Called without arguments it behaves as
  before: the built-in id lists, the ``music_samples`` / ``speech_samples``
  folders and a 16000 Hz load rate.

  Parameters
  ----------
  music_dir, speech_dir : folders holding ``<id>.wav`` files.
  music_ids, speech_ids : ids of the files to load from each folder.
  Fs : sampling rate handed to ``readAudio``.

  Returns
  -------
  speech_spec : spectrogram of the concatenated speech audio.
  music_spec : spectrogram of the concatenated music audio.
  label : 1-D array with one entry per spectrogram column, 0 for speech
      columns followed by 1 for music columns.
  combine : ``speech_spec`` and ``music_spec`` stacked column-wise, in the
      same order as ``label``.

  Raises
  ------
  ValueError : if either id list is empty.
  """
  # Music is loaded first, then speech, matching the original print order.
  music_data = _load_and_join(music_dir, music_ids, Fs)
  speech_data = _load_and_join(speech_dir, speech_ids, Fs)

  speech_spec = calc_spec(speech_data)
  music_spec = calc_spec(music_data)

  # One label per spectrogram column: speech -> 0, music -> 1.
  label_speech = np.zeros((speech_spec.shape[1],))
  label_music = np.ones((music_spec.shape[1],))
  label = np.hstack([label_speech, label_music])
  combine = np.hstack([speech_spec, music_spec])
  return speech_spec, music_spec, label, combine

In [None]:
def define_NN():
  """Create and compile the dense classifier used in this notebook.

  Architecture: Flatten -> Dense(513, relu) -> Dense(64, relu) ->
  Dense(32, relu) -> Dense(2, softmax). The model is compiled with the Adam
  optimizer, sparse categorical cross-entropy loss and an accuracy metric.

  Returns
  -------
  The compiled ``tf.keras`` Sequential model (not yet trained).
  """
  layers = tf.keras.layers

  # Layers are created in network order: input flattening, hidden stack, output.
  stack = [layers.Flatten()]
  for width in (513, 64, 32):
    stack.append(layers.Dense(width, activation=tf.nn.relu))
  stack.append(layers.Dense(2, activation=tf.nn.softmax))

  network = tf.keras.models.Sequential(stack)
  network.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
  return network

In [None]:
# Training entry point: build the dataset, fit the classifier, save it.
if __name__=="__main__":
  # Load every sample, compute the spectrograms and the per-column labels
  # (0 = speech, 1 = music).
  speech_spec, music_spec, label, combine = read_and_combine_speech_music_spec()
  model = define_NN()
  # `label` holds one entry per column of `combine`, so the matrix is
  # transposed to put one training example (one spectrogram column) per row.
  model.fit(combine.T, label, epochs=20)
  # Store the trained model under the name 'NN_model'.
  model.save('NN_model')
  # Disabled post-processing, kept for reference. It relies on names that are
  # not defined in the visible cells (label_timestamp_NN, spec, time_stamp).
  # The arithmetic appears to convert frame indices to seconds using the
  # 512-sample hop, 1024-sample window and 16000 Hz rate -- TODO confirm.
  #l=label_timestamp_NN(model,spec,time_stamp)
  #for i  in range(0,time_stamp.shape[0]):
  #  time_stamp[i,1]= ((time_stamp[i,1]-1)*512 + 1024)/16000
  #  time_stamp[i,2]= ((time_stamp[i,2]-1)*512 + 1024)/16000
  #final_time_stamp_NN=np.hstack([time_stamp,l])
  #df = pd.read_csv('labels.csv')
  #print(final_time_stamp_NN,'orig', df)

music_samples/1.wav
music_samples/2.wav
music_samples/3.wav
music_samples/4.wav
music_samples/5.wav
music_samples/6.wav
music_samples/7.wav
music_samples/8.wav
music_samples/9.wav
music_samples/10.wav
music_samples/11.wav
music_samples/12.wav
music_samples/14.wav
music_samples/15.wav
music_samples/16.wav
music_samples/17.wav
music_samples/18.wav
speech_samples/1.wav
speech_samples/2.wav
speech_samples/3.wav
speech_samples/4.wav
speech_samples/5.wav
speech_samples/6.wav
speech_samples/7.wav
speech_samples/8.wav
speech_samples/9.wav
speech_samples/10.wav
speech_samples/11.wav
speech_samples/12.wav
speech_samples/14.wav
speech_samples/15.wav
speech_samples/16.wav
speech_samples/17.wav
speech_samples/18.wav
speech_samples/19.wav
speech_samples/20.wav
speech_samples/21.wav
speech_samples/22.wav
speech_samples/23.wav
speech_samples/24.wav
speech_samples/25.wav
speech_samples/26.wav
speech_samples/27.wav
speech_samples/28.wav
speech_samples/29.wav
speech_samples/30.wav
Epoch 1/20
Epoch 2/20
E