In [1]:
## Google Drive layout expected by this notebook (paths are relative to MyDrive,
## which a later cell makes the working directory):
##   folder called "spectrogram" contains all the test files (test_sample-<j>.npy)
##   folder called "Model_parameters" contains all the models (e.g. NN_model)
from google.colab import drive
# Mount the user's Google Drive under /content/drive (only works inside Colab).
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive

/content/drive/MyDrive


In [3]:
import numpy as np
import librosa 
import glob
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras

In [4]:
def load_spectogram(dirname, n=9):
  """Load the spectrograms ``test_sample-0.npy`` ... ``test_sample-<n>.npy``.

  Parameters
  ----------
  dirname : str
      Directory that holds the ``.npy`` files. Any directory name works; the
      sample name is derived from the file path, not from a fixed offset.
  n : int, optional
      Index of the last sample to read; files 0..n (inclusive) are loaded.
      Defaults to 9, i.e. ten files.

  Returns
  -------
  filename : numpy.ndarray of str, shape (n + 1, 1)
      Sample names without directory and without the ``.npy`` extension.
  spec : numpy.ndarray, shape (n + 1, rows, cols)
      The 2-D spectrograms stacked along a new leading axis.

  Raises
  ------
  ValueError
      If ``n`` is negative, or if the files do not all share one shape.
  """
  import os

  if n < 0:
    raise ValueError("n must be >= 0")

  names = []
  layers = []
  for j in range(n + 1):
    path = dirname + "/test_sample-" + str(j) + ".npy"
    spectog = np.load(path)
    # Strip directory and extension, whatever the length of `dirname` is.
    names.append(os.path.splitext(os.path.basename(path))[0])
    layers.append(spectog.reshape((1, spectog.shape[0], spectog.shape[1])))

  # The zero-length float64 seed keeps the result dtype promotion of the
  # incremental version while concatenating only once.
  seed = np.empty((0,) + layers[0].shape[1:])
  spec = np.vstack([seed] + layers)
  # Column vector of names: downstream code indexes it row-wise and feeds it
  # to pandas as a single-column table.
  filename = np.array(names).reshape(-1, 1)
  return filename, spec

In [5]:
def meanfilt (x, k):
    """Apply a length-k mean (moving-average) filter to a signal x.

    Boundaries are extended by repeating endpoints, so the output has the
    same length as the input.

    Parameters
    ----------
    x : array_like
        Either a 1-D signal of length n, or a single-row array of shape
        (1, n) (the form the caller in this notebook passes in).
    k : int
        Window length; must be odd so the window is centred on each sample.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Mean of the k-sample window centred on every input sample.

    Raises
    ------
    AssertionError
        If k is even.
    """
    x = np.asarray(x)
    if x.ndim != 1:
        # Single-row input: flatten (1, n) to (n,). Anything that cannot be
        # flattened this way still fails here, as it always did.
        x = np.reshape(x, (x.shape[1],))
    assert k % 2 == 1, "Mean filter length must be odd."
    assert x.ndim == 1, "Input must be one-dimensional."

    k2 = (k - 1) // 2
    # Column c of y holds the signal shifted by (c - k2) samples, so each row
    # of y is the full window around one sample.
    y = np.zeros ((len (x), k), dtype=x.dtype)
    y[:,k2] = x
    for i in range (k2):
        j = k2 - i
        # Left neighbours, padded with the first sample.
        y[j:,i] = x[:-j]
        y[:j,i] = x[0]
        # Right neighbours, padded with the last sample.
        y[:-j,-(i+1)] = x[j:]
        y[-j:,-(i+1)] = x[-1]
    return np.mean (y, axis=1)

In [6]:
def calc_threshold_and_rmse(spec):
  """Compute a smoothed RMS-energy curve and an energy threshold per clip.

  For every spectrogram ``spec[i]`` the function
    1. de-noises it with ``librosa.decompose.nn_filter`` (average aggregation),
    2. computes the frame-wise RMS with ``librosa.feature.rms``,
    3. smooths the curve with an 11-point mean filter (``meanfilt``),
    4. sets the threshold to ``0.965 * peak + 0.035 * floor`` where ``peak``
       is the maximum of the curve from frame 15 onwards and ``floor`` is the
       minimum of the whole curve.

  Parameters
  ----------
  spec : numpy.ndarray, shape (n_clips, rows, n_frames)
      Stack of 2-D spectrograms.

  Returns
  -------
  thres : numpy.ndarray, shape (n_clips, 1)
      One threshold per clip.
  rmse : numpy.ndarray, shape (n_clips, n_frames)
      Smoothed RMS curve of every clip, one row per clip.
  """
  n_clips = spec.shape[0]
  thres = np.empty((n_clips, 1))
  # Zero-row seed fixes the expected row length; all curves are stacked once
  # at the end instead of re-allocating the array on every iteration.
  curves = [np.empty((0, spec[0].shape[1]))]
  for i in range(n_clips):
      filtered = librosa.decompose.nn_filter(spec[i], aggregate=np.average)
      rmse_val = librosa.feature.rms(S=filtered, frame_length=1024, hop_length=512)
      rmse_val = meanfilt(rmse_val, 11)
      curves.append(rmse_val)
      # The first 15 frames are left out of the peak search.
      peak = np.max(rmse_val[15:rmse_val.shape[0]])
      floor = np.min(rmse_val)
      thres[i] = (0.965)*peak + (0.035)*floor
  rmse = np.vstack(curves)
  return thres, rmse

In [7]:
def calc_timestamp(thres,rmse,filename):
  """Find runs of frames whose RMS value is at or below the clip threshold.

  Each row of ``rmse`` is scanned from frame 1 onwards (frame 0 is never
  inspected). A maximal run of consecutive frames with
  ``rmse[i][j] <= thres[i]`` is reported when it is at least 25 frames long.

  Parameters
  ----------
  thres : numpy.ndarray, shape (n_clips, 1)
      Threshold of every clip.
  rmse : numpy.ndarray, shape (n_clips, n_frames)
      RMS curve of every clip.
  filename : sequence, length n_clips
      Name of every clip; ``filename[i]`` is copied to the output for each
      run found in clip ``i``.

  Returns
  -------
  time_stamp : numpy.ndarray of float, shape (n_runs, 3)
      Rows of ``[clip_index, onset_frame, offset_frame]``; the offset is
      exclusive (first frame after the run, or ``n_frames``).
  names : numpy.ndarray, shape (n_runs, 1)
      Clip name belonging to each row of ``time_stamp``. An empty (0, 1)
      array when no run was found.
  """
  min_frames = 25
  n_frames = rmse.shape[1]
  stamp_rows = []
  name_rows = []
  for i in range(rmse.shape[0]):
      j = 1
      while j < n_frames:
          if rmse[i][j] <= thres[i]:
              onset = j
              while j < n_frames and rmse[i][j] <= thres[i]:
                  j += 1
              offset = j
              if offset - onset >= min_frames:
                  stamp_rows.append([i, onset, offset])
                  name_rows.append(filename[i])
          else:
              # Always advance on a frame that is not part of a run. This
              # also covers NaN values, for which both `<=` and `>` are
              # false and the scan would otherwise never move on.
              j += 1
  time_stamp = np.array(stamp_rows, dtype=float).reshape(-1, 3)
  names = np.vstack(name_rows) if name_rows else np.empty((0, 1))
  return time_stamp, names

In [8]:
def aggregate_label_NN(a):
  """Collapse per-frame two-column scores into one label for the segment.

  Every row votes for the column with the larger score (column 0 wins ties)
  and contributes that score to the column's running total. Column 0 is
  accumulated as "speech" and column 1 as "music".

  Returns 0 when the speech total is strictly larger than the music total,
  otherwise 1 (this includes ties and an input with no rows).
  """
  speech_score = 0
  music_score = 0
  for row in a:
    p_speech, p_music = row[0], row[1]
    if p_music > p_speech:
      music_score = music_score + p_music
    else:
      speech_score = speech_score + p_speech
  return 0 if speech_score > music_score else 1

In [9]:
def label_timestamp_NN(model,spec,time_stamp):
  """Label every detected segment with the network's aggregated decision.

  For each row ``[clip, onset, offset]`` of ``time_stamp`` the frames
  ``spec[clip][:, onset:offset]`` are transposed (one frame per row), passed
  to ``model.predict`` and reduced to a single label with
  ``aggregate_label_NN``.

  Returns an integer array of shape (n_segments, 1) holding 0 or 1.
  """
  labels = np.full((time_stamp.shape[0], 1), 0)
  for row, stamp in enumerate(time_stamp):
    clip, start, stop = int(stamp[0]), int(stamp[1]), int(stamp[2])
    frames = spec[clip][:, start:stop]
    scores = model.predict(frames.T)
    labels[row][0] = 0 if aggregate_label_NN(scores) == 0 else 1
  return labels

In [10]:
def audio_tag(filename,names,b):
    """Build the per-file tag table from the per-segment labels.

    Parameters
    ----------
    filename : numpy.ndarray, shape (n_files, 1)
        Name of every file; becomes the ``filename`` column.
    names : numpy.ndarray, shape (n_segments, 1)
        File name each detected segment belongs to.
    b : str or array_like of str
        Label of each segment, ``'speech'`` or anything else (treated as
        music). May be a column vector, a flat sequence, or a single bare
        string when there is exactly one segment.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename``, ``Music``, ``Speech``; the flags are 1 when the
        file has at least one segment with that label, else 0.
    """
    # Flatten the labels so labels[i] is always one whole label. Indexing a
    # bare string would yield a single character and misclassify speech.
    labels = np.asarray(b).reshape(-1)
    n_files = filename.shape[0]
    sp = np.full((n_files,), 0)
    ms = np.full((n_files,), 0)
    for k, name in enumerate(filename):
        for i in range(names.shape[0]):
            if name == names[i]:
                if labels[i] == 'speech':
                    sp[k] = 1
                else:
                    ms[k] = 1
    result = pd.DataFrame(filename, columns=['filename'])
    result['Music'] = ms
    result['Speech'] = sp
    return result

In [11]:
if __name__=="__main__":
    # Framing constants used to convert frame indices to seconds. The hop and
    # frame lengths match the values passed to librosa.feature.rms in
    # calc_threshold_and_rmse.
    HOP_LENGTH = 512
    FRAME_LENGTH = 1024
    SAMPLE_RATE = 16000  # presumably the audio sample rate in Hz -- TODO confirm

    filename,spec = load_spectogram('spectrogram')
    thres,rmse = calc_threshold_and_rmse(spec)
    time_stamp,names = calc_timestamp(thres,rmse,filename)
    model = keras.models.load_model('Model_parameters/NN_model')
    # Labelling slices the spectrograms by frame index, so it has to happen
    # before the onset/offset columns are converted to seconds below.
    l = label_timestamp_NN(model,spec,time_stamp)

    # Onset/offset: frame index -> seconds (all rows at once, in place).
    time_stamp[:, 1:3] = ((time_stamp[:, 1:3] - 1)*HOP_LENGTH + FRAME_LENGTH)/SAMPLE_RATE
    final_time_stamp_NN = np.hstack([time_stamp, l])

    # One text label per segment as a column vector. Built in one step so it
    # is well-formed for zero segments (previously `b` was never defined) and
    # for a single segment (previously a bare string).
    b = np.where(l[:, 0] == 0, 'speech', 'music').reshape(-1, 1)

    # Per-segment table, left in the notebook namespace for inspection only;
    # it is not written to disk.
    segments = pd.DataFrame({
        'filename': names[:, 0],
        'onset_s': final_time_stamp_NN[:, 1],
        'offset_s': final_time_stamp_NN[:, 2],
        'label': b[:, 0],
    })

    result = audio_tag(filename,names,b)
    print(result)
    result.to_csv('NN_tag_detection.csv',index=False)

        filename  Music  Speech
0  test_sample-0      1       0
1  test_sample-1      1       0
2  test_sample-2      1       0
3  test_sample-3      0       1
4  test_sample-4      0       1
5  test_sample-5      0       0
6  test_sample-6      0       1
7  test_sample-7      1       0
8  test_sample-8      0       1
9  test_sample-9      0       1
