In [None]:
import glob
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.stats import multivariate_normal

In [None]:
def load_spectogram(dirname):
    """Load every ``.npy`` spectrogram in ``dirname`` into one stacked array.

    Parameters
    ----------
    dirname : str
        Directory that is searched (non-recursively) for ``*.npy`` files.

    Returns
    -------
    filename : numpy.ndarray
        String array of shape ``(N, 1)`` holding each file's base name without
        directory and without the ``.npy`` extension. It is always
        two-dimensional, also for ``N == 1``, so ``filename[i]`` selects a
        clip name rather than a single character.
    spec : numpy.ndarray
        Array of shape ``(N, rows, cols)``; entry ``i`` is the array stored in
        the file named by ``filename[i]``. Promoted to at least ``float64``.

    Raises
    ------
    FileNotFoundError
        If ``dirname`` contains no ``.npy`` file.
    ValueError
        If the stored arrays are not two-dimensional or differ in shape.
    """
    # Sort so the clip order (and everything derived from it) is reproducible;
    # glob itself returns files in arbitrary, filesystem-dependent order.
    paths = sorted(glob.glob(os.path.join(dirname, '*.npy')))
    if not paths:
        raise FileNotFoundError("No .npy spectrograms found in %r" % (dirname,))

    # Derive the clip name from the path itself instead of slicing a fixed
    # number of characters, which only worked for one specific directory name.
    stems = [os.path.splitext(os.path.basename(path))[0] for path in paths]

    # Stack once instead of growing the array with vstack on every file.
    stacked = np.stack([np.load(path) for path in paths])
    if stacked.ndim != 3:
        raise ValueError(
            "Expected 2-D spectrograms, got arrays with %d dimension(s)"
            % (stacked.ndim - 1)
        )

    # The previous implementation seeded the stack with a float64 array, so
    # the result was promoted to at least float64; keep that dtype contract.
    spec = stacked.astype(np.result_type(stacked.dtype, np.float64), copy=False)
    filename = np.array(stems).reshape(-1, 1)
    return filename, spec

In [None]:
def meanfilt (x, k):
    """Apply a length-k moving-average (mean) filter to a 1-D signal.

    Boundaries are extended by repeating the first and last sample, so the
    output has the same length as the input.

    Parameters
    ----------
    x : array_like
        Either a 1-D signal of shape ``(n,)`` or a single-row 2-D array of
        shape ``(1, n)``.
    k : int
        Filter length; must be odd.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n`` holding the windowed means.

    Raises
    ------
    AssertionError
        If ``k`` is even or ``x`` is neither 1-D nor a single-row 2-D array.
    """
    x = np.asarray(x)
    # Accept the single-row 2-D layout as well as a plain 1-D vector.
    if x.ndim == 2 and x.shape[0] == 1:
        x = x[0]
    assert k % 2 == 1, "Mean filter length must be odd."
    assert x.ndim == 1, "Input must be one-dimensional."

    k2 = (k - 1) // 2
    # Column k2 holds the signal itself; the columns on either side hold the
    # signal shifted by 1..k2 samples, padded with the nearest endpoint.
    y = np.zeros ((len (x), k), dtype=x.dtype)
    y[:,k2] = x
    for i in range (k2):
        j = k2 - i
        y[j:,i] = x[:-j]
        y[:j,i] = x[0]
        y[:-j,-(i+1)] = x[j:]
        y[-j:,-(i+1)] = x[-1]
    # Averaging across the shifted copies yields the moving mean per sample.
    return np.mean (y, axis=1)

In [None]:
def calc_threshold_and_rmse(spec, filt_len=11, skip_frames=15,
                            max_weight=0.965, min_weight=0.035):
    """Compute a smoothed RMS-energy curve and an energy threshold per clip.

    For every clip the spectrogram is passed through
    ``librosa.decompose.nn_filter`` (averaging aggregate), converted to
    frame-wise RMS energy with ``librosa.feature.rms`` and smoothed with
    ``meanfilt``. The threshold is a weighted sum of the curve's maximum
    (ignoring the first ``skip_frames`` frames) and its global minimum.

    Parameters
    ----------
    spec : numpy.ndarray
        Array of shape ``(n_clips, rows, n_frames)``.
    filt_len : int, optional
        Odd length of the mean filter applied to the RMS curve.
    skip_frames : int, optional
        Number of leading frames excluded when searching for the maximum.
    max_weight, min_weight : float, optional
        Weights of the maximum and the minimum in the threshold.

    Returns
    -------
    thres : numpy.ndarray
        Shape ``(n_clips, 1)``; one threshold per clip.
    rmse : numpy.ndarray
        Shape ``(n_clips, n_frames)``; the smoothed RMS curve per clip.
    """
    n_clips = spec.shape[0]
    n_frames = spec.shape[2]
    thres = np.empty((n_clips, 1))
    # Preallocate instead of growing the matrix with vstack on every clip.
    rmse = np.empty((n_clips, n_frames))
    for i in range(n_clips):
        filtered = librosa.decompose.nn_filter(spec[i], aggregate=np.average)
        # NOTE(review): frame_length presumably has to agree with the number
        # of spectrogram rows (1024 // 2 + 1) -- confirm against the data.
        energy = librosa.feature.rms(S=filtered, frame_length=1024, hop_length=512)
        energy = meanfilt(energy, filt_len)
        rmse[i] = energy
        # Local names chosen so the builtins max/min are not shadowed.
        peak = np.max(energy[skip_frames:])
        floor = np.min(energy)
        thres[i] = max_weight * peak + min_weight * floor
    return thres, rmse

In [None]:
def calc_timestamp(thres, rmse, filename, min_frames=25):
    """Find runs of frames whose energy is at or below the clip's threshold.

    Parameters
    ----------
    thres : numpy.ndarray
        One threshold per clip, shape ``(n_clips, 1)`` or ``(n_clips,)``.
    rmse : numpy.ndarray
        Energy curves of shape ``(n_clips, n_frames)``.
    filename : sequence
        ``filename[i]`` is the name recorded for clip ``i``.
    min_frames : int, optional
        Minimum run length (``offset - onset``) for a run to be reported.

    Returns
    -------
    time_stamp : numpy.ndarray
        Float array of shape ``(n_events, 3)`` with rows
        ``[clip_index, onset_frame, offset_frame]``; ``offset_frame`` is the
        first frame after the run.
    names : numpy.ndarray
        Array of shape ``(n_events, 1)`` with ``filename[clip_index]`` for
        every reported run.
    """
    limits = np.ravel(thres)
    n_frames = rmse.shape[1]
    events = []
    event_names = []
    for i in range(rmse.shape[0]):
        limit = limits[i]
        curve = rmse[i]
        # The scan starts at frame 1; frame 0 is never inspected (kept from
        # the original implementation).
        j = 1
        while j < n_frames:
            if curve[j] <= limit:
                onset = j
                while j < n_frames and curve[j] <= limit:
                    j += 1
                if j - onset >= min_frames:
                    events.append((i, onset, j))
                    event_names.append(filename[i])
            else:
                # Always advance on a frame that is not at/below the
                # threshold. This includes NaN frames, which compare False
                # both ways and previously stalled the loop forever.
                j += 1

    if events:
        time_stamp = np.array(events, dtype=np.float64)
        names = np.array(event_names).reshape(-1, 1)
    else:
        time_stamp = np.empty((0, 3))
        names = np.empty((0, 1))
    return time_stamp, names

In [None]:
def aggregate_label_GMM(a):
  """Majority-vote a label from per-frame scores.

  Each row of ``a`` holds two scores: column 0 (counted as speech) and
  column 1 (counted as music). A row votes music only when its column-1
  score is strictly greater than its column-0 score.

  Returns 0 when speech has strictly more votes than music, otherwise 1
  (ties and empty input therefore yield 1).
  """
  music_votes = sum(1 for frame in a if frame[1] > frame[0])
  speech_votes = a.shape[0] - music_votes
  return 0 if speech_votes > music_votes else 1

In [None]:
def label_timestamp_GMM(gmm_music,gmm_speech,spec,time_stamp):
  """Assign a 0/1 label to every detected segment using the two models.

  For each row ``[clip_index, onset, offset]`` of ``time_stamp`` the frames
  ``onset:offset`` of ``spec[clip_index]`` are cut out, reduced with ``PCA``
  and scored frame by frame with ``gmm_music.predict`` (stored in column 1)
  and ``gmm_speech.predict`` (stored in column 0). ``aggregate_label_GMM``
  turns those per-frame scores into the segment label.

  Returns an integer array of shape ``(len(time_stamp), 1)`` holding the
  label of each segment.
  """
  labels = np.full((time_stamp.shape[0], 1), 0)
  for row, stamp in enumerate(time_stamp):
    clip, start, stop = int(stamp[0]), int(stamp[1]), int(stamp[2])
    segment = spec[clip][:, start:stop]
    # PCA receives one row per frame and returns one column per frame.
    features = PCA(np.float64(segment).T)
    n_frames = features.shape[1]
    scores = np.full((n_frames, 2), 0.0)
    scores[:, 1] = gmm_music.predict(features.T).reshape((n_frames,))
    scores[:, 0] = gmm_speech.predict(features.T).reshape((n_frames,))
    labels[row][0] = 0 if aggregate_label_GMM(scores) == 0 else 1
  return labels

In [None]:
def PCA(spec, n_com=16):
    """Project ``spec`` onto its leading principal components.

    Parameters
    ----------
    spec : numpy.ndarray
        Array of shape ``(n_samples, n_features)``; every column is treated
        as one variable.
    n_com : int, optional
        Number of leading components to keep.

    Returns
    -------
    numpy.ndarray
        Real array of shape ``(n_com, n_samples)`` holding the mean-centred
        data expressed in the ``n_com`` directions of largest variance.
    """
    centered = spec - np.average(spec, axis=0)
    cov_mat = np.cov(centered, rowvar=False)  # each column represents a variable
    # The covariance matrix is symmetric, so use the symmetric solver: it
    # always returns real eigenpairs. The general solver can return complex
    # values for rank-deficient input, which were then silently truncated
    # downstream. The sign of each component remains arbitrary, as before.
    eigen_values, eigen_vectors = np.linalg.eigh(cov_mat)
    order = np.argsort(eigen_values)[::-1]
    components = eigen_vectors[:, order[:n_com]]
    return np.dot(components.T, centered.T)

In [None]:
class GMM_music:
    """Gaussian mixture model trained with expectation-maximisation.

    ``fit`` writes the learned parameters to ``music_k.npy``,
    ``music_mu.npy``, ``music_sigma.npy`` and ``music_pi.npy`` in the current
    working directory; ``readweights`` restores parameters from such files.

    Attributes set by ``initialize``/``fit``/``readweights``:
      k      number of mixture components
      pi     mixture weights, shape (k,)
      mu     component means, k vectors of length m
      sigma  component covariances, k matrices of shape (m, m)
      gamma  responsibilities P(z_i = j | x_i), shape (n, k)
    """

    # File names used by fit() when persisting the learned parameters
    # (order: k, mu, sigma, pi).
    _WEIGHT_FILES = ('music_k.npy', 'music_mu.npy', 'music_sigma.npy', 'music_pi.npy')

    def __init__(self,k,max_iter=5):
        """Store the number of components ``k`` and the EM iteration count."""
        self.k = k
        self.max_iter = int(max_iter)

    def initialize(self, X):
        """Set starting parameters for data ``X`` of shape ``(n, m)``.

        Weights are uniform, means are ``k`` randomly drawn rows of ``X`` and
        every covariance starts as the covariance of the whole data set.
        """
        self.shape = X.shape
        self.n, self.m = self.shape

        self.pi = np.full(shape=self.k, fill_value=1/self.k)
        # One responsibility per (sample, component) pair, i.e. (n, k) --
        # not the shape of X.
        self.gamma = np.full(shape=(self.n, self.k), fill_value=1/self.k)

        random_row = np.random.randint(low=0, high=self.n, size=self.k)
        self.mu = [X[row_index,:] for row_index in random_row]
        self.sigma = [np.cov(X.T) for _ in range(self.k)]

    def e_step(self, X):
        """Recompute the responsibilities and, from them, the mixture weights."""
        self.gamma = self.predict_proba(X)
        self.pi = self.gamma.mean(axis=0)

    def m_step(self, X):
        """Re-estimate every component's mean and covariance from ``gamma``."""
        for i in range(self.k):
            gamma = self.gamma[:, [i]]
            total_gamma = gamma.sum()
            self.mu[i] = (X * gamma).sum(axis=0)/total_gamma
            self.sigma[i] = np.cov(X.T,
                aweights=(gamma/total_gamma).flatten(),
                bias=True)

    def fit(self, X):
        """Run ``max_iter`` EM iterations on ``X`` and save the parameters."""
        self.initialize(X)
        for _ in range(self.max_iter):
            self.e_step(X)
            self.m_step(X)
        self.saveweights(self.k, self.mu, self.sigma, self.pi, *self._WEIGHT_FILES)

    def saveweights(self,k, mu , sigma, pi,kn,mun,sigman,pin):
        """Save ``k``, ``mu``, ``sigma`` and ``pi`` to the given ``.npy`` paths.

        Raises AssertionError if any path does not end in ``.npy``.
        """
        assert kn[-4:]=='.npy'
        assert mun[-4:]=='.npy'
        assert pin[-4:]=='.npy'
        assert sigman[-4:]=='.npy'
        np.save(kn, k)
        np.save(mun, mu)
        np.save(sigman, sigma)
        np.save(pin, pi)
        return

    def readweights(self,kn, mun,sigman,pin):
        """Load parameters from ``.npy`` files and return ``(k, mu, sigma, pi)``."""
        # np.load yields an array even for a scalar; keep k a plain int so it
        # can safely be used as a loop bound and as an array dimension.
        self.k = int(np.load(kn).item())
        self.mu = np.load(mun)
        self.sigma = np.load(sigman)
        self.pi = np.load(pin)
        return self.k, self.mu, self.sigma, self.pi

    def _weighted_likelihood(self, X):
        """Return ``pi[j] * N(x_i | mu[j], sigma[j])`` as an ``(n, k)`` array."""
        likelihood = np.zeros( (X.shape[0], self.k) )
        for i in range(self.k):
            distribution = multivariate_normal(
                mean=self.mu[i],
                cov=self.sigma[i])
            likelihood[:,i] = distribution.pdf(X)
        return likelihood * self.pi

    def predict_proba(self, X):
        """Return the responsibilities of every component, shape ``(n, k)``.

        Each row is the weighted likelihood normalised to sum to one.
        """
        numerator = self._weighted_likelihood(X)
        denominator = numerator.sum(axis=1)[:, np.newaxis]
        return numerator / denominator

    def predict(self, X):
        """Return the mixture density of every sample, shape ``(n, 1)``.

        Note that this is a likelihood value per sample, not a class label.
        """
        numerator = self._weighted_likelihood(X)
        return numerator.sum(axis=1)[:, np.newaxis]

In [None]:
class GMM_speech:
    """Gaussian mixture model trained with expectation-maximisation.

    ``fit`` writes the learned parameters to ``speech_k.npy``,
    ``speech_mu.npy``, ``speech_sigma.npy`` and ``speech_pi.npy`` in the
    current working directory; ``readweights`` restores parameters from such
    files.

    Attributes set by ``initialize``/``fit``/``readweights``:
      k      number of mixture components
      pi     mixture weights, shape (k,)
      mu     component means, k vectors of length m
      sigma  component covariances, k matrices of shape (m, m)
      gamma  responsibilities P(z_i = j | x_i), shape (n, k)
    """

    # File names used by fit() when persisting the learned parameters
    # (order: k, mu, sigma, pi).
    _WEIGHT_FILES = ('speech_k.npy', 'speech_mu.npy', 'speech_sigma.npy', 'speech_pi.npy')

    def __init__(self,k,max_iter=5):
        """Store the number of components ``k`` and the EM iteration count."""
        self.k = k
        self.max_iter = int(max_iter)

    def initialize(self, X):
        """Set starting parameters for data ``X`` of shape ``(n, m)``.

        Weights are uniform, means are ``k`` randomly drawn rows of ``X`` and
        every covariance starts as the covariance of the whole data set.
        """
        self.shape = X.shape
        self.n, self.m = self.shape

        self.pi = np.full(shape=self.k, fill_value=1/self.k)
        # One responsibility per (sample, component) pair, i.e. (n, k) --
        # not the shape of X.
        self.gamma = np.full(shape=(self.n, self.k), fill_value=1/self.k)

        random_row = np.random.randint(low=0, high=self.n, size=self.k)
        self.mu = [X[row_index,:] for row_index in random_row]
        self.sigma = [np.cov(X.T) for _ in range(self.k)]

    def e_step(self, X):
        """Recompute the responsibilities and, from them, the mixture weights."""
        self.gamma = self.predict_proba(X)
        self.pi = self.gamma.mean(axis=0)

    def m_step(self, X):
        """Re-estimate every component's mean and covariance from ``gamma``."""
        for i in range(self.k):
            gamma = self.gamma[:, [i]]
            total_gamma = gamma.sum()
            self.mu[i] = (X * gamma).sum(axis=0)/total_gamma
            self.sigma[i] = np.cov(X.T,
                aweights=(gamma/total_gamma).flatten(),
                bias=True)

    def fit(self, X):
        """Run ``max_iter`` EM iterations on ``X`` and save the parameters."""
        self.initialize(X)
        for _ in range(self.max_iter):
            self.e_step(X)
            self.m_step(X)
        self.saveweights(self.k, self.mu, self.sigma, self.pi, *self._WEIGHT_FILES)

    def saveweights(self,k, mu , sigma, pi,kn,mun,sigman,pin):
        """Save ``k``, ``mu``, ``sigma`` and ``pi`` to the given ``.npy`` paths.

        Raises AssertionError if any path does not end in ``.npy``.
        """
        assert kn[-4:]=='.npy'
        assert mun[-4:]=='.npy'
        assert pin[-4:]=='.npy'
        assert sigman[-4:]=='.npy'
        np.save(kn, k)
        np.save(mun, mu)
        np.save(sigman, sigma)
        np.save(pin, pi)
        return

    def readweights(self,kn, mun,sigman,pin):
        """Load parameters from ``.npy`` files and return ``(k, mu, sigma, pi)``."""
        # np.load yields an array even for a scalar; keep k a plain int so it
        # can safely be used as a loop bound and as an array dimension.
        self.k = int(np.load(kn).item())
        self.mu = np.load(mun)
        self.sigma = np.load(sigman)
        self.pi = np.load(pin)
        return self.k, self.mu, self.sigma, self.pi

    def _weighted_likelihood(self, X):
        """Return ``pi[j] * N(x_i | mu[j], sigma[j])`` as an ``(n, k)`` array."""
        likelihood = np.zeros( (X.shape[0], self.k) )
        for i in range(self.k):
            distribution = multivariate_normal(
                mean=self.mu[i],
                cov=self.sigma[i])
            likelihood[:,i] = distribution.pdf(X)
        return likelihood * self.pi

    def predict_proba(self, X):
        """Return the responsibilities of every component, shape ``(n, k)``.

        Each row is the weighted likelihood normalised to sum to one. The
        method produces no console output.
        """
        numerator = self._weighted_likelihood(X)
        denominator = numerator.sum(axis=1)[:, np.newaxis]
        return numerator / denominator

    def predict(self, X):
        """Return the mixture density of every sample, shape ``(n, 1)``.

        Note that this is a likelihood value per sample, not a class label.
        """
        numerator = self._weighted_likelihood(X)
        return numerator.sum(axis=1)[:, np.newaxis]

In [None]:
if __name__=="__main__":
    # Constants of the frame-index -> seconds conversion applied below.
    # NOTE(review): presumably the STFT hop length, frame length and audio
    # sample rate used when the spectrograms were produced -- confirm.
    HOP_LENGTH = 512
    FRAME_LENGTH = 1024
    SAMPLE_RATE = 16000

    # 1. Load the spectrograms and detect candidate segments per clip.
    filename,spec = load_spectogram('spectrogram')
    thres,rmse = calc_threshold_and_rmse(spec)
    time_stamp,names = calc_timestamp(thres,rmse,filename)

    # 2. Restore both mixture models from their saved weights. The
    #    constructor arguments are placeholders: readweights overwrites k.
    np.random.seed(42)
    gmm_speech = GMM_speech(k=1, max_iter=20)
    gmm_speech.readweights('GMM_weights/speech_k.npy', 'GMM_weights/speech_mu.npy', 'GMM_weights/speech_sigma.npy', 'GMM_weights/speech_pi.npy')
    np.random.seed(42)
    gmm_music = GMM_music(k=1, max_iter=20)
    gmm_music.readweights('GMM_weights/music_k.npy', 'GMM_weights/music_mu.npy', 'GMM_weights/music_sigma.npy', 'GMM_weights/music_pi.npy')

    # 3. Label every segment (0 -> 'Speech', anything else -> 'Music').
    l=label_timestamp_GMM(gmm_music,gmm_speech,spec,time_stamp)

    # 4. Convert onset/offset from frame indices to seconds, in place and
    #    for all rows at once.
    time_stamp[:,1:3] = ((time_stamp[:,1:3]-1)*HOP_LENGTH + FRAME_LENGTH)/SAMPLE_RATE
    final_time_stamp_GMM=np.hstack([time_stamp,l])

    # Build the label column in one step. Growing it from a bare string left
    # it undefined for zero events and a plain str for exactly one event,
    # both of which broke the DataFrame construction.
    b = np.where(l == 0, 'Speech', 'Music')

    # 5. Assemble the result table and write it to disk.
    result = pd.DataFrame({
        'filename': np.ravel(names),
        'event': np.ravel(b),
        'onset': final_time_stamp_GMM[:,1],
        'offset': final_time_stamp_GMM[:,2],
    })
    print(result)
    result.to_csv('GMM_event_detection.csv',index=False)

  return array(a, dtype, copy=False, order=order)


                filename   event  onset  offset
0                   S009  Speech  0.320   2.528
1                   S009  Speech  3.520   5.600
2                   S009  Speech  6.656   9.280
3           music_noisy8   Music  0.256   2.592
4           music_noisy8   Music  4.128   7.136
..                   ...     ...    ...     ...
96          music_noisy3   Music  3.296   8.736
97   music+speech_noisy5   Music  0.224   2.016
98   music+speech_noisy5   Music  2.400   3.840
99   music+speech_noisy5   Music  4.576   6.336
100  music+speech_noisy5   Music  7.136   8.512

[101 rows x 4 columns]
