In [1]:
import numpy as np
import librosa 
import glob
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from scipy.stats import multivariate_normal

In [2]:
def readAudio(filename, sr=16000):
    """Load an audio file with librosa, resampled to `sr` Hz.

    Parameters
    ----------
    filename : str
        Path of the audio file, handed straight to ``librosa.load``.
    sr : int, optional
        Target sampling rate in Hz. Defaults to 16000, the rate this
        function previously hard-coded. Callers in this notebook pass the
        rate as a second positional argument, so it has to be accepted.

    Returns
    -------
    tuple
        ``(x, sr)`` exactly as returned by ``librosa.load``: the samples
        and the sampling rate they were loaded at.
    """
    x, sr = librosa.load(filename, sr=sr)
    return x, sr

#calculate spectrogram
def calc_spec(x, n_fft=1024, hop_length=512, win_length=1024):
    """Compute a log-power (dB) spectrogram of the signal `x`.

    Parameters
    ----------
    x : array-like
        Audio samples, passed to ``librosa.stft``.
    n_fft, hop_length, win_length : int, optional
        STFT frame size, hop and window length. The defaults (1024, 512,
        1024) are the values this function previously hard-coded; per the
        librosa documentation the result then has ``1 + n_fft // 2`` = 513
        frequency rows.

    Returns
    -------
    The squared STFT magnitude converted with ``librosa.power_to_db`` using
    ``ref=np.max``, i.e. dB relative to the loudest bin of this signal.
    """
    # np.clongdouble is the portable spelling of the extended-precision
    # complex type: it is the same type as np.complex256 where that exists,
    # and still resolves on platforms where np.complex256 is not defined.
    stft = librosa.stft(
        x,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window='hann',
        dtype=np.clongdouble,
    )
    magnitude = np.abs(stft)
    return librosa.power_to_db(magnitude**2, ref=np.max)

def saveSpectrogram(X, outfilename):
    """Store the array `X` at `outfilename` in NumPy's ``.npy`` format.

    The file name has to end in ``.npy``; any other ending trips the
    assertion before anything is written.
    """
    extension = outfilename[-4:]
    assert extension == '.npy'  # outfilename extension should be .npy
    np.save(outfilename, X)

def readSpectrogram(infilename):
    """Return the array stored in `infilename`, read with ``np.load``."""
    return np.load(infilename)


In [11]:
def _load_and_concat(folder, ids):
  """Load `<folder>/<id>.wav` for every id and join the samples end to end."""
  clips = []
  for i in ids:
    name = folder + '/' + i + '.wav'
    print(name)
    # readAudio resamples to 16 kHz on its own, so only the path is passed;
    # passing a second positional argument does not match its one-argument
    # signature.
    audio, _ = readAudio(name)
    clips.append(audio)
  # One concatenation at the end instead of re-copying the growing signal
  # after every file.
  return np.hstack(clips)


def read_and_combine_speech_music_spec():
  """Build spectrograms and frame labels from the sample recordings.

  Reads ``music_samples/<i>.wav`` (i = 1..18 without 13) and
  ``speech_samples/<i>.wav`` (i = 1..30 without 13), concatenates each group
  into one long signal and runs ``calc_spec`` on it.

  Returns
  -------
  speech_spec : spectrogram of the concatenated speech recordings.
  music_spec : spectrogram of the concatenated music recordings.
  label : 1-D array with one entry per spectrogram column of `combine`;
      0 for speech columns, 1 for music columns.
  combine : `speech_spec` and `music_spec` stacked side by side, speech first.
  """
  # File number 13 is absent from both groups, as in the original id lists.
  music_ids = [str(n) for n in range(1, 19) if n != 13]
  speech_ids = [str(n) for n in range(1, 31) if n != 13]

  music_data = _load_and_concat('music_samples', music_ids)
  speech_data = _load_and_concat('speech_samples', speech_ids)

  speech_spec = calc_spec(speech_data)
  label_speech = np.zeros((speech_spec.shape[1],))
  music_spec = calc_spec(music_data)
  label_music = np.ones((music_spec.shape[1],))

  label = np.hstack([label_speech, label_music])
  combine = np.hstack([speech_spec, music_spec])
  return speech_spec, music_spec, label, combine

In [4]:
import random as rd
class Kmeans:
    """Small K-means clusterer used to separate speech frames from music frames.

    `X` holds one sample per row and `lab` one ground-truth label per row
    (0 = speech, 1 = music). The labels are not used for clustering; they
    only decide which of the first two clusters is reported as speech and
    which as music (`labb`). The label mapping and `predict` therefore
    require at least two clusters.
    """

    def __init__(self,X,lab,K):
        self.X=X
        self.Output={}
        # Centroids are stored column-wise: shape (n_features, K) once initialised.
        self.Centroids=np.array([]).reshape(self.X.shape[1],0)
        self.K=K
        self.m=self.X.shape[0]
        self.lab=lab
        # labb[c] is the class (0 speech / 1 music) reported for cluster c.
        self.labb=[0,1]

    def kmeanspp(self,X,K):
        """Pick K rows of `X` at random as the initial centroids.

        Despite the name this is plain random seeding, not k-means++.
        The centroid matrix is rebuilt from scratch on every call, so
        fitting twice does not leave stale columns behind, and the rows are
        drawn without replacement whenever `X` has at least K rows, so two
        centroids never start from the same sample index.

        Returns the new ``(n_features, K)`` centroid matrix (also stored in
        ``self.Centroids``).
        """
        n_samples = X.shape[0]
        if K <= n_samples:
            picks = rd.sample(range(n_samples), K)
        else:
            # Fewer samples than clusters: repeats are unavoidable.
            picks = [rd.randrange(n_samples) for _ in range(K)]
        seeds = np.asarray(X)[picks].T
        # Force a floating dtype so later mean updates are not truncated.
        self.Centroids = seeds.astype(np.result_type(seeds.dtype, np.float64))
        return self.Centroids

    def _distances(self, X):
        """Squared Euclidean distance from each row of `X` to each of the K centroids."""
        columns = [np.sum((X - self.Centroids[:, k]) ** 2, axis=1)
                   for k in range(self.K)]
        # The empty float block keeps the result at least float64.
        return np.column_stack([np.empty((X.shape[0], 0))] + columns)

    def _plot_clusters(self, Y):
        """Scatter every cluster and the centroids using two band averages.

        The band limits (1:250 and 250:513) are fixed and match the
        513-row spectrograms this notebook works with.
        """
        plt.figure()
        color=['red','blue','green','cyan','magenta']
        for k in range(self.K):
            plt.scatter(np.average(Y[k+1][:,1:250],axis=1),
                        np.average(Y[k+1][:,250:513],axis=1),
                        c=color[k % len(color)],  # cycle so K > 5 still plots
                        label='cluster%d' % (k+1))
        plt.scatter(np.average(self.Centroids[1:250,:],axis=0),
                    np.average(self.Centroids[250:513,:],axis=0),
                    s=300,c='yellow',label='Centroids')
        plt.title('Clusters ')
        plt.xlabel('Average spectogram feature values (1:250)')
        plt.ylabel('Average spectogram feature values (250:513)')
        plt.legend()
        plt.show()

    def fit(self,n_iter,plot=True):
        """Run `n_iter` K-means iterations on the training data.

        Every iteration assigns each sample to its nearest centroid, moves
        the centroids to the cluster means, prints the cluster shapes and
        the mean ground-truth label of clusters 1 and 2, optionally plots
        the clusters, and refreshes `labb` (the cluster with the higher
        mean label is reported as music). Afterwards the centroids and
        `labb` are written to 'centroid.npy' and 'labb.npy'.

        Parameters
        ----------
        n_iter : int
            Number of iterations.
        plot : bool, optional
            Draw a scatter plot after every iteration (default True, the
            previous behaviour). Pass False to train without matplotlib.
        """
        self.Centroids=self.kmeanspp(self.X,self.K)
        true_labels = np.asarray(self.lab)[:self.m]
        for _ in range(n_iter):
            assignment = np.argmin(self._distances(self.X),axis=1)+1
            Y={}
            la={}
            for k in range(self.K):
                # A boolean mask gathers a cluster in one step; appending
                # sample by sample re-copies the whole cluster every time.
                members = assignment == k+1
                Y[k+1]=self.X[members]
                la[k+1]=true_labels[members]
                print(Y[k+1].shape)

            for k in range(self.K):
                # An empty cluster keeps its previous centroid instead of
                # becoming NaN.
                if Y[k+1].shape[0] > 0:
                    self.Centroids[:,k] = np.mean(Y[k+1],axis=0)
            print('cluster1:',np.average(la[1]),'cluster2:',np.average(la[2]),'speech:0, music:1')
            if plot:
                self._plot_clusters(Y)
            if np.average(la[1])>np.average(la[2]):
                self.labb[0]=1
                self.labb[1]=0
            else:
                self.labb[0]=0
                self.labb[1]=1
            self.Output=Y
            print(self.labb)
        self.saveweights(self.Centroids, self.labb, 'centroid.npy', 'labb.npy')

    def saveweights(self,centroid, label ,centroid_fn, label_fn):
        """Save `centroid` and `label` with ``np.save``; both names must end in '.npy'."""
        assert centroid_fn[-4:]=='.npy'
        assert label_fn[-4:]=='.npy'
        np.save(centroid_fn, centroid)
        np.save(label_fn, label)
        return

    def readweights(self,centroid_fn, label_fn):
        """Load centroids and the cluster-to-class mapping saved by `saveweights`.

        Prints the mapping and returns ``(Centroids, labb)``.
        """
        self.Centroids = np.load(centroid_fn)
        a = np.load(label_fn)
        self.labb[0] = a[0]
        self.labb[1] = a[1]
        print(self.labb)
        return self.Centroids, self.labb

    def predict(self,X):
        """Assign every row of `X` to its nearest centroid.

        Returns
        -------
        Y : dict
            Maps cluster number (1..K) to the rows of `X` assigned to it.
        centroids : the centroid matrix transposed, one centroid per row.
        distances : ``(n_rows, K)`` squared distances to the centroids.
        ll : float array with the class per row (0 speech / 1 music),
            decided between the first two centroids via `labb`; a tie goes
            to ``labb[1]``.
        """
        distances = self._distances(X)
        ll = np.where(distances[:,0] < distances[:,1],
                      self.labb[0], self.labb[1]).astype(float)
        assignment = np.argmin(distances,axis=1)+1
        # Cluster width follows X instead of a fixed 513 features.
        Y = {k+1: X[assignment == k+1] for k in range(self.K)}
        return Y,self.Centroids.T,distances,ll

In [5]:
def train_Kmeans(combine,label,K,n_iter,speech_spec,music_spec):
  """Fit a Kmeans model and derive the two voting weights from its errors.

  The model is trained on the columns of `combine` (transposed to one
  sample per row) and then asked to label the speech and the music
  spectrograms separately. Each weight is 1 plus a scaled absolute gap
  between the mean predicted label and the true label of that class
  (0 for speech, scale 2; 1 for music, scale 3). Both weights are saved to
  'wt_speech.npy' and 'wt_music.npy'.

  Returns ``(weight_speech, weight_music, kmeans)``.
  """
  kmeans = Kmeans(combine.T, label, K)
  kmeans.fit(n_iter)

  # predict() returns (clusters, centroids, distances, labels); only the
  # per-frame labels are needed here.
  speech_pred = kmeans.predict(speech_spec.T)[3]
  music_pred = kmeans.predict(music_spec.T)[3]

  speech_gap = np.average(speech_pred) - 0
  music_gap = np.average(music_pred) - 1
  weight_speech = 1 + abs(speech_gap * 2)
  weight_music = 1 + abs(music_gap * 3)

  for path, value in (('wt_speech.npy', weight_speech), ('wt_music.npy', weight_music)):
    np.save(path, value)
  return weight_speech, weight_music, kmeans

In [6]:
def aggregate_label_kmeans(x,weight_speech,weight_music):
  """Reduce per-frame labels to a single label by weighted vote.

  Entries of `x` equal to 0 count as speech votes, entries equal to 1 as
  music votes; anything else is ignored. Returns 0 when the weighted speech
  count is strictly larger than the weighted music count, otherwise 1
  (so a tie is reported as 1).
  """
  votes = list(x)
  speech_votes = sum(1 for v in votes if v == 0)
  music_votes = sum(1 for v in votes if v == 1)
  speech_wins = speech_votes * weight_speech > music_votes * weight_music
  return 0 if speech_wins else 1

In [7]:
def label_timestamp_kmeans(spec,time_stamp,wt_speech,wt_music,kmeans):
  """Label every time-stamped segment as 0 or 1 using the K-means model.

  Each row of `time_stamp` is read as (index into `spec`, first column,
  end column). The selected columns of that spectrogram are classified
  frame by frame with ``kmeans.predict`` and reduced to one label with
  ``aggregate_label_kmeans``.

  Returns an integer array of shape ``(len(time_stamp), 1)``.
  """
  l = np.full((time_stamp.shape[0], 1), 0)
  for row, stamp in enumerate(time_stamp):
    segment = spec[int(stamp[0])][:, int(stamp[1]):int(stamp[2])]
    # Fourth element of predict()'s result holds the per-frame labels.
    frame_labels = kmeans.predict(segment.T)[3]
    verdict = aggregate_label_kmeans(frame_labels, wt_speech, wt_music)
    l[row][0] = 0 if verdict == 0 else 1
  return l

In [12]:
if __name__ == "__main__":
  # Disabled alternative ways of obtaining the data, kept for reference:
  #audio_data,spec,sr,music_data, speech_data, filename, onset, offset, label  = readDir('wav')
  #filename, spec = load_spectogram('/content/EE603-Machine-Learning-for-Signal-Processing/project/wav_2/EE603-Machine-Learning-for-Signal-Processing/project/wav_2/spectrogram')
  #thres,rmse = calc_threshold_and_rmse(spec)
  #time_stamp = calc_timestamp(thres,rmse)
  #speech_spec, music_spec, label, combine = combine_speech_music_spec(speech_data,music_data)

  # Build the training spectrograms from the sample folders.
  speech_spec, music_spec, label, combine = read_and_combine_speech_music_spec()

  # ---- K-means: 2 clusters, 30 iterations ----
  wt_speech, wt_music, kmeans = train_Kmeans(
      combine,
      label,
      K=2,
      n_iter=30,
      speech_spec=speech_spec,
      music_spec=music_spec,
  )

  # Disabled: reuse saved weights instead of training, then label the
  # detected segments and convert frame indices to seconds.
  #kmeans=Kmeans(combine.T,label,2)
  #kmeans.readweights('centroid.npy','labb.npy')
  #wt_speech=np.load('wt_speech.npy');
  #wt_music=np.load('wt_music.npy');
  #l=label_timestamp_kmeans(spec,time_stamp,wt_speech,wt_music,kmeans)
  #for i  in range(0,time_stamp.shape[0]):
  #  time_stamp[i,1]= ((time_stamp[i,1]-1)*512 + 1024)/16000
  #  time_stamp[i,2]= ((time_stamp[i,2]-1)*512 + 1024)/16000
  #final_time_stamp_kmeans=np.hstack([time_stamp,l])
  #df = pd.read_csv('labels.csv')
  #print(final_time_stamp_kmeans,'orig', df)

music_samples/1.wav
music_samples/2.wav
music_samples/3.wav
music_samples/4.wav
music_samples/5.wav
music_samples/6.wav
music_samples/7.wav
music_samples/8.wav
music_samples/9.wav
music_samples/10.wav
music_samples/11.wav
music_samples/12.wav
music_samples/14.wav
music_samples/15.wav
music_samples/16.wav
music_samples/17.wav
music_samples/18.wav
speech_samples/1.wav
speech_samples/2.wav
speech_samples/3.wav
speech_samples/4.wav
speech_samples/5.wav
speech_samples/6.wav
speech_samples/7.wav
speech_samples/8.wav
speech_samples/9.wav
speech_samples/10.wav
speech_samples/11.wav
speech_samples/12.wav
speech_samples/14.wav
speech_samples/15.wav
speech_samples/16.wav
speech_samples/17.wav
speech_samples/18.wav
speech_samples/19.wav
speech_samples/20.wav
speech_samples/21.wav
speech_samples/22.wav
speech_samples/23.wav
speech_samples/24.wav
speech_samples/25.wav
speech_samples/26.wav
speech_samples/27.wav
speech_samples/28.wav
speech_samples/29.wav
speech_samples/30.wav


KeyboardInterrupt: 