In [66]:
import librosa
import os
import numpy as np
import scipy.signal
import random
import math
from scipy.io import loadmat
from IPython.display import Audio
import os
import random,librosa

In [230]:
#pos_x and pos_y are integers from 1 to 9; snr_ratio should be from 0 to 1
def generate_examples(speech_dir,noise_dir,srir_dir,pos_x,pos_y,snr_ratio):
    #load in mono speech
    src_audio, sr = librosa.load(speech_dir, sr=16000, mono=True)
    src_audio /= np.abs(src_audio).max()
    #print('src first',src_audio)
    #convolve speech with srir
    grid_x=pos_x
    grid_y=pos_y
    ch_out_list = []
    sh_names = ["W", "X", "Y", "Z"]
    for sh_str in sh_names:
        #ch_ir_path = os.path.join(srir_dir, sh_str,
        #                      "{}x{:02d}y{:02d}.wav".format(sh_str, grid_x, grid_y))
        ch_ir, sr = librosa.load(os.path.join(srir_dir,sh_str,"{}x{:02d}y{:02d}.wav".format(sh_str, grid_x, grid_y)), sr=16000)
        
        ch_ir_len = ch_ir.shape[0]
        src_len = src_audio.shape[0]
    
        if ch_ir_len > src_len:
            pad_len = ch_ir_len - src_len
            src_audio = np.pad(src_audio, (0, pad_len), mode='constant')
        elif ch_ir_len < src_len:
            pad_len = src_len - ch_ir_len
            ch_ir = np.pad(ch_ir, (0, pad_len), mode='constant')
        #print("ir",np.abs(ch_ir).max())
        ch_out = scipy.signal.fftconvolve(src_audio, ch_ir, mode='full')[:src_len]#this step makes the signal extremely small
        ch_out_list.append(ch_out)

    src_bformat = np.array(ch_out_list)
    #print('src before',src_bformat)   
    #load in bformat noise
    noise_data = None
    while noise_data is None or noise_data.shape[1] < src_bformat.shape[1]:
        noise_data, sr = librosa.load(noise_dir, sr=16000, mono=False)
    
    #align bformat noise and bformat speech
    clip_len = src_bformat.shape[1]
    start_idx = np.random.randint(0, noise_data.shape[1] - clip_len)
    noise_data = noise_data[:,start_idx:start_idx + clip_len]
    
    #designate snr and scale
    snr = 10 * np.log10(np.mean(src_bformat[0,:] ** 2) / np.mean(noise_data[0,:] ** 2))
    snr_target = snr_ratio * 40.0 - 20.0
    alpha = 10.0**((snr_target - snr) / 20.0)#scaling factor
    src_bformat *= alpha
    #print("alpha",alpha)
    #print('src',np.abs(src_bformat).max())
    #print("noise",noise_data)
    #combine the noise+speech
    mix_bformat = src_bformat + noise_data
    
    return mix_bformat,src_bformat,noise_data
    
    

In [231]:
speech_dir='./vctk-p225/p225_002.wav'
noise_dir = './ambiencelondonstreet.wav'
srir_dir='./isophonics/octagon/'
pos_x=0
pos_y=0
snr_ratio=0.8
mix_bformat,src_bformat,noise_data=generate_examples(speech_dir,noise_dir,srir_dir,pos_x,pos_y,snr_ratio)

  complex_result = (np.issubdtype(in1.dtype, complex) or
  np.issubdtype(in2.dtype, complex))


In [233]:
def rotate_90(audio):
    return scipy.signal.hilbert(audio).imag
sr=44100
# Mix to stereo according to https://en.wikipedia.org/wiki/Ambisonic_UHJ_format#UHJ_encoding_and_decoding_equations
# S = 0.9396926*W + 0.1855740*X
# D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y
# Left = (S + D)/2.0
# Right = (S - D)/2.0
S = 0.9396926 * mix_bformat[0] + 0.1855740 * mix_bformat[1]
D = rotate_90(-0.3420201 * mix_bformat[0] + 0.5098604 * mix_bformat[1]) + 0.6554516 * mix_bformat[2]
L = (S + D)/2.0
R = (S - D)/2.0
mix_mono = S
mix_stereo = np.stack([L,R])

import IPython.display as ipd

ipd.Audio(mix_stereo,rate=16000)

# Feature extraction Methods

1. steering matrix
2. beamformer (pseudo-inverse of steering matrix)
3. concatenate s_hat, n_hat, x_w to be the feature matrix

In [1]:
#generate matrix of steering vectors that include azimuth & elevation of certain range
def steer_vector(azis,eles):
    #azi_res is resolution of azimuth angle from -180 to 180
    #ele_res is resolution of elevation angle from -90 to 90
    #theta=np.arange(-180,180,azi_res)
    #phi=np.arange(-90,90,ele_res)
    
    #azis and eles are pairs of chosen directions
    m=azis.shape[0]
    n=eles.shape[0]
    D=np.zeros((4,m*n))#the steering matrix is of size (4,len(pairs)))
  
    for idx, azi in enumerate(azis):
        for idx2, ele in enumerate(eles):            
            d=np.array([1,np.sqrt(3)*np.cos(azi)*np.cos(ele),np.sqrt(3)*np.sin(azi)*np.cos(ele),np.sqrt(3)*np.sin(ele)])
            D[:,idx*n+idx2]=d
    return D

In [4]:
#simple anechoic beamformer is the pseudo inverse of steering matrix.
def beamformer(pair,steer_mat):
    #pair is the index of desired pair of azimuth/elevation, D=(m,n), inv(D)=(n,m)
    u=np.zeros(steer_mat.shape[1])#(1,n)
    u[pair]=1
    beamformer=np.linalg.pinv(steer_mat)*u[:,None]#output=(n,m) should compute it only once
    return beamformer

function that given azi,ele info, output index in the steering matrix, subject to change

In [5]:
#index of the position pair (azi, ele) in the steering matrix
def pairidx(azi,ele):
    return int((azi+180)/30*11+(ele+50)/10)

#pair_idx=pairidx(0,0)
#bf=beamformer(pair_idx,D)

In [20]:
def featurematrix(azi,ele,clip,D):
    #azi and ele are two numbers corresponding to position of targeted speech
    #clip is the synthesized 4-channel audio clip
    #D is the steering matrix, which is constant for all calculations
    tgt_idx=pairidx(azi,ele)
    clip_w=clip[0,:]
    bf=beamformer(tgt_idx,D)
    bf_tgt=bf[tgt_idx,:]#1 by 4 vector
    #compute stft of 4 channels of audioclip
    t,f,x_sp_w=signal.stft(clip[0,:], fs=16e3, window='bohman', nperseg=1024, noverlap=None)#also xw
    t,f,x_sp_x=signal.stft(clip[1,:], fs=16e3, window='bohman', nperseg=1024, noverlap=None)
    t,f,x_sp_y=signal.stft(clip[2,:], fs=16e3, window='bohman', nperseg=1024, noverlap=None)
    t,f,x_sp_z=signal.stft(clip[3,:], fs=16e3, window='bohman', nperseg=1024, noverlap=None)
    x_sp=np.stack((x_sp_w,x_sp_x,x_sp_y,x_sp_z), axis=2)#dimension should be (#time frame, #freq bin,4)
    #s_hat should be (#time frames, #frenquency bins)
    s_hat=np.abs(np.sum(bf_tgt[None,None,:]*x_sp,axis=2))# this part remains a question, broadcasting not equivalent to matrix multiplication
    #n_hat should be (#time frames, #frequency bins, #however many other directions we count as distraction)
    #problematic!!
    #dis_idx=random.randint(0,D.shape[0]-1)#for now just pick a random direction
    #bf_dis=bf[dis_idx,:]
    #n_hat=np.sum(bf_dis[None,None,:]*x_sp,axis=2)
    #the final concatenated feature (#time frames, 3*#frequency bins)
    feature=np.stack((x_sp_w,s_hat),axis=2)
    return feature
    

testing part

In [12]:
import numpy as np
a=np.ones((2,3,4))*0.5
b=np.ones((2,3,4))*(-0.5)
print(np.sum((a*b),axis=2).shape)

(2, 3)


In [17]:
import random
random.randint(0,1)

0

# Ground Truth Masks Generation

1. compute mask from speech and noise signals in w channel
2. obtain multichannel weiner filter(MWF) using GEVD approach to replace common mask 

In [218]:
#input src and noise should be only in the W channel
def compute_masks(src, noise):
    #assuming each audio clip is sampled at 16kHz,compute the STFT
    #with a sinusoidal window of 1024 samples and 50% overlap.
    #window=signal.get_window('bohman',1024)
    t,f,sw=scipy.signal.stft(src, fs=16e3, window='bohman', nperseg=1024, noverlap=512)#need to check dimensions of these
    tn,fn,nw=scipy.signal.stft(noise, fs=16e3, window='bohman', nperseg=1024, noverlap=512)
    Ms=sw*np.conj(sw)/(sw*np.conj(sw)+nw*np.conj(nw))#mask should be (#time frame, #frequency bin) is it point-wise multiplication?
    Ms=np.real(Ms)
    Mn=1-Ms
    return Ms,Mn

In [179]:
print(src_bformat.shape,noise_data.shape)

(4, 178988) (4, 178988)


In [180]:
f,t,sw=scipy.signal.stft(src_bformat[0,:], fs=16e3, window='bohman', nperseg=1024, noverlap=None)

In [181]:
sw.shape

(513, 351)

In [234]:
Ms,Mn=compute_masks(src_bformat[0,:],noise_data[0,:])

In [235]:
Ms

array([[1.50236664e-05, 5.37447506e-04, 5.79673578e-06, ...,
        7.76040947e-05, 1.14668873e-03, 1.95079251e-04],
       [1.84361431e-05, 1.49252506e-06, 1.41977647e-04, ...,
        8.04773894e-03, 8.79944136e-04, 4.77376183e-04],
       [3.08390926e-05, 1.34886148e-03, 1.13456944e-03, ...,
        7.72094156e-01, 1.64676280e-02, 9.63333630e-03],
       ...,
       [5.57061257e-13, 4.42247958e-06, 1.55328006e-07, ...,
        1.79374943e-07, 2.92952803e-01, 2.92568664e-01],
       [5.77934772e-13, 4.27057931e-07, 1.43561556e-07, ...,
        1.64962673e-07, 3.18756512e-01, 2.92460562e-01],
       [9.47545308e-14, 5.27734851e-07, 9.88979703e-09, ...,
        1.02100550e-07, 2.36167914e-01, 2.92365498e-01]])

In [203]:
def compute_masks_librosa(src, noise):
    #assuming each audio clip is sampled at 16kHz,compute the STFT
    #with a sinusoidal window of 1024 samples and 50% overlap.
    #window=signal.get_window('bohman',1024)
    sw=librosa.core.stft(src,n_fft=1024, hop_length=512,window='hann')#need to check dimensions of these
    nw=librosa.core.stft(noise, n_fft=1024, hop_length=512,window='hann')
    Ms=sw*np.conj(sw)/(sw*np.conj(sw)+nw*np.conj(nw))#mask should be (#time frame, #frequency bin) is it point-wise multiplication?
    Ms=np.real(Ms)
    Mn=1-Ms
    return Ms,Mn

In [204]:
Ms,Mn=compute_masks_librosa(src_bformat[0,:],noise_data[0,:])
print(Ms)

[[3.2365888e-06 2.1090667e-05 4.0717001e-04 ... 2.7594350e-03
  7.5720280e-01 6.3245170e-02]
 [8.4632497e-09 2.5507701e-05 3.4195718e-03 ... 7.1195677e-02
  1.6962573e-01 2.8020290e-01]
 [2.4190032e-07 6.6277971e-07 9.7751543e-03 ... 4.3780887e-01
  6.9587171e-02 9.1780372e-02]
 ...
 [1.7648667e-11 1.9298669e-12 2.8927924e-10 ... 1.3341936e-10
  4.3249446e-11 3.0464679e-03]
 [2.0792094e-11 3.3492827e-12 1.0398013e-11 ... 3.9973849e-10
  5.7614108e-10 7.0487699e-03]
 [9.6829871e-12 6.6110374e-11 4.3140418e-11 ... 7.0781581e-10
  1.4522761e-10 9.8854142e-01]]


# Pass MWF back to mixture and reconstruct desired signals

In [221]:
#compute speech s_hat from mask, then covariance matrix PHI_ss/PHI_nn from s_hat, then PHI_ss-r1/PHI_nn-r1, then wGEVD

#input should be predicted mask and mixture signal, output is the isolated speech
def get_GEVD(Mask_s,mix_sig):
    
    #Mask_s,Mask_n, mix are all of size (#time, #freq,4)
    #output MWF (freq,4), speech and noise ()
    Mask_n=1-Mask_s
    t,f,mix_w=scipy.signal.stft(mix_sig[0,:], fs=16e3, window='bohman', nperseg=1024, noverlap=512)
    t,f,mix_x=scipy.signal.stft(mix_sig[1,:], fs=16e3, window='bohman', nperseg=1024, noverlap=512)
    t,f,mix_y=scipy.signal.stft(mix_sig[2,:], fs=16e3, window='bohman', nperseg=1024, noverlap=512)
    t,f,mix_z=scipy.signal.stft(mix_sig[3,:], fs=16e3, window='bohman', nperseg=1024, noverlap=512)
    mix=np.stack((mix_w,mix_x,mix_y,mix_z), axis=2)
    
    s_bar=Mask_s[:,:,None]*mix#pointwise multiplication (#time,#freq,4)
    T,F=s_bar.shape[0:2]
    phi_ss=1/T*np.sum(s_bar[:,:,np.newaxis,:]*s_bar[:,:,:,np.newaxis],axis=0)#should be of (#freq,4,4)
   
    sn_bar=Mask_n[:,:,None]*mix
    Tn,Fn=sn_bar.shape[0:2]
    phi_nn=1/Tn*np.sum(sn_bar[:,:,np.newaxis,:]*sn_bar[:,:,:,np.newaxis],axis=0)#(#freq,4,4)
   
    #rank-1 approximation
    u1=np.zeros((phi_nn.shape[1],))
    u1[0]=1
    phi_interm=np.zeros((F,4,4))
    phi_ss_r1=np.zeros((F,4,4))
    wGEVD=np.zeros((F,4))
    speech_sp=np.zeros((T,F))
    for i in range(phi_nn.shape[0]):
        phi_interm[i,:,:]=np.linalg.lstsq(phi_nn[i,:,:],phi_ss[i,:,:])[0]#(#freq,4,4)
        u,s,v=np.linalg.svd(phi_interm[i,:,:], full_matrices=False)
        eig=np.argmax(s)
        phi_ss_r1[i,:,:]=s[eig] * np.outer(u[:,eig], v[eig,:])
        wGEVD[i,:]=np.sum(np.linalg.lstsq(phi_ss_r1[i,:,:]+phi_nn[i,:,:],phi_ss_r1[i,:,:])[0]*u1[:,None],axis=1)#(#freq,4)
    #get wGEVD
    speech_sp=np.sum(wGEVD[None,:,:]*mix[:,:,:],axis=2)
    t,speech_sig=scipy.signal.istft(speech_sp,fs=16e3,window='hann', nperseg=1024, noverlap=512)
    return wGEVD,speech_sig




In [236]:
wGEVD,speech=get_GEVD(Ms,mix_bformat)
speech=speech/np.abs(speech).max()
ipd.Audio(speech,rate=16000)



In [209]:
def get_GEVD_librosa(Mask_s,mix_sig):
    
    #Mask_s,Mask_n, mix are all of size (#time, #freq,4)
    #output MWF (freq,4), speech and noise ()
    Mask_n=1-Mask_s
    mix_w=librosa.core.stft(mix_sig[0,:],n_fft=1024, hop_length=512,window='hann')
    mix_x=librosa.core.stft(mix_sig[1,:], n_fft=1024, hop_length=512,window='hann')
    mix_y=librosa.core.stft(mix_sig[2,:], n_fft=1024, hop_length=512,window='hann')
    mix_z=librosa.core.stft(mix_sig[3,:], n_fft=1024, hop_length=512,window='hann')
    mix=np.stack((mix_w,mix_x,mix_y,mix_z), axis=2)
    
    s_bar=Mask_s[:,:,None]*mix#pointwise multiplication (#time,#freq,4)
    T,F=s_bar.shape[0:2]
    phi_ss=1/T*np.sum(s_bar[:,:,np.newaxis,:]*s_bar[:,:,:,np.newaxis],axis=0)#should be of (#freq,4,4)
   
    sn_bar=Mask_n[:,:,None]*mix
    Tn,Fn=sn_bar.shape[0:2]
    phi_nn=1/Tn*np.sum(sn_bar[:,:,np.newaxis,:]*sn_bar[:,:,:,np.newaxis],axis=0)#(#freq,4,4)
   
    #rank-1 approximation
    u1=np.zeros((phi_nn.shape[1],))
    u1[0]=1
    phi_interm=np.zeros((F,4,4))
    phi_ss_r1=np.zeros((F,4,4))
    wGEVD=np.zeros((F,4))
    speech_sp=np.zeros((T,F))
    for i in range(phi_nn.shape[0]):
        phi_interm[i,:,:]=np.linalg.lstsq(phi_nn[i,:,:],phi_ss[i,:,:])[0]#(#freq,4,4)
        u,s,v=np.linalg.svd(phi_interm[i,:,:], full_matrices=False)
        eig=np.argmax(s)
        phi_ss_r1[i,:,:]=s[eig] * np.outer(u[:,eig], v[eig,:])
        wGEVD[i,:]=np.sum(np.linalg.lstsq(phi_ss_r1[i,:,:]+phi_nn[i,:,:],phi_ss_r1[i,:,:])[0]*u1[:,None],axis=1)#(#freq,4)
    #get wGEVD
    speech_sp=np.sum(wGEVD[None,:,:]*mix[:,:,:],axis=2)
    speech_sig=librosa.core.istft(speech_sp, hop_length=512,window='hann')
    return wGEVD,speech_sig



In [237]:
wGEVD,speech=get_GEVD_librosa(Ms,mix_bformat)
speech=speech/np.abs(speech).max()
ipd.Audio(speech,rate=sr)

ValueError: operands could not be broadcast together with shapes (513,128,1) (513,127,4) 

In [238]:
print(Ms.shape,mix_bformat.shape)


(513, 128) (4, 64939)
