In [74]:
import keras
from keras.layers import Input, LSTM, Dense, TimeDistributed
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.optimizers import Nadam
from keras.regularizers import l2
import pescador

Using TensorFlow backend.


ModuleNotFoundError: No module named 'pescador'

In [None]:
# Training hyperparams
wd = 1e-4
lr = 1e-3
patience= 10
hidden_units = 512
num_frames = 25
fft_size = 1024
hop_size = 512
steps_per_epoch = 1024
num_epochs = 512
valid_steps = 1024

In [None]:
train_gen = None
valid_gen = None

In [None]:
def create_mask_model(num_frames, fft_size, hidden_units,
                      weight_decay=1e-4, dropout=0.5):
    feature_size = fft_size // 2 + 1
    inp = Input((num_frames, feature_size))
    net = LSTM(hidden_units, activation='tanh',
               return_sequences=True,
               dropout=dropout,
               recurrent_dropout=dropout,
               kernel_regularizer=l2(weight_decay),
               recurrent_regularizer=l2(weight_decay),
               bias_regularizer=l2(weight_decay))(inp)
    out = TimeDistributed(Dense(feature_size. activation='sigmoid',
                                kernel_regularizer=l2(weight_decay),
                                bias_regularizer=l2(weight_decay)))(net)

    model = Model(inputs=inp, outputs=out)
    
    return model

model = create_mask_model(num_frames, fft_size, hidden_units, weight_decay=wd)

In [None]:
model.compile(loss='mse', optimizer=Nadam(lr=lr), metrics=['accuracy'])
model_filepath = 'model.h5'
callbacks = []
callbacks.append(EarlyStopping(patience=patience))
callbacks.append(ModelCheckpoint(model_filepath, save_best_only=True))

model.fit_generator(train_gen, steps_per_epoch=steps_per_epoch,
                    epochs=num_epochs, callbacks=callbacks,
                    validation_data=valid_gen,
                    validation_steps=valid_steps)

# feature extraction 

In [26]:
import numpy as np
np.arange(-90,90,10)
a=np.array([1,2,3,4])
#a.reshape((4,1))
D=np.zeros((4,4))
D[:,1]=a
Dm=np.matrix(D)
print(np.matmul(D,D))


[[0. 2. 0. 0.]
 [0. 4. 0. 0.]
 [0. 6. 0. 0.]
 [0. 8. 0. 0.]]


# HOA steering matrix
steering matrix dimension: 1. choose azi-ele pairs? 2. choose a range of azi-ele and compute a huge matrix?

In [24]:
#generate matrix of steering vectors that include azimuth & elevation of certain range
def steer_vector(azis,eles):
    #azi_res is resolution of azimuth angle from -180 to 180
    #ele_res is resolution of elevation angle from -90 to 90
    #theta=np.arange(-180,180,azi_res)
    #phi=np.arange(-90,90,ele_res)
    
    #azis and eles are pairs of chosen directions
    m=azis.shape[0]
    n=eles.shape[0]
    D=np.zeros((4,m*n))#the steering matrix is of size (4,len(pairs)))
  
    for idx, azi in enumerate(azis):
        for idx2, ele in enumerate(eles):            
            d=np.array([1,np.sqrt(3)*np.cos(azi)*np.cos(ele),np.sqrt(3)*np.sin(azi)*np.cos(ele),np.sqrt(3)*np.sin(ele)])
            D[:,idx*n+idx2]=d
    return D
        

In [28]:
eles=np.linspace(-50,50,11)
azis=np.linspace(-180,180,13)
D=steer_vector(azis,eles)
print(D.shape)

(4, 143)


In [56]:
#simple anechoic beamformer is the pseudo inverse of steering matrix.
#D=steer_vector(azis,eles)
def beamformer(pair,steer_mat):
    #pair is the index of desired pair of azimuth/elevation, D=(m,n), inv(D)=(n,m)
    u=np.zeros(steer_mat.shape[1])#(1,n)
    u[pair]=1
    beamformer=np.linalg.pinv(steer_mat)*u[:,None]#output=(n,m) should compute it only once
    return beamformer

In [71]:
#index of the position pair (azi, ele) in the steering matrix
def pairidx(azi,ele):
    return int((azi+180)/30*11+(ele+50)/10)

pair_idx=pairidx(0,0)
bf=beamformer(pair_idx,D)
print(bf[71,:])#the only non-zero row of this beamformer, corresponding to (0,0) position

[ 6.92250461e-03  1.65436255e-02 -6.05547736e-18  4.03148705e-18]


# GEVD MWF
ground truth masks

In [None]:
 from scipy import signal

In [None]:
#input src and noise should be only in the W channel
def compute_masks(src, noise):
    #assuming each audio clip is sampled at 16kHz,compute the STFT
    #with a sinusoidal window of 1024 samples and 50% overlap.
    #window=signal.get_window('bohman',1024)
    sw=signal.stft(src, fs=16e3, window=('bohman',1024), nperseg=1024, noverlap=None)#need to check dimensions of these
    nw=signal.stft(noise, fs=16e3, window=('bohman',1024), nperseg=1024, noverlap=None)
    Ms=sw**2/(sw**2+nw**2)
    Mn=1-Ms
    return Ms,Mn
      

In [None]:
#compute speech s_hat from mask, then covariance matrix PHI_ss/PHI_nn from s_hat, then PHI_ss-r1/PHI_nn-r1, then wGEVD

def get_GEVD(Mask_s,Mask_n,mix,speech,noise):
    s_hat=np.matmul(Mask_s,mix)
    T=s_hat.shape[0]
    phi_ss=np.matrix(1/T*np.sum(s_hat*s_hat.conjugate().transpose(),axis=0))
    sn_hat=np.matmul(Mask_n,mix)
    Tn=sn_hat.shape[0]
    phi_nn=np.matrix(1/T*np.sum(sn_hat*sn_hat.conjugate().transpose(),axis=0))
    #rank-1 approximation
    u,s,v=np.linalg.svd(phi_ss, full_matrices=False)
    phi_ss_r1=s[0] * np.outer(u.T[0], v[0])
    un,sn,vn=np.linalg.svd(phi_nn, full_matrices=False)
    phi_nn_r1=sn[0]* np.outer(un.T[0],vn[0])
    #get wGEVD
    u1=np.zeros((phi_nn_r1.shape[0]))
    u1[0]=1
    wGEVD=np.matmul(inv(phi_ss_r1+phi_nn_r1),phi_ss_r1)*u1#what is u1 in this case


# synthesize an example file

In [79]:
import librosa
import os
import numpy as np
import scipy.signal
import random
import math
from scipy.io import loadmat
from IPython.display import Audio

In [80]:
import os
import random,librosa
# Pick a random speaker and a random example
speaker_dir = './vctk-p225/'
speech_path = os.path.join(speaker_dir, random.choice(os.listdir(speaker_dir)))
src_audio, sr = librosa.load(speech_path, sr=44100, mono=True)
src_audio /= src_audio.max()

In [90]:
speech_path

'./vctk-p225/p225_003.wav'

In [82]:
grid_x=0
grid_y=3
srir_dir = './isophonics/greathall'
ch_out_list = []
sh_names = ["W", "X", "Y", "Z"]
for sh_str in sh_names:
    ch_ir_path = os.path.join(srir_dir, sh_str,
                              "{}x{:02d}y{:02d}.wav".format(sh_str, grid_x, grid_y))
    ch_ir, sr = librosa.load(ch_ir_path, sr=44100)
    
    ch_ir_len = ch_ir.shape[0]
    src_len = src_audio.shape[0]
    
    if ch_ir_len > src_len:
        pad_len = ch_ir_len - src_len
        src_audio = np.pad(src_audio, (0, pad_len), mode='constant')
    elif ch_ir_len < src_len:
        pad_len = src_len - ch_ir_len
        ch_ir = np.pad(ch_ir, (0, pad_len), mode='constant')
        
    ch_out = scipy.signal.fftconvolve(src_audio, ch_ir, mode='full')[:src_len]
    ch_out_list.append(ch_out)

src_bformat = np.array(ch_out_list)



  complex_result = (np.issubdtype(in1.dtype, complex) or
  np.issubdtype(in2.dtype, complex))


In [126]:
ch_ir_path

'./isophonics/greathall/Z/Zx00y03.wav'

In [83]:
src_bformat.shape


(4, 344399)

In [107]:
# Randomly sample noise
noise_path = './ambiencelondonstreet.wav'
noise_data = None

# Make sure we have more noise than source
while noise_data is None or noise_data.shape[1] < src_bformat.shape[1]:
    noise_data, sr = librosa.load(noise_path, sr=44100, mono=False)
#noise_data /= noise_data.max()

In [109]:
#import numpy as np
clip_len = src_bformat.shape[1]
start_idx = np.random.randint(0, noise_data.shape[1] - clip_len)
noise_data = noise_data[:,start_idx:start_idx + clip_len]

In [110]:
print(noise_data.shape,src_bformat.shape)

(4, 344399) (4, 344399)


In [120]:
snr = 10 * np.log10(np.mean(src_bformat[0,:] ** 2) / np.mean(noise_data[0,:] ** 2))

# Sample an SNR [-20, 20] (subject to change)
snr_target = np.random.random() * 40.0 - 20.0

# Compute scaling factor for speech -> this assumes energy is preserved when going to B-format and applying SRIRs
alpha = 10.0**((snr_target - snr) / 20.0)

src_bformat *= alpha

In [121]:
snr_target

5.257133798975374

In [122]:
mix_bformat = src_bformat + noise_data

In [123]:
def rotate_90(audio):
    return scipy.signal.hilbert(audio).imag

In [124]:
# Mix to stereo according to https://en.wikipedia.org/wiki/Ambisonic_UHJ_format#UHJ_encoding_and_decoding_equations
# S = 0.9396926*W + 0.1855740*X
# D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y
# Left = (S + D)/2.0
# Right = (S - D)/2.0
S = 0.9396926 * mix_bformat[0] + 0.1855740 * mix_bformat[1]
D = rotate_90(-0.3420201 * mix_bformat[0] + 0.5098604 * mix_bformat[1]) + 0.6554516 * mix_bformat[2]
L = (S + D)/2.0
R = (S - D)/2.0
mix_mono = S
mix_stereo = np.stack([L,R])

In [125]:
import IPython.display as ipd

ipd.Audio(mix_stereo,rate=sr)

# generate features

In [None]:
eles=np.linspace(-50,50,11)
azis=np.linspace(-180,180,13)
D=steer_vector(azis,eles)
pair_idx=pairidx(0,0)
bf=beamformer(pair_idx,D)
#feature is computed by multiplying beamformer with what, the stft of mixture signal??
feat_1=mix_bformat_sp*bf#these are s_hat and there are 3 of them corresponding to X Y Z channel 
feat_2=mix_bformat_sp[0]#this is supposed to be the W channel spectrogram
feat_3=mix_bformat_sp*bf_otherpair#these are n_hat and there are 3 of them corresponding to X Y Z channel