In [1]:
import autoreload
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import random
import os
from librosa import resample, load
from soundfile import write
import logging
import numpy as np
import matplotlib.pyplot as plt

In [5]:
import import_ipynb
from params import model_params
from bandERB import ERBBand, ERB_pro_matrix

importing Jupyter notebook from params.ipynb
importing Jupyter notebook from bandERB.ipynb


In [6]:
mixkwargs={
    "dataset_path": '/home3/user/myhsueh/',
    "speech_txt": '/home3/user/myhsueh/training_set_speech.txt',
    "noise_txt": '/home3/user/myhsueh/training_set_noise.txt',
    "rir_txt": '/home3/user/myhsueh/training_set_rir.txt',
    "RIR": False,
    "multi_noise": False,
    "downsample": True,
    }

In [7]:
def type_check(y):
    if y.dtype == 'int32': y = y / (2**31)
    elif y.dtype == 'int16': my = y / (2**15)
    elif y.dtype == 'uint8': y = y / (2**7)
    elif y.dtype == 'float32': pass
    else: 
        print('Other, data type:', y.dtype)
    return y
    
def sr_check(y, sr, target_sr = 48000):
    length = y.shape[0]
    if not sr == target_sr:
        new_y = resample(y, sr, target_sr)
        logging.info('Change sampling rate from %d to %d'%(sr, target_sr))
    else:
        new_y = y
    return target_sr, new_y

In [8]:
def Augmentation_tool(dataset_path, speech_txt, noise_txt, 
#                       speech_id, noise_id, 
                      rir_txt=None, target_sr = 48000, 
                      RIR=True, multi_noise=False, random_delay=True,
                      speech_gain_dB=[-6,6], SNRs_dB=[-10,-5,0,5,10,20,40,45], downsample=True):
   
    speech_list = open(speech_txt).readlines() 
    noise_list = open(noise_txt).readlines() 
    num_of_speech = len(speech_list) 
    num_of_noise = len(noise_list) 
    if RIR: 
        if os.path.isfile(rir_txt):
            rir_list = open(rir_txt).readlines() 
            num_of_rir = len(rir_list) 
        else:
            RIR = False
######################################################################
##                         Speech selection                         ## 
######################################################################
    speech_idx = random.randint(0,num_of_speech-1)
#     speech_idx = speech_id % num_of_speech
    speech_filename = os.path.join(dataset_path, speech_list[speech_idx].rstrip('\n'))
    
    y, sr = load(speech_filename, sr=None)
    sr, y = sr_check(y, sr)
    y = type_check(y)
    speech_length = y.shape[0]  
    while speech_length<sr*3:
        speech_idx = random.randint(0,num_of_speech-1)
        speech_filename = os.path.join(dataset_path, speech_list[speech_idx].rstrip('\n'))

        y, sr = load(speech_filename, sr=None)
        sr, y = sr_check(y, sr)
        y = type_check(y)
        speech_length = y.shape[0]  
    logging.info('[SPEECH] DONE!')
    
######################################################################
##                         Noise collection                         ##
######################################################################
    noise_idx = random.randint(0,num_of_noise-1)
    noise_filename = os.path.join(dataset_path, noise_list[noise_idx].rstrip('\n'))
    
    noise, noise_sr = load(noise_filename, sr=None)
    noise_sr, noise = sr_check(noise, noise_sr)
    noise = type_check(noise)
    noise_length = noise.shape[0]
    while noise_length<sr*3:
        noise_idx = random.randint(0,num_of_noise-1)
        noise_filename = os.path.join(dataset_path, noise_list[noise_idx].rstrip('\n'))

        noise, noise_sr = load(noise_filename, sr=None)
        noise_sr, noise = sr_check(noise, noise_sr)
        noise = type_check(noise)
        noise_length = noise.shape[0]
    if multi_noise:
        num_noise = random.randint(2,4)
        for _ in range(1,num_noise):
            noise_idx2 = random.randint(0,num_of_noise-1)
            while noise_idx2 == noise_idx:
                noise_idx2 = random.randint(0,num_of_noise-1)
            noise_filename2 = os.path.join(dataset_path, noise_list[noise_idx2].rstrip('\n'))

            noise2, noise_sr2 = load(noise_filename2, sr=None)
            noise_sr2, noise2 = sr_check(noise2, noise_sr2)
            noise2 = type_check(noise2)
            noise_length2 = noise2.shape[0]
            if noise_length2>noise_length:
                start_pad = random.randint(0,noise_length2-noise_length)
                end_pad = noise_length2 - noise_length - start_pad
                noise = np.pad(noise,(start_pad,end_pad))
            elif noise_length2<noise_length:
                start_pad = random.randint(0,noise_length-noise_length2)
                end_pad = noise_length - noise_length2 - start_pad
                noise2 = np.pad(noise2,(start_pad,end_pad))
            else:
                pass
            noise = noise + noise2
            noise_length = noise.shape[0]
            
    logging.info('[NOISE] DONE!')

######################################################################
##                             Add RIR                              ##
######################################################################
    if RIR and (random.randint(0,2) == 0):
        # random select RIR files
        rir_idx = random.randint(0,num_of_rir-1)
        rir_filename = os.path.join(dataset_path, rir_list[rir_idx].rstrip('\n'))
        # read RIR file
        rir, rir_sr = load(rir_filename, sr=None)
        rir_sr, rir = sr_check(rir, rir_sr)
        rir = type_check(rir)
        if max(abs(rir))>1:
            rir /=  (max(abs(rir))+1e-10)
        rir_length = rir.shape[0]
        if rir_length > target_sr:
            rir = rir[:target_sr]
            
        random_num = random.randint(0,2)
        if  random_num % 2 == 0:
            # convolve with RIR
            y_rir = np.convolve(y, rir, mode='same')

            target_rir = rir_attenuation(rir)
            target_y = np.convolve(y, target_rir, mode='same')
            if random_num == 0:
                noise = np.convolve(noise, rir, mode='same')
        else:
            noise = np.convolve(noise, rir, mode='same')
            y_rir = y
            target_y = y
            
        logging.info('[RIR] DONE!')    
    else: 
        y_rir = y
        target_y = y
        logging.info('[RIR] NO file') 
        
######################################################################
##                           Apply Gain                             ##
######################################################################
  
    if y_rir.shape[0]>noise.shape[0]:
        start_pad = random.randint(0,y_rir.shape[0]-noise.shape[0])
        end_pad = y_rir.shape[0] - noise.shape[0] - start_pad
        noise = np.pad(noise,(start_pad,end_pad))
    else:
        start_pad = random.randint(0,noise.shape[0]-y_rir.shape[0])
        noise = noise[start_pad:start_pad+y_rir.shape[0]]

    gain_dB = random.uniform(-6,6)
    gain_linear = 10**(gain_dB/20)
    y_rir *= gain_linear
    target_y *= gain_linear
    
    SNR = random.choice(SNRs_dB)
    SNR_linear = 10**(SNR/20)
    noise_gain = compute_SNR_gain(y_rir, noise, SNR_linear)
    noise *= noise_gain
    
    logging.info('[Gain] Speech: %.2f, Noise: %.2f, SNR(dB): %.2f'%(gain_linear,noise_gain,SNR))
        
    mixture = y_rir + noise

######################################################################
##                         Random downsample                        ##                              
######################################################################
    if downsample and (random.randint(0,10)<3):  # down to 16k and upsample to 48k
        random_cutoff = (random.randrange(6, 48, 2)) * 1000
        # Downsample
        target_sr, mixture_down = sr_check(mixture, sr, target_sr=random_cutoff)
        target_sr, target_y_down = sr_check(target_y, sr, target_sr=random_cutoff)
        # Upsample to 48
        _, mixture = sr_check(mixture_down, sr=target_sr)
        _, target_y = sr_check(target_y_down, sr=target_sr)
    else:
        random_cutoff = 48000

######################################################################
##                          Max value clip                          ##
######################################################################       
    max_val = max(max(abs(mixture)), max(abs(target_y)))
    
    if max_val>1:
        mixture /= (max_val + 1e-10)
        target_y /= (max_val +1e-10)
        logging.info('[MIXTURE] Normalized from %f.'%(max_val))
    
    return mixture, target_y
# , speech_id+1, noise_id+1

In [9]:
def compute_SNR_gain(y, noise, SNR_linear):
    speech_power = np.mean(np.abs(y))
    noise_power = np.mean(np.abs(noise))

    snr = (speech_power / (noise_power + 10**-6))
    noise_gain = snr / SNR_linear
    
    return noise_gain

def rir_attenuation(rir, offset=5, target_sr=48000):
    peak_idx = np.argmax(np.abs(rir))
    peak_idx += int((5/1000)*target_sr)
    weights = np.ones((rir.shape[0],))
    
    for i in range(rir.shape[0]-peak_idx):
        weights[i+peak_idx] = np.exp(-i/target_sr/(-0.2/np.log10(10**-3)))
    return np.multiply(rir, weights)

In [10]:
# mixture, target_y = Augmentation_tool(**mixkwargs)

In [11]:
# import tensorflow as tf
def vorbis_window(FRAME_SIZE, transpose=True):
    FRAME_SIZE = FRAME_SIZE//2
    win = np.zeros((FRAME_SIZE,))
    for i in range(FRAME_SIZE):
        win[i] = np.sin(.5*np.pi*np.sin(.5*np.pi*(i+.5)/FRAME_SIZE) * np.sin(.5*np.pi*(i+.5)/FRAME_SIZE))
    win = np.concatenate((win,np.flip(win)),0)
    if transpose: win = win.T
    return win

In [12]:
def analysis_frame(x, nfft=960, hop=480, normalize=False):
    length = len(x)
    n_frames = length // hop
    out = np.empty((n_frames, nfft//2+1),dtype=complex)
    if not length % hop == 0:
        x = np.pad(x,(0, nfft - length%hop))
    for frame_idx in range(0, n_frames * hop, hop):
        frame = x[frame_idx : frame_idx + nfft]
        if len(frame)<nfft: frame = np.pad(frame,(0,nfft-len(frame)))
#         win = np.hanning(nfft)
        win = vorbis_window(nfft)
        frame = frame.reshape(win.shape)
        frame_win = np.multiply(frame, win)
        x_fft = np.fft.rfft(frame_win, n=p.fft_size) 
        if normalize: x_fft * (p.fft_size ** -0.5)
        out[frame_idx//hop,:] = x_fft
    return out 

In [13]:
import h5py

In [14]:
length = 48000*3

In [15]:
mother_path = '/home3/user/myhsueh/h5_dataset/'

In [16]:
p = model_params('config.ini')
ERBB = ERBBand(N=p.nb_erb, high_lim=p.sr//2, NFFT=p.fft_size)
ERB_Matrix = ERB_pro_matrix(ERBB, NFFT=p.fft_size, mode=0)
iERB_Matrix = ERB_pro_matrix(ERBB, NFFT=p.fft_size, mode=1)

In [17]:
def generator(file_idx=0):
#     speech_id = file_idx * 670
#     noise_id = speech_id
    while True:
        h5f_clean_td = h5py.File(mother_path + '/TD_' + str(file_idx) + '_clean.h5', 'a')
        h5f_noisy_td = h5py.File(mother_path + '/TD_' + str(file_idx) + '_noisy.h5', 'a')
        
        mixture, target_y = Augmentation_tool(
            **mixkwargs) # time domain
        
        data_clean_td = np.empty((target_y.shape[0]//length, length))
        data_noisy_td = np.empty((mixture.shape[0]//length, length))
        for j in range(target_y.shape[0]//length):
            data_clean_td[j] = target_y[j*length:(j+1)*length]
            data_noisy_td[j] = mixture[j*length:(j+1)*length]
        
        idx_add = save_h5(h5f_clean_td,np.array(data_clean_td),'data',max_len=6000//3, flag=True)
        save_h5(h5f_noisy_td,np.array(data_noisy_td),'data',max_len=6000//3)

        h5f_clean_td.close()
        h5f_noisy_td.close()
        
        if idx_add == True: break
#     return speech_id, noise_id

In [19]:
def save_h5(h5f,data,target,max_len,flag=False):
    shape_list=list(data.shape)
    if not h5f.__contains__(target):
        shape_list[0]=None
        dataset = h5f.create_dataset(target, data=data, maxshape=tuple(shape_list), chunks=True)
        return
    else:
        dataset = h5f[target]
    len_old=dataset.shape[0]
    len_new=len_old+data.shape[0]
    if len_old>=max_len:
        pass
    else:
        if len_new>=max_len: 
            len_new = max_len
            data = data[:len_new-len_old]
        else:
            data = data
        shape_list[0]=len_new
        dataset.resize(tuple(shape_list))
        dataset[len_old:] = data
    
    if flag: 
        if len_new>=max_len: 
            return True

In [20]:
if not os.path.exists(mother_path): 
    os.makedirs(mother_path)

In [21]:
import multiprocessing as mp

In [22]:
threads = mp.cpu_count()
pool = mp.Pool(processes=10)
pool.map(generator, range(70))

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [25]:
%run h5_to_tfrecord.ipynb

file directory exist: True
['/home3/user/myhsueh/h5_dataset/TD_26_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_31_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_74KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_22_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_20_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_24_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_85KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_41_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_71KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_78KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_23_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_19_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_81KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_47_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_88KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_82KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_87KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_86KB_clean.h5', '/home3/user/myhsueh/h5_dataset/TD_44_clean.h5', '/home3/user/myhsueh/h5

Start generate TD_77KB-4-of-4.tfrecord
counter reach 2000
Transform done!
total 2000 data processed

Start generate TD_64-1-of-4.tfrecord

Start generate TD_64-2-of-4.tfrecord

Start generate TD_64-3-of-4.tfrecord

Start generate TD_64-4-of-4.tfrecord
counter reach 2000
Transform done!
total 2000 data processed

Start generate TD_80KB-1-of-4.tfrecord

Start generate TD_80KB-2-of-4.tfrecord

Start generate TD_80KB-3-of-4.tfrecord

Start generate TD_96KB-1-of-4.tfrecord

Start generate TD_80KB-4-of-4.tfrecord

Start generate TD_96KB-2-of-4.tfrecord

Start generate TD_66-1-of-4.tfrecord
counter reach 2000
Transform done!
total 2000 data processed

Start generate TD_96KB-3-of-4.tfrecord

Start generate TD_66-2-of-4.tfrecord

Start generate TD_96KB-4-of-4.tfrecord

Start generate TD_66-3-of-4.tfrecord
counter reach 2000
Transform done!
total 2000 data processed

Start generate TD_66-4-of-4.tfrecord
counter reach 2000
Transform done!
total 2000 data processed

Start generate TD_73KB-1-of-4.t

Process ForkPoolWorker-31:
Process ForkPoolWorker-25:
Process ForkPoolWorker-13:
Process ForkPoolWorker-20:
Process ForkPoolWorker-30:
Process ForkPoolWorker-16:
Process ForkPoolWorker-11:
Process ForkPoolWorker-33:
Process ForkPoolWorker-34:
Process ForkPoolWorker-27:
Process ForkPoolWorker-21:
Process ForkPoolWorker-23:
Process ForkPoolWorker-12:
Process ForkPoolWorker-15:
Process ForkPoolWorker-29:
Process ForkPoolWorker-18:
Process ForkPoolWorker-22:
Process ForkPoolWorker-28:
Process ForkPoolWorker-32:
Process ForkPoolWorker-14:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
