In [1]:
# Few import statements
import os
import scipy.io.wavfile as wav
import numpy as np
import matplotlib.pyplot as plt
from keras.models import load_model, save_model
from keras.models import Sequential
from keras.layers.core import TimeDistributedDense
from keras.layers.recurrent import LSTM
from IPython.display import Audio
from pipes import quote

Using Theano backend.


In [2]:
def read_wav_as_np(file):
    # wav.read returns the sampling rate per second  (as an int) and the data (as a numpy array)
    data = wav.read(file)    
    # Normalize 16-bit input to [-1, 1] range
    np_arr = data[1].astype('float32') / 32767.0
    #np_arr = np.array(np_arr)
    return np_arr, data[0]

In [3]:
def write_np_as_wav(X, sample_rate, file):
    # Converting the tensor back to it's original form
    Xnew = X * 32767.0
    Xnew = Xnew.astype('int16')
    # wav.write constructs the .wav file using the specified sample_rate and tensor
    wav.write(file, sample_rate, Xnew)
    return

In [4]:
def convert_sample_blocks_to_np_audio(blocks):
    # Flattens the blocks into a single list
    song_np = np.concatenate(blocks)
    return song_np

In [5]:
def convert_np_audio_to_sample_blocks(song_np, block_size):

    # Block lists initialised
    block_lists = []

    # total_samples holds the size of the numpy array
    total_samples = song_np.shape[0]
    # print('total_samples=',total_samples)

    # num_samples_so_far is used to loop through the numpy array
    num_samples_so_far = 0

    while (num_samples_so_far < total_samples):

        # Stores each block in the "block" variable
        block = song_np[num_samples_so_far:num_samples_so_far + block_size]

        if (block.shape[0] < block_size):
            # this is to add 0's in the last block if it not completely filled
            padding = np.zeros((block_size - block.shape[0],))
            # block_size is 4400 which is fixed throughout whereas block.shape[0] for the last block is <=44100
            block = np.concatenate((block,padding))
        block_lists.append(block)
        num_samples_so_far += block_size
    return block_lists

In [6]:
def time_blocks_to_fft_blocks(blocks_time_domain):
    # FFT blocks initialized
    fft_blocks = []
    for block in blocks_time_domain:
        # Computes the one-dimensional discrete Fourier Transform and returns the complex nD array
        # i.e The truncated or zero-padded input, transformed from time domain to frequency domain.
        fft_block = np.fft.fft(block)
        # Joins a sequence of blocks along frequency axis.
        new_block = np.concatenate((np.real(fft_block), np.imag(fft_block)))
        fft_blocks.append(new_block)
    return fft_blocks

In [7]:
def fft_blocks_to_time_blocks(blocks_ft_domain):
    # Time blocks initialized
    time_blocks = []
    for block in blocks_ft_domain:
        num_elems = block.shape[0] / 2
        # Extracts real part of the amplitude corresponding to the frequency
        real_chunk = block[0:num_elems]
        # Extracts imaginary part of the amplitude corresponding to the frequency
        imag_chunk = block[num_elems:]
        # Represents amplitude as a complex number corresponding to the frequency
        new_block = real_chunk + 1.0j * imag_chunk
        # Computes the one-dimensional discrete inverse Fourier Transform and returns the transformed
        # block from frequency domain to time domain
        time_block = np.fft.ifft(new_block)
        # Joins a sequence of blocks along time axis.
        time_blocks.append(time_block)
    return time_blocks

In [10]:
sample_frequency = 44100
block_size = 4400
filename = 'F01_22GC010A_BTH.CH0.wav'
filename_noisy = "F01_22GC010A_BUS.CH0.wav"

In [11]:
# wav_array contains normalized data
wav_clean, bitrate = read_wav_as_np(filename)
# wav_array is converted into blocks with zeroes padded to fill the empty space in last block if any
wav_blocks_zero_padded_clean = convert_np_audio_to_sample_blocks(wav_clean, block_size)
wav_array_clean = convert_sample_blocks_to_np_audio(wav_blocks_zero_padded_clean)
print len(wav_blocks_zero_padded_clean), len(wav_array_clean)
wav_noisy, bitrate = read_wav_as_np(filename_noisy)
wav_blocks_zero_padded_noisy = convert_np_audio_to_sample_blocks(wav_noisy, block_size)
wav_array_noisy = convert_sample_blocks_to_np_audio(wav_blocks_zero_padded_noisy)
print len(wav_blocks_zero_padded_noisy), len(wav_array_noisy)

18 79200
17 74800


In [12]:
def making_same_length(clean, noisy):    
    if(len(clean) > len(noisy)):
        pad = len(clean) - len(noisy)
        for i in range(pad):
            noisy.append(np.zeros(block_size))
        return clean, noisy
    else:
        pad = len(noisy) - len(clean)
        
        for i in range(pad):
            clean.append(np.zeros(block_size))
        return clean, noisy

In [13]:
making_same_length(wav_blocks_zero_padded_clean, wav_blocks_zero_padded_noisy)
print len(wav_blocks_zero_padded_clean), len(wav_blocks_zero_padded_noisy)

18 18


In [14]:
print len(wav_blocks_zero_padded_noisy)

18


In [15]:
# Fast fourier transforming the wav blocks into frequency domain
print('Dimension of wav blocks before fft: ',np.shape(wav_blocks_zero_padded_clean))

Y = time_blocks_to_fft_blocks(wav_blocks_zero_padded_clean)
X = time_blocks_to_fft_blocks(wav_blocks_zero_padded_noisy)

print('Dimension of the training dataset (wav blocks after fft): ',np.shape(X))

('Dimension of wav blocks before fft: ', (18L, 4400L))
('Dimension of the training dataset (wav blocks after fft): ', (18L, 8800L))


In [16]:
f1 = "F01_22GC010A_BTH.CH0.wav"
wav_test, bitrate = read_wav_as_np(f1)
print np.shape(wav_test)
test = convert_np_audio_to_sample_blocks(wav_test, block_size)
test = time_blocks_to_fft_blocks(test)
print np.shape(test)
cur_seq = 0
chunks_val = []
max_seq_len = 1
total_seq = len(test)
while cur_seq + max_seq_len <= total_seq:
    chunks_val.append(test[cur_seq:cur_seq + max_seq_len])    
    cur_seq += max_seq_len
# Number of examples
num_examples = len(chunks_val) 
# Imaginary part requires the extra space
num_dims_out = block_size * 2
# Dimensions of the training dataset
out_shape = (num_examples, max_seq_len, num_dims_out)
val_data = np.zeros(out_shape)
# Populating the training dataset
for n in range(num_examples):
    for i in range(max_seq_len):
        val_data[n][i] = chunks_val[n][i]    
        
output = []
for it in range(val_data.shape[0]):
    seed_seq = val_data[it]
    seed_seq = np.reshape(seed_seq, (1, seed_seq.shape[0], seed_seq.shape[1]))
    seedSeqNew = model.predict(seed_seq)    
    for i in range(seedSeqNew.shape[1]):
        output.append(seedSeqNew[0][i].copy())

print np.shape(output)

(78333L,)
(18L, 8800L)


NameError: name 'model' is not defined

In [17]:
cur_seq = 0
chunks_X = []
chunks_Y = []
max_seq_len = 1
total_seq = len(X)
while cur_seq + max_seq_len <= total_seq:
    chunks_X.append(X[cur_seq:cur_seq + max_seq_len])
    chunks_Y.append(Y[cur_seq:cur_seq + max_seq_len])
    cur_seq += max_seq_len
# Number of examples
num_examples = len(chunks_X) 
# Imaginary part requires the extra space
num_dims_out = block_size * 2
# Dimensions of the training dataset
out_shape = (num_examples, max_seq_len, num_dims_out)
x_data = np.zeros(out_shape)
y_data = np.zeros(out_shape)
# Populating the training dataset
for n in range(num_examples):
    for i in range(max_seq_len):
        x_data[n][i] = chunks_X[n][i]
        y_data[n][i] = chunks_Y[n][i]

In [18]:
print np.shape(x_data)

(18L, 1L, 8800L)


In [19]:
print np.shape(x_data), np.shape(y_data)

(18L, 1L, 8800L) (18L, 1L, 8800L)


In [36]:
num_frequency_dimensions = 8800
num_hidden_dimensions = 1024
length = np.shape(x_data)[1]
print('Input layer size: ',num_frequency_dimensions)
print('Hidden layer size: ',num_hidden_dimensions)
print("length: ", length)
# Sequential is a linear stack of layers
model = Sequential()
# This layer converts frequency space to hidden space
model.add(TimeDistributedDense(input_dim=num_frequency_dimensions, output_dim=num_hidden_dimensions, input_length=length))
# return_sequences=True implies return the entire output sequence & not just the last output
model.add(LSTM(input_dim=num_hidden_dimensions, output_dim=num_hidden_dimensions, return_sequences=True))
#add LSTM
model.add(LSTM(input_dim=num_hidden_dimensions, output_dim=num_hidden_dimensions, return_sequences=True))
model.add(LSTM(input_dim=num_hidden_dimensions, output_dim=num_hidden_dimensions, return_sequences=True))
# This layer converts hidden space back to frequency space
model.add(TimeDistributedDense(input_dim=num_hidden_dimensions, output_dim=num_frequency_dimensions))
# Done building the model.Now, configure it for the learning process
model.compile(loss='mean_squared_error', optimizer='rmsprop')
model.summary()

('Input layer size: ', 8800)
('Hidden layer size: ', 1024)
('length: ', 1L)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
timedistributeddense_3 (TimeDist (None, 1L, 1024)      9012224     timedistributeddense_input_2[0][0
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 1L, 1024)      8392704     timedistributeddense_3[0][0]     
____________________________________________________________________________________________________
lstm_5 (LSTM)                    (None, 1L, 1024)      8392704     lstm_4[0][0]                     
____________________________________________________________________________________________________
lstm_6 (LSTM)                    (None, 1L, 1024)      8392704     lstm_5[0][0]                     
_______________

In [21]:
def train_model(model, x_data, y_data):
    # Number of iterations for training
    num_iters = 10
    # Number of iterations before we save our model
    epochs_per_iter = 3
    # Number of training examples pushed to the GPU per batch.
    batch_size = 256
    # Path to weights file    
    cur_iter = 0
    while cur_iter < num_iters:
        print('Iteration: ' + str(cur_iter))
        # Iterate over the training data in batches
        history = model.fit(x_data, y_data, batch_size=batch_size, nb_epoch=epochs_per_iter, verbose=1, validation_split=0.0)
        cur_iter += epochs_per_iter
    return model

In [39]:
model_name = "model_1"
#model = load_model("weights/model_1.hdf5")
print model.summary()
num_iters = 10
# Number of iterations before we save our model
epochs_per_iter = 3
# Number of training examples pushed to the GPU per batch.
batch_size = 64
# Path to weights file    
cur_iter = 0
while cur_iter < num_iters:
    print('Iteration: ' + str(cur_iter))
    # Iterate over the training data in batches
    history = model.fit(X, Y, batch_size=batch_size, nb_epoch=epochs_per_iter, verbose=1, validation_split=0.0)
    cur_iter += epochs_per_iter

model.save('weights/%s.hdf5'%model_name, overwrite=True)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
timedistributeddense_3 (TimeDist (None, 1L, 1024)      9012224     timedistributeddense_input_2[0][0
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 1L, 1024)      8392704     timedistributeddense_3[0][0]     
____________________________________________________________________________________________________
lstm_5 (LSTM)                    (None, 1L, 1024)      8392704     lstm_4[0][0]                     
____________________________________________________________________________________________________
lstm_6 (LSTM)                    (None, 1L, 1024)      8392704     lstm_5[0][0]                     
___________________________________________________________________________________________

In [40]:
y, x = data_train[10]
print x
x_block = read_file_as_blocks(x, "noisy")
x_data = convert_block_to_data(x_block)
predict_and_dump_output(model, x_data, x)

F01_053C0105_BUS.CH0.wav




In [20]:
# model = load_model("weights/model.hdf5")



In [23]:
output = []
for it in range(x_data.shape[0]):
    seed_seq = x_data[it]
    seed_seq = np.reshape(seed_seq, (1, seed_seq.shape[0], seed_seq.shape[1]))
    seedSeqNew = model.predict(seed_seq)    
    for i in range(seedSeqNew.shape[1]):
        output.append(seedSeqNew[0][i].copy())
    
            
    #output.append(seedSeqNew[])


In [24]:
print len(output)

18


In [23]:
# The path for the generated song
song_path = 'clean_2.wav'
# Reversing the conversions
time_blocks = fft_blocks_to_time_blocks(output)
song = convert_sample_blocks_to_np_audio(time_blocks)
write_np_as_wav(song, 15000, song_path)



In [25]:
def read_file_as_blocks(filename, folder):
    block_size = 4400
    filename = "files/" + folder + "/" + filename    
    wav_test, bitrate = read_wav_as_np(filename)
    test = convert_np_audio_to_sample_blocks(wav_test, block_size)    
    return test

In [26]:
#converting wav file to data
def convert_block_to_data(test):        
    test = time_blocks_to_fft_blocks(test)
    #print np.shape(test)
    cur_seq = 0
    chunks_val = []
    max_seq_len = 1
    total_seq = len(test)
    while cur_seq + max_seq_len <= total_seq:
        chunks_val.append(test[cur_seq:cur_seq + max_seq_len])    
        cur_seq += max_seq_len
    # Number of examples
    num_examples = len(chunks_val) 
    # Imaginary part requires the extra space
    num_dims_out = block_size * 2
    # Dimensions of the training dataset
    out_shape = (num_examples, max_seq_len, num_dims_out)
    data = np.zeros(out_shape)
    # Populating the training dataset
    for n in range(num_examples):
        for i in range(max_seq_len):
            data[n][i] = chunks_val[n][i]
    
    return data

In [27]:
def predict_and_dump_output(model, data, filename):    
    output = []
    for it in range(data.shape[0]):
        seed_seq = data[it]
        seed_seq = np.reshape(seed_seq, (1, seed_seq.shape[0], seed_seq.shape[1]))
        seedSeqNew = model.predict(seed_seq)    
        for i in range(seedSeqNew.shape[1]):
            output.append(seedSeqNew[0][i].copy())
    
    song_path = 'files/predict/'+ filename.split('.')[0] + '_predicted.wav'
    # Reversing the conversions
    time_blocks = fft_blocks_to_time_blocks(output)
    song = convert_sample_blocks_to_np_audio(time_blocks)
    write_np_as_wav(song, 15000, song_path)
    

In [28]:
import os
import sys
clean_names = []
noisy_names = []
walk_dir = "files/clean"

for root, subdirs, files in os.walk(walk_dir):
    for f in files:
        if(f.endswith(".CH0.wav")): 
            clean_names.append(f)

walk_dir = "files/noisy"
for root, subdirs, files in os.walk(walk_dir):
    for f in files:        
        if(f.endswith(".CH0.wav")):            
            noisy_names.append(f)

In [29]:
print len(noisy_names)

410


In [30]:
files = zip(clean_names, noisy_names)

from sklearn.cross_validation import train_test_split

data_train, data_test = train_test_split(files, test_size=0.10, random_state=42)



In [31]:
data_train[1]

('M03_22GC0105_BTH.CH0.wav', 'M03_22GC0105_BUS.CH0.wav')

In [62]:
#model.save('weights/%s.hdf5'%model_name, overwrite=True)

In [32]:
#model = load_model("weights/model.hdf5")
print x

NameError: name 'x' is not defined

In [33]:
print data_train[0]

('M03_051C0102_BTH.CH0.wav', 'M03_051C0102_BUS.CH0.wav')


In [34]:
# print np.shape(X)
# print np.shape(x_data)

# op = np.vstack((X, x_data))
# print np.shape(op)

y,x = data_train[100]
#model = load_model("weights/model_1.hdf5")
x_block = read_file_as_blocks(x, "noisy")
y_block = read_file_as_blocks(y, "clean")

print "Before", np.shape(x_block), np.shape(y_block)
y_block, x_block = making_same_length(y_block, x_block)
print "After", np.shape(x_block), np.shape(y_block)

x_data = convert_block_to_data(x_block)
y_data = convert_block_to_data(y_block)

X = x_data
Y = y_data

np.shape(y_data), np.shape(x_data)

Before (26L, 4400L) (25L, 4400L)
After (26L, 4400L) (26L, 4400L)


((26L, 1L, 8800L), (26L, 1L, 8800L))

In [35]:
for i in range(1, len(data_train)):
    
    y,x = data_train[i]
    #print x, y
    #model = load_model("weights/model_1.hdf5")
    x_block = read_file_as_blocks(x, "noisy")
    y_block = read_file_as_blocks(y, "clean")    
    
    #print "Before", np.shape(x_block), np.shape(y_block)
    
    y_block, x_block = making_same_length(y_block, x_block)
    #print "After", np.shape(x_block), np.shape(y_block)
    
    x_data = convert_block_to_data(x_block)
    y_data = convert_block_to_data(y_block)
    X = np.vstack((X, x_data))
    Y = np.vstack((Y, y_data))
    
    if(i % 10 == 0):
        print np.shape(X), np.shape(Y)

#predict_and_dump_output(model, x_data, x)

(251L, 1L, 8800L) (251L, 1L, 8800L)
(500L, 1L, 8800L) (500L, 1L, 8800L)
(769L, 1L, 8800L) (769L, 1L, 8800L)
(1056L, 1L, 8800L) (1056L, 1L, 8800L)
(1303L, 1L, 8800L) (1303L, 1L, 8800L)
(1546L, 1L, 8800L) (1546L, 1L, 8800L)
(1773L, 1L, 8800L) (1773L, 1L, 8800L)
(2003L, 1L, 8800L) (2003L, 1L, 8800L)
(2219L, 1L, 8800L) (2219L, 1L, 8800L)
(2461L, 1L, 8800L) (2461L, 1L, 8800L)
(2687L, 1L, 8800L) (2687L, 1L, 8800L)
(2908L, 1L, 8800L) (2908L, 1L, 8800L)
(3174L, 1L, 8800L) (3174L, 1L, 8800L)
(3417L, 1L, 8800L) (3417L, 1L, 8800L)
(3629L, 1L, 8800L) (3629L, 1L, 8800L)
(3844L, 1L, 8800L) (3844L, 1L, 8800L)
(4012L, 1L, 8800L) (4012L, 1L, 8800L)
(4241L, 1L, 8800L) (4241L, 1L, 8800L)
(4461L, 1L, 8800L) (4461L, 1L, 8800L)
(4700L, 1L, 8800L) (4700L, 1L, 8800L)
(4954L, 1L, 8800L) (4954L, 1L, 8800L)
(5230L, 1L, 8800L) (5230L, 1L, 8800L)
(5469L, 1L, 8800L) (5469L, 1L, 8800L)
(5695L, 1L, 8800L) (5695L, 1L, 8800L)
(5936L, 1L, 8800L) (5936L, 1L, 8800L)
(6154L, 1L, 8800L) (6154L, 1L, 8800L)
(6366L, 1L, 8800L)

In [176]:
np.shape(X), np.shape(Y)

((14939L, 1L, 8800L), (14939L, 1L, 8800L))

In [None]:
train_model()