In [28]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
import random
import pandas as pd

tf.logging.set_verbosity(tf.logging.INFO)

!pip install librosa
import librosa



In [0]:
# Clean recording -> complex STFT. n_fft=1024 yields 1 + 1024/2 = 513 frequency bins
# per frame; sr=None keeps the file's native sample rate instead of resampling.
s, sr = librosa.load('train_clean_male.wav', sr=None)
S_sgn = librosa.stft(s, n_fft=1024, hop_length=512)

# Noisy recording -> complex STFT computed with the same analysis parameters.
# NOTE(review): `sr` is rebound here; both files are presumably recorded at the
# same rate -- confirm.
sn, sr = librosa.load('train_dirty_male.wav', sr=None)
X_sgn = librosa.stft(sn, n_fft=1024, hop_length=512)

In [0]:

# Keep only the magnitudes and put time on the first axis: (n_frames, n_bins).
# NOTE: S_sgn is rebound from the complex STFT to its transposed magnitude, so this
# cell is not idempotent -- re-run the loading cell before running it a second time.
S_sgn = np.abs(S_sgn.T)
X_sgn_abs = np.abs(X_sgn.T)

**Padding silent frames**
* Each CNN input is a $20 \times 513$ patch made of the current frame and the 19 frames before it, so 19 filler frames are prepended to the input. This gives every one of the 2459 frames a full patch and keeps the inputs aligned with the labels.
* The filler frames are not exact zeros: they hold small random values (uniform in $[0, 0.001)$) so that they resemble near-silent frames.

In [31]:
def pad_silent(x, n_pad=19):
  """Prepend near-silent filler frames to a spectrogram.

  Args:
    x: 2-D array of shape (n_frames, n_bins).
    n_pad: number of filler frames to prepend (default 19, i.e. patch height - 1).

  Returns:
    Array of shape (n_pad + n_frames, n_bins). The first `n_pad` rows hold
    uniform random values in [0, 0.001); the remaining rows are `x` unchanged.
  """
  x = np.asarray(x)
  # One vectorised draw; the width follows the input instead of a hard-coded 513.
  pad = np.random.rand(n_pad, x.shape[1]) / 1000
  return np.concatenate((pad, x))

# Prepend the filler frames to the noisy magnitudes.
# NOTE: this rebinds X_sgn from the complex STFT to the padded magnitude matrix.
X_sgn = pad_silent(X_sgn_abs)

print(np.shape(X_sgn))

(2478, 513)


**2-D CNN  structure**


*   For the audio denoising problem, a 2-D convolutional neural network is built as follows:
* Input is a $20 \times 513$ patch (20 consecutive frames of 513 frequency bins), treated as a single-channel image
* 2 convolutional layers:
  - One with 16 filters, filter size of (3,3)
  - One with 32 filters, filter size of (3,3)
* Each convolution layer is followed by a max-pooling layer with pooling size (2,2) and a stride of 2.
* 1 fully connected (dense) layer with 2048 units and ReLU as the activation function
* Dropout of 0.4 on the dense layer
* Linear output layer (`logits` in the code) with 513 neurons to match the shape of one output frame

In [0]:
# --- Optimisation ---
lr = 1e-3
n_iterations = 1000
batch_size = 128

# --- Patch geometry: n_height frames of n_input bins in, one frame of n_output bins out ---
n_height = 20
n_input = n_output = 513

# --- Convolutional stack: filter counts and (square) kernel sizes ---
filter1, filter2 = 16, 32
kernel_size1 = kernel_size2 = 3

# --- Dense head ---
n_dense = 2048
drop_out = 0.4

In [33]:
# Network input: a batch of (n_height x n_input) patches.
X = tf.placeholder("float", [None, n_height, n_input])
# Regression target: one n_output-wide frame per patch.
Y = tf.placeholder("float", [None, n_output])
# Dropout switch. It defaults to False, so running `logits` for inference needs no
# extra feed; feed {is_training: True} in the training step to actually apply dropout.
is_training = tf.placeholder_with_default(False, shape=[], name="is_training")

# conv2d expects a channel axis, so view each patch as a single-channel image.
X1 = tf.reshape(X, [-1, n_height, n_input, 1])
# The labels already have shape [batch, n_output]. Reshaping them to
# [-1, n_height, n_output, 1] is only valid when the batch size is a multiple of
# n_height, so Y is used as-is; Y1 is kept as an alias for existing references.
Y1 = Y

# Convolutional Layer #1
conv1 = tf.layers.conv2d(
    inputs=X1,
    filters=filter1,
    kernel_size=(kernel_size1, kernel_size1),
    padding="same",
    activation=tf.nn.relu)

# Pooling Layer #1
pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=(2, 2), strides=2)

# Convolutional Layer #2 and Pooling Layer #2
conv2 = tf.layers.conv2d(
    inputs=pool1,
    filters=filter2,
    kernel_size=(kernel_size2, kernel_size2),
    padding="same",
    activation=tf.nn.relu)
pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=(2, 2), strides=2)
print(pool2.shape)

# Dense Layer. The flattened width is derived from pool2's static shape so it stays
# correct if n_height, n_input or filter2 change.
flat_dim = int(np.prod(pool2.shape.as_list()[1:]))
pool2_flat = tf.reshape(pool2, [-1, flat_dim])
dense = tf.layers.dense(inputs=pool2_flat, units=n_dense, activation=tf.nn.relu)
# tf.layers.dropout is a no-op unless `training` is true.
dropout = tf.layers.dropout(inputs=dense, rate=drop_out, training=is_training)

# Output layer: linear activations, one value per output bin.
logits = tf.layers.dense(inputs=dropout, units=n_output)
print(logits.shape)

# Mean squared error between the predicted and target frames.
loss = tf.losses.mean_squared_error(labels=Y, predictions=logits)
train_step = tf.train.AdamOptimizer(lr).minimize(loss)

# Fraction of frames whose largest predicted bin matches the largest target bin.
# This is only a rough diagnostic for a regression model; training optimises `loss`.
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

(?, 5, 128, 32)
(?, 513)


In [0]:
# Open a session on the default graph and initialise global and local variables.
sess = tf.Session()
init = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run([init, init_l])

**Data parsing for CNN input**
* From the padded $2478 \times 513$ input we need to form 2459 overlapping patches of $20 \times 513$ each.
* This is achieved using the method below.

In [35]:
def split_2d(data, window=20):
  """Cut an array into overlapping windows along its first axis.

  Window `i` holds rows `i .. i + window - 1`, so consecutive windows are
  shifted by one row.

  Args:
    data: array of shape (n_frames, n_bins).
    window: number of consecutive rows per window (default 20).

  Returns:
    Array of shape (n_frames - window + 1, window, n_bins). When `data` has
    fewer than `window` rows the result is empty with shape (0, window, n_bins).
  """
  data = np.asarray(data)
  n_windows = len(data) - window + 1
  if n_windows <= 0:
    # Keep a well-formed shape so callers can still feed or inspect the result.
    return np.empty((0, window) + data.shape[1:], dtype=data.dtype)
  return np.stack([data[i:i + window] for i in range(n_windows)])

# x: one patch per target frame; y: the magnitude frames used as targets.
x = split_2d(X_sgn)
y = S_sgn
# Patches and targets are paired by index, so their counts must agree.
assert len(x) == len(y), "number of input patches does not match number of target frames"
print(x.shape, y.shape)

(2459, 20, 513) (2459, 513)


**Mini batching**

* Training uses randomly chosen mini-batches.
* At each iteration, one of the consecutive, non-overlapping blocks of `batch_size` examples is picked at random.
* The batch and its corresponding labels are fed to the training step together.

In [0]:
# Number of consecutive batch_size-sized blocks covering the data; the last block
# may be shorter. Derived from the data instead of a hard-coded index range.
n_batches = int(np.ceil(len(x) / batch_size))

for i in range(n_iterations):
  # Pick one block at random and train on it.
  idx = random.randrange(n_batches)
  start = idx * batch_size
  batch_x = x[start:start + batch_size]
  batch_y = y[start:start + batch_size, :]
  _, batch_loss = sess.run([train_step, loss], feed_dict={X: batch_x, Y: batch_y})
  # Report progress periodically rather than printing every iteration.
  if (i + 1) % 100 == 0 or i + 1 == n_iterations:
    print(i + 1, batch_loss)

In [0]:
# Method to refine test audio files
def process_test(file, out):
  """Run a noisy recording through the trained network and save the result.

  The predicted magnitudes are combined with the phase of the noisy input and
  inverted back to a waveform of the same length as the input.

  Args:
    file: path of the audio file to process.
    out: 1 writes 'test_s_01_recons.wav'; any other value writes
      'test_s_02_recons.wav'.
  """
  test, sr = librosa.load(file, sr=None)
  test_sgn = librosa.stft(test, n_fft=1024, hop_length=512)
  # magphase returns unit-modulus phase factors, which avoids the 0/0 -> NaN that
  # dividing the STFT by its own magnitude produces for zero-valued bins.
  magnitude, phase = librosa.magphase(test_sgn)
  pad_test_s = pad_silent(magnitude.T)
  pred = sess.run(logits, feed_dict={X: split_2d(pad_test_s)})
  pred = np.multiply(phase, pred.T)
  # length= makes the output exactly as long as the input, as process_train does.
  sh_test = librosa.istft(pred, hop_length=512, length=len(test))
  if(out == 1):
    outfile = 'test_s_01_recons.wav'
  else:
    outfile = 'test_s_02_recons.wav'
  # NOTE(review): librosa.output.write_wav is only available in librosa < 0.8.
  librosa.output.write_wav(outfile, sh_test, sr)
  
  
def snr(signal, est):
  """Return the signal-to-noise ratio of `est` relative to `signal`, in dB.

  Computed as 10 * log10(sum(signal^2) / sum((signal - est)^2)).
  """
  signal_energy = np.sum(np.square(signal))
  error_energy = np.sum(np.square(signal - est))
  return 10 * np.log10(signal_energy / error_energy)

In [0]:
def process_train(file, clean_file='train_clean_male.wav'):
  """Denoise a recording, score it against a clean reference and save it.

  Args:
    file: path of the noisy audio file to process.
    clean_file: path of the clean reference used for the SNR. It must contain
      as many samples as `file`.

  Returns:
    The SNR (dB) of the reconstruction against the clean reference. The value
    is also printed, and the reconstruction is written to 'train_s_recons.wav'.
  """
  test, sr = librosa.load(file, sr=None)
  test_sgn = librosa.stft(test, n_fft=1024, hop_length=512)
  # magphase returns unit-modulus phase factors, which avoids the 0/0 -> NaN that
  # dividing the STFT by its own magnitude produces for zero-valued bins.
  magnitude, phase = librosa.magphase(test_sgn)
  pad_test_s = pad_silent(magnitude.T)
  pred = sess.run(logits, feed_dict={X: split_2d(pad_test_s)})
  pred = np.multiply(phase, pred.T)
  # length= makes the reconstruction sample-aligned with the input.
  sh_test = librosa.istft(pred, hop_length=512, length=len(test))

  clean, _ = librosa.load(clean_file, sr=None)
  # Compute the score once and reuse it for both the printout and the return value.
  score = snr(clean, sh_test)
  print(score)
  outfile = 'train_s_recons.wav'
  # NOTE(review): librosa.output.write_wav is only available in librosa < 0.8.
  librosa.output.write_wav(outfile, sh_test, sr)
  return score

In [44]:
# process_test('test_x_01.wav',1)
# process_test('test_x_02.wav',2)
print("SNR for train file :")
snr = process_train('train_dirty_male.wav')

# print(snr(test_sgn_ab, test_ns_ab)) 

SNR for train file :
13.343751430511475


In [0]:
with open("performance.txt", "a") as text_file:
    print(f"CNN 2", file=text_file)
    print(f"SNR: {snr}", file=text_file)
    print(f"LR: {lr}\t Iterations: {n_iterations}\t Batch Size: {batch_size}", file=text_file)
    print(f"Filter1: {filter1}\t Kernel1: {kernel_size1,kernel_size1}", file=text_file)
    print(f"Filter2: {filter2}\t Kernel2: {kernel_size2,kernel_size2}", file=text_file)
    print(f"Dense neurons: {n_dense}\t Drop Out: {drop_out}\n",file= text_file)

**Result**
* With the given parameters for the 2-D CNN, the SNR for the train file is 13.34. \
**SNR**: **13.343751430511475** \
**Learning Rate**: 0.001	 **Iterations**: 1000	\
**Batch Size**: 128 \
**Filter1**: 16	 **Kernel1**: 3 \
**Filter2**: 32	 **Kernel2**: 3 \
**Dense neurons**: 2048	 \
**Drop out**: 0.4
* The output files generated for the test audio samples sound reasonable, with the chips noise reduced, but judging by ear the result is not as good as that of the fully connected network.