# Preprocessing & Basic CNN Architecture in TensorFlow

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
from tqdm import tqdm_notebook
import librosa
import librosa.display
from scipy.misc import imsave
import errno
from keras.preprocessing.image import load_img, img_to_array
from tensorflow import layers
from tensorflow.contrib import distributions as dist
from tensorflow.contrib import slim
from sklearn.decomposition import PCA 

Using TensorFlow backend.


In [None]:
# All dataset paths below are resolved relative to the current working directory.
parentdir = os.getcwd()

# Load the UrbanSound8K metadata table.
metadata = pd.read_csv(
    os.path.join(parentdir, 'UrbanSound8K/metadata/UrbanSound8K.csv')
)

# Display the first row encountered for each distinct value of `class`.
metadata.drop_duplicates(subset='class')

Find the distribution of the classes (how balanced is the dataset?)

In [None]:
# Number of metadata rows per class, drawn as a pie chart with percentage labels.
class_counts = metadata['class'].value_counts()
ax = class_counts.plot.pie(figsize=(8,6),
                           fontsize=13,
                           autopct='%1.1f%%',
                           wedgeprops={'linewidth': 5})

# Hide the axes decorations, then force an equal aspect ratio so the pie is round.
ax.axis('off')
ax.axis('equal')
plt.show()

In [None]:
# path to dataset
# NOTE(review): the example clips are read straight from the working directory,
# not from UrbanSound8K/audio/fold*/ -- confirm they have been copied there.
sound_dir = parentdir

# One example clip per category, ordered by class ID so that position i lines up
# with sound_names[i]. The second dash-separated field of each file name is the
# class ID (the same field make_labels() parses). Previously the class-4 and
# class-0 clips were listed in each other's slot, which swapped the
# "air conditioner" and "drilling" plot titles.
unique_sounds = ["57320-0-0-4.wav","24074-1-0-0.wav","15564-2-0-1.wav","7383-3-0-0.wav",
                 "14113-4-0-1.wav","17592-5-1-0.wav","7061-6-0-0.wav",
                 "98223-7-0-0.wav","40722-8-0-0.wav","21684-9-0-5.wav"]
sound_names = ["air conditioner", "car horn", "children playing", "dog bark",
               "drilling", "engine idling", "gun shot",
               "jackhammer", "siren", "street music"]

# Guard against the two lists drifting out of step again.
assert [int(name.split('-')[1]) for name in unique_sounds] == list(range(len(sound_names))), \
    "unique_sounds must be ordered by class ID to match sound_names"

# raw time-series values for these sounds
raw = []
for u in unique_sounds:
    # librosa.load returns (samples, sample_rate); only the samples are kept.
    ts, sr = librosa.load(os.path.join(sound_dir, u))
    raw.append(ts)

# waveplot of unique sound-classes within dataset
def waves(raw_sounds, names=None):
    """Draw one waveform per row, each titled with its category name.

    Args:
        raw_sounds: iterable of 1-D audio sample sequences.
        names: iterable of title strings, paired positionally with
            `raw_sounds`. Defaults to the notebook-level `sound_names`.

    The grid gets exactly as many rows as there are (sound, name) pairs, so
    the function is no longer tied to exactly ten categories.
    """
    if names is None:
        names = sound_names

    # zip() stops at the shorter input, so size the grid from the pairs.
    pairs = list(zip(raw_sounds, names))

    fig = plt.figure(figsize=(12,15))
    for row, (samples, name) in enumerate(pairs, start=1):
        plt.subplot(len(pairs), 1, row)
        librosa.display.waveplot(np.array(samples), x_axis=None)
        plt.title(name.title())
    plt.suptitle("Waveplot", x=0.52, y=1.02, fontsize=12)
    fig.tight_layout()

waves(raw)

In [None]:
def make_sure_path_exists(path):
    """Create directory `path`, including any missing parents.

    An "already exists" error from os.makedirs is ignored; every other
    OSError is re-raised to the caller.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        # EEXIST only means there is nothing left to create.
        if err.errno == errno.EEXIST:
            return
        raise

# Convert each WAV file to a spectrogram image

In [None]:
# N_FFT = 512
# HOP_LEN = N_FFT // 2
# SR = 16000
# fold_list = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8', 'fold9', 'fold10']
# exception_count = 0
# Expected spectrogram size (rows x cols). With the commented-out settings in
# this cell, a 512-point FFT gives 512 // 2 + 1 = 257 rows, and 4 s at 16 kHz
# with a hop of 256 samples gives 64000 // 256 + 1 = 251 columns.
rows, cols = 257, 251

# oof = open('file_exceptions.txt', 'w') # redirect error exceptions for audio files

# target_spectrogram_folder_images = "UrbanSound8K/spectrograms"

# max_number_seconds = 4

# for i in range(10):
#     mypath = 'UrbanSound8K/audio/'+ fold_list[i] + '/'
#     files = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]

#     make_sure_path_exists(target_spectrogram_folder_images + "/" + fold_list[i])

#     for f in tqdm_notebook(files):
#         fn = mypath + f
#         try: 
#             data, rate = librosa.load(fn, sr=SR, mono=True)

#             # Zero pad recordings that are less than 4 seconds
#             if data.shape[0] != SR*max_number_seconds:
#                 data = np.append(data, np.zeros((SR*max_number_seconds - data.shape[0], )))

#             X = librosa.stft(data, n_fft=N_FFT, hop_length=HOP_LEN)
#             D = librosa.amplitude_to_db(np.abs(X))
#             D = np.flipud(D)

#     #             plt.imshow(D, cmap='gray')
#     #             plt.show()
#             if D.shape[0] != rows or D.shape[1] != cols:
#                 print(audio_name, D.shape)

#             imsave(target_spectrogram_folder_images + "/" + fold_list[i] + "/" + f.split(".wav")[0] + '.png', D)

#         except Exception as e:
#             print(os.path.basename(os.path.normpath(fn)), file=oof)

# Convert spectrograms to NumPy arrays

In [None]:
def make_labels(data):
    """Return the integer class label encoded in a file name or file path.

    The label is the second dash-separated field of the file name, e.g.
    "14780-9-0-0.png" -> 9.

    Args:
        data: file name or path (str) whose final component has at least two
            dash-separated fields, the second being an integer.

    Returns:
        The label as an int.

    Raises:
        IndexError: if the file name contains no "-".
        ValueError: if the second field is not an integer.
    """
    # Parse only the final path component: a hyphen in any parent directory
    # would otherwise shift the fields and select the wrong token.
    filename = os.path.basename(data)
    label = filename.split('-')[1]
    return int(label)

In [None]:
# example = os.path.join(target_spectrogram_folder_images, "fold2", "14780-9-0-0.png")
# print("file path =", example)
# print('')
# print("label:", make_labels(example))

In [None]:
# for i in range(10):
#     spectrograms_array = []
#     labels = []
#     mypath = 'UrbanSound8K/spectrograms/'+ fold_list[i] + '/'
#     files = [f for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
    
#     for f in tqdm_notebook(files):
#         image_file = mypath + f
#         img = load_img(image_file, grayscale=True)
#         x = img_to_array(img)
#         x /= 255.
#         spectrograms_array.append(x)
#         labels.append(make_labels(image_file))
#     np.save("spectrograms_fold_{0}.npy".format(i+1), np.array(spectrograms_array))
#     np.save("labels_fold_{0}.npy".format(i+1), np.array(labels))
# print("\nAll images are converted to numpy arrays!")

# Load the spectrogram arrays and build the CNN classifier

In [None]:
# Fold 1 is loaded as the training split and fold 2 as the validation split.
# (Disabled alternative for the training split:
#  'total_numpys/total_spectrograms.npy' and 'total_numpys/total_labels.npy'.)
X_train, X_val = np.load('spectrograms_fold_1.npy'), np.load('spectrograms_fold_2.npy')
y_train, y_val = np.load('labels_fold_1.npy'), np.load('labels_fold_2.npy')

print('Dataset sizes:')
for split_name, split_array in (('X_train', X_train), ('X_val', X_val),
                                ('y_train', y_train), ('y_val', y_val)):
    print(split_name + ':', split_array.shape)

In [None]:
# Preview the first four training spectrograms in a 2x2 grid.
fig = plt.figure(figsize=(10,8))

for i in range(4):
    ax = fig.add_subplot(2, 2, i+1)
    # Reshape to 2-D with the shared rows/cols constants instead of repeating
    # the literal (257, 251), so the preview follows the configured image size.
    ax.imshow(np.reshape(X_train[i], (rows, cols)), cmap='gray')
    # Label each panel so the figure can be read on its own.
    ax.set_title("class {}".format(y_train[i]))

In [None]:
n_classes = 10
n_train_samples, n_valid_samples = X_train.shape[0], X_val.shape[0]

# Row k of the identity matrix is the one-hot vector for class k, so indexing
# it with an integer label array yields one one-hot row per sample.
y_train_one_hot, y_valid_one_hot = (
    np.eye(n_classes)[labels] for labels in (y_train, y_val)
)

# print(X_val[0:32])
# (y_valid_one_hot[0:32])

# A few data augmentations (image-domain)

In [None]:
def pre_process_image(image, training, crop_size=None):
    """Augment (training) or centre-crop/pad (evaluation) a single image tensor.

    Args:
        image: 3-D image tensor (height, width, channels).
        training: Python bool. True applies the random augmentations; False
            only crops/pads around the centre to the same output size.
        crop_size: side length of the square output. When None, the
            notebook-level `img_size_cropped` is used.
            NOTE(review): `img_size_cropped` is not defined anywhere in this
            notebook -- define it or pass `crop_size` explicitly.

    Returns:
        The processed image tensor. In training mode its values are clipped
        to [0, 1].
    """
    if crop_size is None:
        crop_size = img_size_cropped

    # Take the channel count from the tensor's static shape; fall back to the
    # notebook's `n_channels` constant when it is not statically known.
    # (The previous code referenced `num_channels`, which this notebook never
    # defines.)
    channels = image.get_shape().as_list()[-1]
    if channels is None:
        channels = n_channels

    # tf.image.random_hue / random_saturation only accept 3-channel RGB input,
    # so they are skipped for single-channel spectrograms.
    is_rgb = (channels == 3)

    if training:
        # Randomly crop the input image
        image = tf.random_crop(image, size=[crop_size, crop_size, channels])
        # Randomly flip the image horizontally
        # NOTE(review): on a spectrogram this reverses the time axis -- confirm
        # that is an acceptable augmentation for this task.
        image = tf.image.random_flip_left_right(image)
        # Randomly adjust hue, contrast, brightness and saturation
        if is_rgb:
            image = tf.image.random_hue(image, max_delta=0.05)
        image = tf.image.random_contrast(image, lower=0.3, upper=1.0)
        image = tf.image.random_brightness(image, max_delta=0.2)
        if is_rgb:
            image = tf.image.random_saturation(image, lower=0.0, upper=2.0)
        # Limit the image pixels between [0, 1] in case of overflow
        image = tf.clip_by_value(image, 0.0, 1.0)
    else:
        # Crop the input image around the centre so it is the same
        # size as images that are randomly cropped during training
        image = tf.image.resize_image_with_crop_or_pad(image,
                                                       target_height=crop_size,
                                                       target_width=crop_size)

    return image


def pre_process(images, training, crop_size=None):
    """Apply pre_process_image to every image of a batch tensor.

    Args:
        images: batch tensor whose first axis indexes the images.
        training: Python bool forwarded to pre_process_image.
        crop_size: optional square output size forwarded to pre_process_image.

    Returns:
        A tensor stacking the processed images.
    """
    images = tf.map_fn(lambda image: pre_process_image(image, training, crop_size), images)

    return images

In [None]:
# Input / optimisation settings
n_channels = 1       # channel dimension of the input placeholder
batch_size = 32
lr = 0.001           # learning rate
n_epochs = 30

# Number of full mini-batches per pass; integer division leaves a trailing
# partial batch uncounted.
n_train_batches, n_valid_batches = (
    n_samples // batch_size for n_samples in (n_train_samples, n_valid_samples)
)

In [None]:
X_placeholder = tf.placeholder(tf.float32, [None, rows, cols, n_channels])
y_placeholder = tf.placeholder(tf.float32, [None, n_classes])

input_layer = X_placeholder


conv1 = tf.layers.conv2d(inputs=input_layer,
                        filters=32,
                        kernel_size=[5,5],
                        padding="same",
                        activation=tf.nn.relu)

pool1 = tf.layers.max_pooling2d(inputs=conv1,
                               pool_size=[2,2],
                               strides=2)

conv2 = tf.layers.conv2d(inputs=pool1,
                        filters=32,
                        kernel_size=[5,5],
                        padding="same",
                        activation=tf.nn.relu)

pool2 = tf.layers.max_pooling2d(inputs=conv2,
                               pool_size=[2,2],
                               strides=2)

pool2_flat = tf.reshape(pool2, [-1, 32 * 62 * 64])

fully_connected = tf.layers.dense(inputs=pool2_flat,
                                 units=1024,
                                 activation=tf.nn.relu)

dropout = tf.layers.dropout(inputs=fully_connected,
                           rate=0.4)

logits = tf.layers.dense(inputs=dropout,
                        units=n_classes)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                                labels=y_placeholder))

optimizer = tf.train.AdamOptimizer(learning_rate=lr)
train_step = optimizer.minimize(loss)

correct_prediction = tf.equal(tf.argmax(logits,1), tf.argmax(y_placeholder,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [None]:
with tf.Session() as sess:
    
    init = tf.global_variables_initializer()
    sess.run(init)
    
    for epoch in range(n_epochs):
        print()
        print("Epoch", epoch)
        
        for i in range(n_train_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            
            x_batch = X_train[start:end]
            y_batch = y_train_one_hot[start:end]
            _train_step, _loss = sess.run([train_step, loss],
                                        feed_dict=
                                        {X_placeholder:x_batch,
                                         y_placeholder:y_batch
                                        })
    
            
            if i % 100 == 0:
                _loss, _accuracy = sess.run([loss, accuracy],
                                 feed_dict={
                                     X_placeholder:x_batch,
                                     y_placeholder:y_batch
                                 })
                
                print("Minibatch loss:" , _loss)  
                print("Accuracy:", _accuracy)
          
    print()
    print("Optimization done! Calculating test error...")
    
    # TESTING
    losses = []
    acc = []

    for i in range(n_valid_batches):

        start = i * batch_size
        end = (i + 1) * batch_size

        x_test_batch = X_val[start:end]        
        y_test_batch = y_valid_one_hot[start:end]


        test_loss, test_accuracy = sess.run([loss, accuracy],
                                            feed_dict={
                                                X_placeholder:x_test_batch,
                                                y_placeholder:y_test_batch
                                            })

        losses.append(test_loss)
        acc.append(test_accuracy)

    print()
    print("Average loss on test set is:", np.mean(losses))
    print("Accuracy on test set:", np.mean(acc))