# WiDS Tel Aviv
## Tutorial – Dealing with the Lack of Audio Data
In recent years, speech data has been in the spotlight for a variety of deep learning applications, from Automatic Speech Recognition (ASR) systems to source separation. Yet far fewer augmentation techniques have been explored for speech data than for image data. In this track, we will therefore explore various methods for augmenting speech data. This hands-on tutorial works through the task of building a simple speech classifier with the Speech Commands Zero to Nine (SC09) dataset made available by TensorFlow, and covers traditional augmentation techniques, transfer learning, GAN augmentation, and style transfer as ways to increase classification accuracy. Participants are required to download the libraries and pre-trained models, which will be available in late January.

## Prepare training data
* Wave file to spectrogram image
* Create text file that contains path information of training data

In [None]:
# Load sample wav file
# Listen to it
# Count how many files are in each directory

In [None]:
# Convert that sample wav file into spectrogram image
# This is not for real training, so it shouldn't handle all files (classes)
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy.io import wavfile
import os
import numpy as np

# Render every .wav clip under ./gender_voice/<mode>/<label>/ as a spectrogram
# image at ./gender_voice_img/<mode>/<label>/<clip name>.jpg.
#
# For SC09 the labels would instead be:
# ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
classes = os.listdir('./gender_voice')
modes = ['']
LEN = 16000  # number of samples kept per clip (clips are cut or zero-padded to this)
for label in classes:
    for mode in modes:
        path = os.path.join('./gender_voice', mode, label)
        if not os.path.isdir(path):
            # os.listdir() also returns stray files; only label folders matter.
            continue
        target_path = os.path.join('./gender_voice_img', mode, label)
        # savefig() does not create missing folders, so make sure it exists.
        if not os.path.isdir(target_path):
            os.makedirs(target_path)

        for wav_file in os.listdir(path):
            stem, extension = os.path.splitext(wav_file)
            if extension != '.wav':
                continue

            rate, data = wavfile.read(os.path.join(path, wav_file))
            # Force every clip to exactly LEN samples so all images share
            # the same time axis.
            data = data[:LEN]
            if len(data) < LEN:
                data = np.pad(data, (0, LEN - len(data)), "constant")

            image_path = os.path.join(target_path, stem + '.jpg')
            fig, ax = plt.subplots(1)
            # Let the axes fill the whole figure: no margins around the image.
            fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
            ax.specgram(x=data, Fs=rate, noverlap=384, NFFT=512)
            ax.axis('off')
            # The former frameon='false' argument was a non-empty (truthy)
            # string, so it never switched the frame off, and newer Matplotlib
            # releases no longer accept `frameon` in savefig() at all.
            fig.savefig(image_path, dpi=300)
            # Close this specific figure so figures do not pile up in memory.
            plt.close(fig)
            print("Modification complete for ", image_path)

In [None]:
# Sample directory image files (several files per class)
# Use that to make a text file list
# And then load it to show
import os

# Write one "<image path> <class index>" line per training image into
# ./<mode>.txt, where the class index is the position of the label in `labels`.
labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
#labels = os.listdir('./sc09_img/sc09_cyclegan')
modes = ['sc09_cyclegan']

for mode in modes:
    data_path = '/home/data/speech_commands/sc09_img/' + mode

    # The context manager closes the list file even if a label folder is
    # missing and os.listdir() raises part-way through.
    with open('./' + mode + '.txt', 'w') as list_file:
        for idx, label in enumerate(labels):
            label_dir = os.path.join(data_path, label)

            # Sorted so that re-running the cell produces the same file.
            for image_name in sorted(os.listdir(label_dir)):
                image_path = os.path.join(label_dir, image_name)
                list_file.write(image_path + ' ' + str(idx) + '\n')

## Baseline model

In [4]:
# Set it to test/inference mode
# Load checkpoint for baseline model
# Show Tensorboard for training and evaluation
# Describe the code in presentation
# Let people inference with sample data


## Adding noise to train data

In [None]:
# Add noise to one sample data
# Listen to before and after
# Show spectrogram images before and after
# Tell them and show Tensorboard for noisy training
# Let them run inference on both noisy and clean data
import random
from scipy.io import wavfile
from tensorflow.python.platform import gfile
import numpy as np

LEN = 16000            # clip length in samples used for noisy clips
NOISE_VOLUME = 0.1     # scale factor applied to the background noise
NOISE_FREQUENCY = 0.8  # fraction of the training clips that receive noise
TRAIN_PREFIX = './sc09_wav/train/'
AUG_PREFIX = './sc09_wav/aug/'


def fit_length(samples, length=LEN):
    """Return `samples` cut or zero-padded at the end to exactly `length` items."""
    samples = samples[:length]
    if len(samples) < length:
        samples = np.pad(samples, (0, length - len(samples)), "constant")
    return samples


def mix_noise(samples, noise):
    """Return `samples + noise` stored in the dtype of `samples`.

    The sum is computed in float64. For integer PCM input it is rounded and
    clipped to the range of the input dtype before casting back, so the
    written file keeps the source sample format and cannot wrap around.
    """
    mixed = samples.astype(np.float64) + noise
    if np.issubdtype(samples.dtype, np.integer):
        limits = np.iinfo(samples.dtype)
        mixed = np.clip(np.rint(mixed), limits.min, limits.max)
    return mixed.astype(samples.dtype)


# call noises.wav
noise_paths = '/home/data/speech_commands/etc/_background_noise_/*.wav'
noises = []
for noise_path in gfile.Glob(noise_paths):
    #noise = librosa.core.load(noise_path, sr=None, duration=1)[0] * NOISE_VOLUME
    # Keep the first LEN samples (padding short files) so every noise clip
    # can be added sample-by-sample to a LEN-sample training clip.
    noise = fit_length(wavfile.read(noise_path)[1]) * NOISE_VOLUME
    noises.append(noise)
if not noises:
    raise IOError('No background noise files match ' + noise_paths)

# call train audio
wav_path = TRAIN_PREFIX + '*/*.wav'
waves = gfile.Glob(wav_path)
random.shuffle(waves)

# add background noise
for idx, wave in enumerate(waves):
    rate, data = wavfile.read(wave)
    # Mirror the path below the train folder into the aug folder.
    out_path = AUG_PREFIX + wave[len(TRAIN_PREFIX):]
    # Strict '<' so exactly the first NOISE_FREQUENCY share of the shuffled
    # list gets noise ('<=' always added noise to one clip too many).
    if idx < len(waves) * NOISE_FREQUENCY:
        # random.choice uses however many noise files were actually found.
        data = mix_noise(fit_length(data), random.choice(noises))
    # Write with the sample rate the clip was read with.
    wavfile.write(out_path, rate, data)

## Transfer Learning

In [None]:
# What is transfer learning
# Let's see directories of Speech Command dataset without 0~9
# Listen to some sample files
# Show SC training by adjusting num_classes
# Inference with SC09 --> bad eval result
# Now fine tuning model layer is somewhat different

# Let people load checkpoint and turn it into numpy file
import tensorflow as tf
import numpy as np
import sys
from model import AlexNetModel


# Edit just these
FILE_PATH = '/home/finetune/training/alexnet_20190220_005707/checkpoint/model_epoch7.ckpt'
NUM_CLASSES = 20
OUTPUT_FILE = 'sc_epoch7.npy'


# Restore the checkpoint at FILE_PATH into a freshly built AlexNetModel graph
# and save, for each listed layer, the pair [biases, weights] as numpy arrays
# in a dict written to OUTPUT_FILE.
if __name__ == '__main__':
    # Build the graph first so the Saver has variables to restore into.
    x = tf.placeholder(tf.float32, [128, 227, 227, 3])
    model = AlexNetModel(num_classes=NUM_CLASSES)
    model.inference(x)

    saver = tf.train.Saver()
    layers = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5', 'fc8']
    # One empty list per layer; filled below as [biases, weights].
    data = {layer_name: [] for layer_name in layers}

    with tf.Session() as sess:
        saver.restore(sess, FILE_PATH)

        for layer_name in layers:
            # reuse=True looks up the variables already created by
            # model.inference() instead of creating new ones.
            with tf.variable_scope(layer_name, reuse=True):
                layer_biases = tf.get_variable('biases')
                layer_weights = tf.get_variable('weights')
            # Fetch both tensors in one call; order is biases, then weights.
            data[layer_name].extend(sess.run([layer_biases, layer_weights]))

        np.save(OUTPUT_FILE, data)

In [None]:
# Finetuning.py
# model4tl.py --> different model architecture (addition of fc layers)
# train_layers for finetuning (optimizer) & skip layers for newly added layers (weight load)
# This will be done in the presentation
# And then show finetuned training results
# Inference with SC09 --> good results

## GAN Synthesis

In [None]:
# What is DCGAN
# Why generate
# Results unstable
# Not going to be done here (Tensorboard)
# So the solution? CycleGAN loss (more stable) --> CycleGAN generation needs "features" --> here "gender"
# Explain CycleGAN in the presentation
# Gender dataset load it, explain it, listen to it, compare & contrast two classes
# Tensorboard --> inference (generation) --> use as training data --> f