In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import scipy
import threading
import python_speech_features
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as ipd
import math
%matplotlib inline
import librosa
import pickle
from os import listdir
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers import Dropout
from os.path import isfile, join
workDir = "/Volumes/ANKUR'S/Data for audio/voxcelebDataset"
trainDir = "/Volumes/ANKUR'S/Data for audio/voxcelebDataset/FSDKaggle2018.audio_train"
testDir = "/Volumes/ANKUR'S/Data for audio/voxcelebDataset/FSDKaggle2018.audio_test"

In [2]:
trainFiles = [f for f in listdir(trainDir) if isfile(join(trainDir, f))]
testFiles = [f for f in listdir(testDir) if isfile(join(testDir, f))]

In [3]:
print(len(trainFiles))
print(len(testFiles))

18944
3200


In [4]:
def returnMelCoeff(signal, sr):
    return librosa.feature.mfcc(signal, sr)

In [5]:
def windowHann(signal, fr, windowSize):
#     print(len(signal), windowSize)
    window = get_window("hann", windowSize)
    windowed = signal * window
    return windowed

In [6]:
def padSignal(signal, sr, windowSize, hopSize):
    signalLength = len(signal)
    numZeros = windowSize - (signalLength - math.floor(signalLength/hopSize)*hopSize);
    return np.concatenate((signal, np.zeros(numZeros)), axis=None)

In [7]:
def process(signal, sr, windowSize, hopSize):
    origLength = len(signal)
    signal = padSignal(signal, sr, windowSize, hopSize)
    mfccArray = []
#     print(origLength, len(signal))
    for i in range(0, origLength, hopSize):
        toProcess = windowHann(signal[i: i+windowSize], sr, windowSize)
        mfccs = python_speech_features.mfcc(toProcess, sr, winlen=0.056, nfft = 1235, winstep=0.056)
        mfccArray.append(mfccs[0].tolist())
    return mfccArray

In [8]:
metaFile = "/Volumes/ANKUR'S/Data for audio/voxcelebDataset/FSDKaggle2018.meta/train_post_competition.csv"
meta = pd.read_csv(metaFile)

In [9]:
windowSize = 1235
hopSize = 1102
commonData = []
threads = []

for i in trainFiles[:500]:
    fName = os.path.join(trainDir, i)
    signal, sr = librosa.load(fName)
    mfccArray = process(signal, sr, windowSize, hopSize)
    tag = str(meta.loc[meta['fname'] == i]['label']).split()[1]
    a = {tag: mfccArray}
    commonData.append({i:a})
    print("Done with: ", i)

Done with:  00044347.wav
Done with:  001ca53d.wav
Done with:  002d256b.wav
Done with:  0033e230.wav
Done with:  00353774.wav
Done with:  003b91e8.wav
Done with:  003da8e5.wav
Done with:  0048fd00.wav
Done with:  004ad66f.wav
Done with:  0063ab88.wav
Done with:  006f2f32.wav
Done with:  0075d39c.wav
Done with:  00780200.wav
Done with:  0079d310.wav
Done with:  0091fc7f.wav
Done with:  0097160c.wav
Done with:  00ad7068.wav
Done with:  00c5808a.wav
Done with:  00c82919.wav
Done with:  00c934d7.wav
Done with:  00c9e799.wav
Done with:  00cb787c.wav
Done with:  00ce569f.wav
Done with:  00d1fe46.wav
Done with:  00d3bba3.wav
Done with:  00d40fa2.wav
Done with:  00d9fa61.wav
Done with:  00e2b4cd.wav
Done with:  00f88dc5.wav
Done with:  00fbb28b.wav
Done with:  00fcbab2.wav
Done with:  010aa387.wav
Done with:  011a2185.wav
Done with:  0120d246.wav
Done with:  01235a12.wav
Done with:  01257aad.wav
Done with:  01302128.wav
Done with:  013264d3.wav
Done with:  013c3135.wav
Done with:  01506d76.wav


Done with:  09841f13.wav
Done with:  098a3205.wav
Done with:  098c2232.wav
Done with:  0994e80c.wav
Done with:  09986ba4.wav
Done with:  099ce615.wav
Done with:  09a1cfd7.wav
Done with:  09a355a9.wav
Done with:  09a585bf.wav
Done with:  09a63606.wav
Done with:  09a732e6.wav
Done with:  09a7c5ce.wav
Done with:  09a7dc35.wav
Done with:  09a895f3.wav
Done with:  09b51c66.wav
Done with:  09b8a126.wav
Done with:  09b901f8.wav
Done with:  09bc3033.wav
Done with:  09ca1e09.wav
Done with:  09ea5276.wav
Done with:  0a037e96.wav
Done with:  0a0a8d4c.wav
Done with:  0a15b36b.wav
Done with:  0a277f11.wav
Done with:  0a2a5c05.wav
Done with:  0a2b4c80.wav
Done with:  0a32271b.wav
Done with:  0a366772.wav
Done with:  0a484e9f.wav
Done with:  0a49afad.wav
Done with:  0a54c770.wav
Done with:  0a6bba04.wav
Done with:  0a6dbf2c.wav
Done with:  0a82b4d6.wav
Done with:  0a8ac55d.wav
Done with:  0a8d300c.wav
Done with:  0a925754.wav
Done with:  0a98104d.wav
Done with:  0aa17642.wav
Done with:  0aa32376.wav


In [15]:
with open('500dump.pickle', 'wb') as handle:
    pickle.dump(commonData, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
with open('500dump.pickle', 'rb') as handle:
    readData = pickle.load(handle)

In [10]:
len(readData[0]['00044347.wav']['Hi-hat'])

281

In [11]:
maxL = 0
for i in readData:
    for key, value in i.items():
        for j in i[key]:
            if len(i[key][j])>maxL:
                maxL = len(i[key][j])

In [12]:
maxL

598

In [None]:
inputShape = (598, 1)

def buildModel():
    #Initialize RNN:
    regressor = Sequential()

    #Adding the first RNN layer and some Dropout regularization
    regressor.add(SimpleRNN(units = 50, activation='tanh', return_sequences=True, input_shape= inputShape))
    regressor.add(Dropout(0.2))

    #Adding the second RNN layer and some Dropout regularization
    regressor.add(SimpleRNN(units = 50, activation='tanh', return_sequences=True))
    regressor.add(Dropout(0.2))

    #Adding the third RNN layer and some Dropout regularization
    regressor.add(SimpleRNN(units = 50, activation='tanh', return_sequences=True))
    regressor.add(Dropout(0.2))

    #Adding the fourth RNN layer and some Dropout regularization
    regressor.add(SimpleRNN(units = 50))
    regressor.add(Dropout(0.2))

    #Adding the output layer
    regressor.add(Dense(units = 1))

    #Compile the RNN
    regressor.compile(optimizer='adam', loss='mean_squared_error')
    regressor.define()

#Fitting the RNN to the Training set
# regressor.fit(X_train, y_train, epochs=100, batch_size=32)
buildModel()