In [1]:
from helpers import *
from lstmHelpers import *

import random
from random import randrange, shuffle
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import MinMaxScaler
import time
from collections import deque
import pickle


# Directory used for every artifact of this notebook: it is passed to
# generate_data_file / load_stream and prefixes the saved plot and model paths.
filedir = "lstmData"

Using TensorFlow backend.


In [2]:
## GENERATE AND WRITE NEW STREAM DATA TO FILE

# Wall-clock reference points for this cell.
start = time.time()
total_time = time.time()

NUM_QUERY_FILES = 200  # If 0, use ALL classes; else, use n classes only
# NOTE(review): with NUM_QUERY_FILES = 0 the stream size below is 0 and the
# order list stays empty -- confirm that the "ALL classes" mode is still supported.
num_files_in_stream = 20 * NUM_QUERY_FILES

used_classes, unused_classes = load_classes("../", NUM_QUERY_FILES, display=False)

# Each class index is repeated the same number of times in the stream order.
order_to_use = []
for i in range(NUM_QUERY_FILES):
    order_to_use += [i] * int(num_files_in_stream / NUM_QUERY_FILES)

# The shared order list is reshuffled in place right before each description
# file is written: first the training stream, then the testing stream.
stream_file_suffix = "_" + str(NUM_QUERY_FILES) + ".txt"
for stream_file_prefix in ("/trainingInfo", "/testingInfo"):
    shuffle(order_to_use)
    generate_data_file(stream_file_prefix + stream_file_suffix, filedir, used_classes, unused_classes, order_to_use, max_ramp_length=0.25)

end = time.time()
elapsed = end - start
print("Time elapsed: {} seconds, or {} minutes".format(elapsed, elapsed / 60.0))

Using classes...
Exact Classes: [9423, 8232, 8988, 2376, 9137, 5671, 6858, 5638, 1638, 8470, 2360, 3621, 1731, 3441, 7895, 2894, 7878, 7165, 6504, 6440, 244, 4645, 10127, 8929, 9572, 6287, 1647, 9152, 1548, 3493, 6615, 2749, 9113, 1087, 1656, 2107, 7284, 691, 7082, 3783, 8751, 1985, 3677, 4848, 2360, 10004, 748, 7868, 5960, 7104, 2110, 1487, 1888, 994, 4310, 9078, 1344, 2919, 200, 3809, 8111, 7437, 22, 1876, 118, 7431, 9522, 8888, 9343, 4800, 358, 2800, 1887, 5263, 7459, 4699, 8652, 5011, 10034, 6477, 1821, 4593, 9603, 2638, 6136, 3140, 2673, 4342, 10114, 702, 3607, 8382, 3674, 4454, 9416, 8297, 8520, 6932, 5027, 946, 3362, 6298, 8736, 3756, 1953, 9359, 3607, 8993, 307, 6513, 2801, 6388, 2777, 9907, 344, 826, 4303, 8608, 9829, 4040, 477, 1345, 5365, 4999, 2734, 5236, 8158, 3912, 838, 7588, 7115, 4123, 3972, 8228, 1415, 1270, 10198, 2397, 9019, 8318, 1150, 4427, 6243, 6140, 4013, 3280, 219, 4136, 2902, 9801, 1395]
Matched Classes: [4171, 1120, 1781, 4099, 15, 1734, 3465, 0, 3522, 1359, 

In [3]:
## READ AND GENERATE STREAM DATA FROM FILE

def generate_composite_stream(audio_matrix, data_dict = None):
    """Render a stream description into one long signal plus per-sample labels.

    Each row of ``audio_matrix`` describes one clip. The columns read here are:
    0 = audio file path, 1 = class label written into the label array,
    3 and 4 = arguments forwarded to ``apply_ramp``, 5 = argument forwarded to
    ``apply_noise``, 6 = argument forwarded to ``timewarp``.

    Args:
        audio_matrix: indexable sequence of rows as described above.
        data_dict: optional cache mapping file path -> trimmed waveform. It is
            updated in place with every file loaded during this call; a fresh
            dict is created when omitted.

    Returns:
        ``(composite_signal, composite_matches, data_dict)``. The first two are
        flat arrays of equal length: the processed clips joined end to end,
        and the class label of the clip each sample came from. Both are empty
        when ``audio_matrix`` has no rows.
    """
    if data_dict is None:
        data_dict = {}

    total = len(audio_matrix)
    if total == 0:
        # np.concatenate refuses an empty list, so answer the empty case directly.
        return np.array([]), np.array([]), data_dict

    cur_percentage = 0
    signal_parts = []
    match_parts = []

    for i in range(total):
        # Print each whole-percent step once.
        percentage = round(i / total * 100)
        if percentage != cur_percentage:
            cur_percentage = percentage
            print(str(cur_percentage) + "%     ", end='')

        row = audio_matrix[i]
        filename = row[0]
        if filename in data_dict:
            y = data_dict[filename]
        else:
            yt, _sr = librosa.load(filename)
            y, _trim_index = librosa.effects.trim(yt, top_db=50)
            data_dict[filename] = y

        warped_y = timewarp(y, row[6])
        noisy_y = apply_noise(warped_y, row[5])
        faded_y = apply_ramp(noisy_y, row[3], row[4])
        signal_parts.append(faded_y)
        match_parts.append(np.full(faded_y.shape, row[1]))

    # Concatenate the lists directly. Wrapping them in np.array() first builds a
    # ragged object array whenever clip lengths differ, which recent NumPy
    # releases reject with a ValueError.
    composite_signal = np.concatenate(signal_parts).ravel()
    composite_matches = np.concatenate(match_parts).ravel()
    return composite_signal, composite_matches, data_dict
    
start = time.time()

# Read the training stream description and render it into one signal with
# per-sample class labels.
audio_matrix = load_stream(name="/trainingInfo_" + str(NUM_QUERY_FILES) + ".txt", filedir=filedir)
composite_signal, composite_matches, preloaded_data = generate_composite_stream(audio_matrix)
# The test stream is left disabled here. Note that the evaluation cell reads
# composite_signal_test / composite_matches_test.
#audio_matrix_test = load_stream(name="/testingInfo_" + str(NUM_QUERY_FILES) + ".txt", filedir=filedir)
#composite_signal_test, composite_matches_test, x = generate_composite_stream(audio_matrix_test, data_dict=preloaded_data)

# Rebind the intermediates to empty containers (presumably to release memory).
audio_matrix, audio_matrix_test, preloaded_data = [], [], {}

# Number of distinct labels that occur in the training stream.
NUM_CLASSES_USED = len(set(composite_matches))

end = time.time()
elapsed = end - start
print("\nTime elapsed: {} seconds, or {} minutes".format(elapsed, elapsed / 60.0))



1%     2%     3%     4%     5%     6%     7%     8%     9%     10%     11%     12%     13%     14%     15%     16%     17%     18%     19%     20%     21%     22%     23%     24%     25%     26%     27%     28%     29%     30%     31%     32%     33%     34%     35%     36%     37%     38%     39%     40%     41%     42%     43%     44%     45%     46%     47%     48%     49%     50%     51%     52%     53%     54%     55%     56%     57%     58%     59%     60%     61%     62%     63%     64%     65%     66%     67%     68%     69%     70%     71%     72%     73%     74%     75%     76%     77%     78%     79%     80%     81%     82%     83%     84%     85%     86%     87%     88%     89%     90%     91%     92%     93%     94%     95%     96%     97%     98%     99%     100%     
Time elapsed: 130.53299021720886 seconds, or 2.175549836953481 minutes


In [None]:
## MINI-BATCH TRAINING CELL

def batch(signal, matches, hop_length = 512/8, print_output=False):
    """Cut a labelled signal into overlapping windows and build one LSTM sequence per window.

    The first window holds ``2048*3`` samples; every later window is the
    previous one advanced by ``int(hop_length)`` samples (the last windows are
    shorter once the signal runs out). Each window is zero-padded with 2048
    trailing samples, passed to ``get_spectrogram`` with sample rate 22050,
    transposed, min-max scaled to [0, 1] with a scaler fitted on that window
    alone, and its first ``seq_length`` rows become one sequence.

    Reads the module-level globals ``n_mels`` and ``seq_length``; both must be
    defined before the call.

    Args:
        signal: 1-D sequence of audio samples.
        matches: per-sample class labels, indexable like ``signal``.
        hop_length: window advance in samples; truncated to an int.
        print_output: when True, print whole-percent progress steps.

    Returns:
        ``(data, classes)``. ``data`` has one entry per window, each a list of
        ``seq_length`` feature rows. ``classes`` holds the int label found at
        the newest sample of each window. Both are empty for an empty signal.

    Raises:
        ValueError: if ``hop_length`` truncates to less than 1 (the window
            could never advance).
        IndexError: if a window yields fewer than ``seq_length`` spectrogram
            rows, or ``matches`` is shorter than ``signal``.
    """
    signal_batch_length = 2048*3
    hop = int(hop_length)
    if hop < 1:
        raise ValueError("hop_length must be at least 1 sample, got " + str(hop_length))

    total = len(signal)
    data = []
    classes = []
    cur_percentage = 0

    cur_frame_count = 0              # index of the first sample not yet consumed
    num_to_add = signal_batch_length
    window_start = 0
    while cur_frame_count < total:
        # Slice ends are exclusive, so this takes exactly num_to_add new samples
        # and none is skipped between consecutive windows.
        window_end = min(cur_frame_count + num_to_add, total)
        recent_signal = np.asarray(signal[window_start:window_end])
        recent_signal = np.pad(recent_signal, (0, 2048), 'constant', constant_values=(0.0,0.0))
        spec = get_spectrogram(recent_signal, 22050, n_mels=n_mels, display=False)

        # A new scaler per window: features are normalised within the window only.
        scaler = MinMaxScaler(feature_range=(0, 1))
        transposed = scaler.fit_transform(spec.T)
        data.append([transposed[i] for i in range(seq_length)])

        # Label with the newest sample inside the window. Indexing relative to
        # the window end keeps the first window from reading matches[-1].
        classes.append(int(matches[window_end - 1]))

        if print_output:
            percentage = round((window_end - 1) / total * 100)
            if percentage != cur_percentage:
                cur_percentage = percentage
                print(str(cur_percentage) + "%     ", end='')

        cur_frame_count += num_to_add
        num_to_add = hop
        window_start += hop

    return data, classes

start = time.time()

# Feature geometry; batch() reads n_mels and seq_length as globals.
n_mels = 128
seq_length = 10

# The training signal is consumed in chunks of batch_window samples.
start_point = 0
batch_length = 4000  # In # of sequences
audio_window= 64
batch_window = batch_length * audio_window

# Model / optimisation hyper-parameters.
lstm_out = 350
batch_size = 32
embedding_vector_length = 32
dropout = 0.1
dropout_r = 0.1
number_inputs = NUM_CLASSES_USED

model = Sequential()
model.add(LSTM(lstm_out, input_shape=(seq_length, n_mels), dropout = dropout, recurrent_dropout = dropout_r))
model.add(Dense(NUM_CLASSES_USED,activation='softmax'))  #softmax
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])  # sparse_categorical_crossentropy
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

composite_signal_length = len(composite_signal)

# Training accuracy of epoch 1..4, one entry per processed chunk.
acc_0 = []
acc_1 = []
acc_2 = []
acc_3 = []

# "<=" so that a final chunk ending exactly at the end of the signal is still
# trained on; a shorter trailing remainder is skipped.
while start_point + batch_window <= composite_signal_length:
    percent_done = round(start_point / composite_signal_length * 100)
    print("Training Data: " + str(start_point) + " / " + str(composite_signal_length) + " ~= " + str(percent_done) + "%")

    chunk_end = start_point + batch_window
    training_data, training_classes = batch(composite_signal[start_point:chunk_end], composite_matches[start_point:chunk_end])
    training_data = np.array(training_data)
    # Pass the labels as an array so Keras cannot mistake the list for a list
    # of per-output targets.
    training_classes = np.asarray(training_classes)

    history = model.fit(training_data, training_classes, epochs=4, batch_size=batch_size, verbose=0, shuffle=True)
    # Older Keras reports the metric as 'acc'; Keras >= 2.3 uses the name given
    # to compile(), i.e. 'accuracy'.
    accuracies = history.history['acc'] if 'acc' in history.history else history.history['accuracy']
    acc_0.append(accuracies[0])
    acc_1.append(accuracies[1])
    acc_2.append(accuracies[2])
    acc_3.append(accuracies[3])

    # Written straight to file descriptor 1 rather than through print()
    # (presumably so progress shows up in the terminal running the kernel).
    os.write(1, str(str(percent_done) + "\n").encode())
    start_point += batch_window

os.write(1,"Completed Training\n".encode())
end = time.time()
print("Time elapsed: " + str(end-start) + " seconds, or " + str((end-start)/60.0) + " minutes")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 350)               670600    
_________________________________________________________________
dense_1 (Dense)              (None, 200)               70200     
Total params: 740,800
Trainable params: 740,800
Non-trainable params: 0
_________________________________________________________________
None
Training Data: 0 / 182115840 ~= 0%
Training Data: 256000 / 182115840 ~= 0%
Training Data: 512000 / 182115840 ~= 0%
Training Data: 768000 / 182115840 ~= 0%
Training Data: 1024000 / 182115840 ~= 1%
Training Data: 1280000 / 182115840 ~= 1%


In [None]:
from matplotlib import pyplot as mp
# The cell draws through `plt`, which this notebook never imported explicitly;
# bind it here so the cell does not depend on a star import providing it.
import matplotlib.pyplot as plt

# One curve per training epoch; each point is one chunk of the training loop.
fig, ax = plt.subplots(figsize=(15, 11))
for epoch_number, epoch_accuracies in enumerate((acc_0, acc_1, acc_2, acc_3), start=1):
    ax.plot(np.arange(len(epoch_accuracies)), epoch_accuracies, label='Epoch ' + str(epoch_number))
ax.set_xlabel('Training chunk')
ax.set_ylabel('Training accuracy')
ax.set_title('Per-epoch training accuracy for each chunk')
ax.legend()

fig.savefig(filedir + '/accuracies_' + str(NUM_QUERY_FILES) + '.png', bbox_inches='tight')

In [None]:
model.save(filedir + "/LSTMModel_" + str(NUM_QUERY_FILES) + ".h5")

## TESTING NETWORK

print("\nTesting Data")

# Build the test stream here. The stream-loading cell has these lines commented
# out, so without this the names below do not exist on a fresh kernel.
audio_matrix_test = load_stream(name="/testingInfo_" + str(NUM_QUERY_FILES) + ".txt", filedir=filedir)
composite_signal_test, composite_matches_test = generate_composite_stream(audio_matrix_test)[:2]
audio_matrix_test = []

# NOTE(review): unlike training, the whole test stream is windowed in one call,
# so every sequence is held in memory at once -- confirm this fits for large
# NUM_QUERY_FILES, or evaluate chunk by chunk as the training loop does.
testing_data, testing_classes = batch(composite_signal_test, composite_matches_test)
testing_data = np.array(testing_data)
# Pass the labels as an array so Keras cannot mistake the list for a list of
# per-output targets.
testing_classes = np.asarray(testing_classes)

scores = model.evaluate(testing_data, testing_classes)
print("Accuracy: %.2f%%" % (scores[1]*100))