In [None]:
def graph_spectrogram(wav_file):
#{
    """Read a WAV file, draw its spectrogram and return the spectrogram matrix.

    Mono recordings are used as-is; for multi-channel recordings only the
    first channel is analysed.

    Parameters
    ----------
    wav_file : path or file-like accepted by ``wavfile.read``.

    Returns
    -------
    pxx : the first value returned by ``plt.specgram`` (the spectrogram
        array, one column per window).

    Raises
    ------
    ValueError
        If the decoded audio is neither 1-D (mono) nor 2-D (samples x channels).
    """
    rate, data = wavfile.read(wav_file)

    # Pick the channel up front so that every later line (including the
    # diagnostic prints) works for mono files, where data[:, 0] is invalid.
    if data.ndim == 1:
        channel = data
    elif data.ndim == 2:
        channel = data[:, 0]
    else:
        raise ValueError(
            "Expected 1-D or 2-D audio data, got %d dimensions" % data.ndim
        )

    print("File contains", data.shape[0] / rate, "seconds of data sampled at", rate, "Hz")
    print("Time steps in audio recording before spectrogram", channel.shape)

    nfft = 256              # Length of each window segment
    # NOTE(review): fs is fixed at 1000 and does not follow the file's own
    # `rate` -- confirm this is intentional before changing it, since it
    # affects the values specgram returns.
    fs = 1000               # Sampling frequencies
    noverlap = 128          # Overlap between windows

    # specgram also draws onto the current matplotlib axes as a side effect.
    pxx, freqs, bins, im = plt.specgram(channel, nfft, fs, noverlap = noverlap)

    print("Time steps in input after spectrogram", pxx.shape, "\n")
    return pxx
#}

In [None]:
# Both of these files fail to load because of an incomplete WAV chunk
########################################################

# graph_spectrogram('../data/Cinema_paradiso_stripped.wav')
# graph_spectrogram('../data/A_ghost_of_a_chance_stripped.wav')

In [None]:
# Model geometry (each sample is a 10 s clip)
n_freq, n_tones = 101, 88   # spectrogram frequencies per time step / piano keys in the output vector
Tx, Ty = 5511, 1375         # input time steps (~0.0018 s each) / output time steps (~0.0072 s each)

# Each call draws a spectrogram; only the last one actually gets displayed
X1 = graph_spectrogram('../data/audio/Cry_me_a_river_simple.wav')
X2 = graph_spectrogram('../data/audio/A_fine_romance_simple.wav')

# Trim each spectrogram to a whole number of Tx-long clips
# (137775 == 25 * Tx, 170841 == 31 * Tx) before splitting into samples
X1_train = song2samples(X1[:, :137775], Tx, n_freq)
X2_train = song2samples(X2[:, :170841], Tx, n_freq)
m = X1_train.shape[0] + X2_train.shape[0]

# Stack both songs into a single (m, Tx, n_freq) training tensor
X_train = np.zeros((m, Tx, n_freq))
X_train[:X1_train.shape[0], :, :] = X1_train[:, :, :]
X_train[X1_train.shape[0]:, :, :] = X2_train[:, :, :]

print("Discretized into 10 sec samples:")
print("X1 train set size:", X1_train.shape)
print("X2 train set size:", X2_train.shape)
print("Final training set size:", X_train.shape, "\n")

In [None]:
# Load the MIDI transcriptions that label the two training songs
mid1 = MidiFile('../data/Cry_me_a_river_stripped.mid')
mid2 = MidiFile('../data/A_fine_romance_stripped.mid')

# Start from all-zero (n_tones, clips * Ty) label grids and let
# format_label_data fill them in from the MIDI files
Y1 = format_label_data(mid1, np.zeros((n_tones, 25 * Ty)))
Y2 = format_label_data(mid2, np.zeros((n_tones, 31 * Ty)))

# Cut the label grids into per-clip samples
Y1_train = song2samples(Y1, Ty, n_tones)
Y2_train = song2samples(Y2, Ty, n_tones)
# The label sample count must match the input sample count m
assert m == Y1_train.shape[0] + Y2_train.shape[0]

# Stack both songs into a single (m, Ty, n_tones) label tensor
Y_train = np.zeros((m, Ty, n_tones))
Y_train[:Y1_train.shape[0], :, :] = Y1_train[:, :, :]
Y_train[Y1_train.shape[0]:, :, :] = Y2_train[:, :, :]

print("\nDiscretized into 10 sec samples:")
print("Y1 train set size:", Y1_train.shape)
print("Y2 train set size:", Y2_train.shape)
print("Final training set size:", Y_train.shape)

In [None]:
def create_conv_network(time_batch, n_freq):
#{
    """Build (but do not compile) a small 2-D convolutional Keras model.

    The input has shape (time_batch, n_freq, 1). It passes through two
    Conv2D -> ReLU -> 2x2 max-pool -> dropout stages, is flattened, and ends
    in a dense sigmoid layer with `time_batch` units. The model summary is
    printed before the model is returned.
    """
    inputs = Input(shape=(time_batch, n_freq, 1))

    # (number of filters, kernel size) for each convolutional stage.
    # NOTE(review): kernel widths of 10000 and 1000 with the default padding
    # presumably require a much larger n_freq than these -- confirm.
    conv_stages = (
        (10, (3, 10000)),
        (25, (2, 1000)),
    )

    features = inputs
    for n_filters, kernel in conv_stages:
        features = Conv2D(n_filters, kernel_size=kernel)(features)
        features = Activation('relu')(features)
        features = MaxPooling2D(pool_size=(2, 2))(features)
        # NOTE(review): Keras Dropout takes the fraction of units to DROP,
        # so 0.8 discards 80% of activations -- confirm this is intended.
        features = Dropout(0.8)(features)

    # Output head: flatten everything, then one sigmoid unit per time step
    flattened = Flatten()(features)
    outputs = Dense(time_batch, activation="sigmoid")(flattened)

    model = keras.models.Model(inputs=inputs, outputs=outputs)
    model.summary()

    return model
#}

In [None]:
# Build the convolutional model, then attach optimizer, loss and metric
model = create_conv_network(time_batch, n_freq)

# Stop once validation loss has failed to improve for 5 epochs
early_stop = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]

opt = keras.optimizers.Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    decay=0.01,
)
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Fit the model in mini-batches of 5, validating on (X_val, Y_val) each epoch
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=10,
    batch_size=5,
    callbacks=early_stop,
)

In [None]:
#model.save('../data/h5/music_model_19_02_18.h5') 

# Show which series were recorded during training
metrics_log = history.history
print(metrics_log.keys())

# Figure 1: training vs. validation accuracy per epoch
plt.figure(1)
plt.plot(metrics_log['categorical_accuracy'])
plt.plot(metrics_log['val_categorical_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'validate'], loc='upper left')

# Figure 2: training vs. validation loss per epoch
plt.figure(2)
plt.plot(metrics_log['loss'])
plt.plot(metrics_log['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'validate'], loc='upper left')

In [None]:
def format_spectral_strided_data(spectral_list, time_batch, n_freq, stride):
#{
    """Cut a spectral sequence into overlapping, fixed-length windows.

    Parameters
    ----------
    spectral_list : sequence/array of shape (n_steps, n_freq); only the real
        part is used.
    time_batch : number of consecutive time steps in each window.
    n_freq : number of frequency bins per time step.
    stride : number of time steps between the starts of successive windows.

    Returns
    -------
    X : ndarray of shape (n_windows, n_freq, time_batch), where windows start
        at 0, stride, 2*stride, ... for every start below n_steps. Windows
        that run past the end of the data are filled with zeros.
    """
    spectral_array = np.real(spectral_list)

    # Number of real time steps, taken from the data itself rather than from
    # a notebook-level global, so the function works for any input length.
    n_steps = spectral_array.shape[0]

    # Zero pad a time_batch at the end so every window is full length
    zpad = np.zeros((time_batch, n_freq))
    spectral_array = np.concatenate((spectral_array, zpad))

    strided_steps = np.arange(0, n_steps, stride)
    X = np.zeros((len(strided_steps), n_freq, time_batch))

    for i, j in enumerate(strided_steps):
        # Each window is stored frequency-major: (n_freq, time_batch)
        X[i,:,:] = spectral_array[j:j+time_batch, :].transpose()

    return X
#}

In [None]:
# # Reshape FFT data for DenseNet
# X_train = np.reshape(np.asarray(spectral_train), (m,-1,1))
# X_val = np.reshape(np.asarray(spectral_val), (m,-1,1))
# print(f"Input data shape, train: {X_train.shape}, validate: {X_val.shape}")

# Y_train = generate_contiguous_labels(midi_train, m, sec_per_step_train)
# Y_val = generate_contiguous_labels(midi_val, m, sec_per_step_val)
# print(f"Output data shape, train: {Y_train.shape}, validate: {Y_val.shape}")

In [None]:
def create_dense_network(n_freq):
#{
    """Build (but do not compile) a Conv1D + dense binary classifier.

    The input has shape (n_freq, 1). A strided Conv1D with batch norm, ReLU
    and dropout comes first, followed by two dense+dropout layers, a flatten,
    and a single sigmoid output unit. The model summary is printed before
    the model is returned.
    """
    inputs = Input(shape=(n_freq, 1))

    # Layers are applied strictly in the order listed.
    # NOTE(review): Keras Dropout takes the fraction of units to DROP, so
    # 0.8 discards 80% of activations -- confirm this is intended.
    layer_stack = [
        # Strided convolution to reduce the huge input dimension
        Conv1D(128, kernel_size=16, strides=4),
        BatchNormalization(),
        Activation('relu'),
        Dropout(0.8),
        # First dense layer
        Dense(64, activation='relu'),
        Dropout(0.8),
        # Second dense layer, one unit per remaining position
        Dense(1, activation='relu'),
        Dropout(0.8),
        # Collapse to 1D, then a single binary sigmoid unit
        Flatten(),
        Dense(1, activation="sigmoid"),
    ]

    tensor = inputs
    for layer in layer_stack:
        tensor = layer(tensor)

    model = keras.models.Model(inputs=inputs, outputs=tensor)
    model.summary()

    return model
#}

In [86]:
# Build the dense model, then attach optimizer, loss and metric
model = create_dense_network(n_freq)

# Stop once validation loss has failed to improve for 5 epochs
early_stop = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]

opt = keras.optimizers.Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    decay=0.01,
)
model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

Min/Max students uncheated upon: 0/4
Expected number of students uncheated upon: 1.99957


In [None]:
# Fit the model using a batch size of m, validating on (X_val, Y_val) each epoch
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=100,
    batch_size=m,
    callbacks=early_stop,
)

In [None]:
# Show which series were recorded during training
metrics_log = history.history
print(metrics_log.keys())

# Figure 1: training vs. validation accuracy per epoch
plt.figure(1)
plt.plot(metrics_log['binary_accuracy'])
plt.plot(metrics_log['val_binary_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'validate'], loc='upper left')

# Figure 2: training vs. validation loss per epoch
plt.figure(2)
plt.plot(metrics_log['loss'])
plt.plot(metrics_log['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'validate'], loc='upper left')