### Importing Libraries

In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import librosa 

In [2]:
import tensorflow as tf
from tensorflow import keras

### Choosing Model

In [3]:
# Frame-level classifier: three hidden ReLU layers followed by a 3-way softmax.
# Downstream code (`evaluate`) reads the three outputs as Music, Speech, Silence
# in that index order.
my_model = keras.Sequential([
    keras.layers.Dense(units=313, activation='relu'),
    keras.layers.Dense(units=192, activation='relu'),
    keras.layers.Dense(units=128, activation='relu'),
    keras.layers.Dense(units=3, activation='softmax')
])
# Restore the trained parameters from the checkpoint prefix 'saved_model/dnn'.
my_model.load_weights('saved_model/dnn')
# The last layer already applies softmax, so the model emits probabilities.
# The loss therefore must NOT treat them as logits (from_logits=False);
# otherwise softmax would effectively be applied twice whenever the loss is
# computed (training / `my_model.evaluate`).
my_model.compile(optimizer='adam',
              loss=tf.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

### Inference 

In [4]:
def evaluate(spectrogram_path, model=None):
    """Label every frame of a saved spectrogram as Music, Speech or Silence.

    Parameters
    ----------
    spectrogram_path : str
        Path to an array file readable by ``np.load``. The array is transposed
        before prediction and each entry along the first axis of the transposed
        array is flattened and classified as one frame (assumes the file holds
        a 2-D feature-by-frame array -- TODO confirm against the extractor).
    model : object, optional
        Any object exposing ``predict(batch)`` that returns one row of three
        class scores per input row. Defaults to the notebook-level
        ``my_model``.

    Returns
    -------
    list of str
        One of ``'Music'``, ``'Speech'``, ``'Silence'`` per frame, in frame
        order. An array with no frames yields an empty list.
    """
    if model is None:
        model = my_model  # notebook-level classifier built in an earlier cell

    class_names = ('Music', 'Speech', 'Silence')

    frames = np.load(spectrogram_path).T
    frame_count = frames.shape[0]
    if frame_count == 0:
        return []

    # Flatten each frame to a feature vector, as the per-frame
    # ``reshape(1, -1)`` did, but keep all frames in a single batch.
    batch = frames.reshape(frame_count, -1)

    # One batched predict() instead of one call per frame: same per-row
    # scores, without paying the predict() setup cost for every frame.
    scores = np.asarray(model.predict(batch))

    # argmax returns the first maximal index, so ties resolve in the order
    # Music, Speech, Silence -- the same precedence as a chained >= comparison.
    winners = np.argmax(scores, axis=1)
    return [class_names[int(k)] for k in winners]

In [5]:
def erode_and_dillate(output):
    """Smooth a per-frame label sequence by overwriting isolated labels.

    A frame whose two neighbours carry the same label is relabelled to match
    them. The sequence is swept once front-to-back, the first and last frames
    are then copied from their inner neighbours, and finally the sequence is
    swept back-to-front. Each sweep reads the labels already rewritten earlier
    in the same sweep.

    Parameters
    ----------
    output : list
        Per-frame labels. The list is not modified.

    Returns
    -------
    list
        A new list of the same length holding the smoothed labels. Inputs with
        fewer than two frames are returned unchanged (as a copy), since there
        is no neighbour to compare against.
    """
    # Work on a copy so the caller's list is left untouched.
    res = list(output)
    n = len(res)
    if n < 2:
        return res

    # Forward sweep over interior frames.
    for i in range(1, n - 1):
        if res[i - 1] == res[i + 1]:
            res[i] = res[i - 1]

    # Boundary frames have only one neighbour; take its label.
    res[0] = res[1]
    res[n - 1] = res[n - 2]

    # Backward sweep over interior frames (index n-2 down to 1).
    for i in range(n - 2, 0, -1):
        if res[i - 1] == res[i + 1]:
            res[i] = res[i - 1]

    return res

In [6]:
def detect_timestamps(output):
    """Collapse a per-frame label sequence into runs of identical labels.

    Parameters
    ----------
    output : iterable
        Per-frame labels, in frame order.

    Returns
    -------
    events : list
        The label of each run, in order of appearance.
    timestamps : list of tuple
        ``(start_frame, end_frame)`` for each run; both indices are inclusive,
        so a single-frame run has ``start_frame == end_frame``.
        Both lists are empty when ``output`` is empty.
    """
    labels = list(output)
    if not labels:
        # Nothing to segment.
        return [], []

    events = []
    timestamps = []
    start_frame = 0

    # A run ends wherever a label differs from its predecessor.
    for idx in range(1, len(labels)):
        if labels[idx] != labels[idx - 1]:
            events.append(labels[start_frame])
            timestamps.append((start_frame, idx - 1))
            start_frame = idx

    # Close the run that extends to the final frame.
    events.append(labels[start_frame])
    timestamps.append((start_frame, len(labels) - 1))
    return events, timestamps

In [7]:
def filter_timestamps(events, timestamps, threshold=32, frames_per_clip=312, clip_seconds=10):
    """Drop silence and short segments, and convert frame spans to times.

    Parameters
    ----------
    events : sequence
        Segment labels.
    timestamps : sequence
        One ``(start_frame, end_frame)`` pair per entry of ``events``.
    threshold : int, optional
        Minimum value of ``end_frame - start_frame`` for a segment to be kept.
    frames_per_clip : int, optional
        Divisor used to normalise a frame index (default 312; presumably the
        last frame index of a clip -- confirm against the feature extraction).
    clip_seconds : int or float, optional
        Factor applied after normalisation (default 10; presumably the clip
        length in seconds -- confirm).

    Returns
    -------
    events_new : list
        Labels of the segments that were kept.
    event_times : list of list
        ``[onset, offset]`` for each kept segment, computed as
        ``(frame / frames_per_clip) * clip_seconds``.

    Raises
    ------
    AssertionError
        If ``events`` and ``timestamps`` differ in length.
    """
    assert len(events) == len(timestamps)
    events_new = []
    event_times = []
    for event, span in zip(events, timestamps):
        # The classifier labels silence as 'Silence' (capitalised); compare
        # case-insensitively so both spellings are filtered out.
        if isinstance(event, str) and event.lower() == 'silence':
            continue
        start_frame, end_frame = span[0], span[1]
        if end_frame - start_frame >= threshold:
            event_times.append([
                (start_frame / frames_per_clip) * clip_seconds,
                (end_frame / frames_per_clip) * clip_seconds,
            ])
            events_new.append(event)

    return events_new, event_times

In [8]:
import os 

# Directory holding one saved spectrogram per test clip.
test_spectrogram_file_path = "./Test Data_AED/" 

# os.listdir returns entries in arbitrary order; sort them so the generated
# rows are reproducible, and skip sub-directories (e.g. checkpoint folders)
# that cannot be loaded as arrays.
filenames = sorted(
    name for name in os.listdir(test_spectrogram_file_path)
    if os.path.isfile(os.path.join(test_spectrogram_file_path, name))
)

# Task 1: one row per detected event. Task 2: one presence row per clip.
task1 = [['filename', 'event', 'onset', 'offset']] 
task2 = [['filename', 'Music', 'Speech']] 
for file in filenames:
    # Clip identifier = file name without its extension.
    clip_id = os.path.splitext(file)[0]

    output = evaluate(os.path.join(test_spectrogram_file_path, file))
    output = erode_and_dillate(output)
    events, timestamps = detect_timestamps(output)
    events, event_times = filter_timestamps(events, timestamps, 32)

    for event, event_time in zip(events, event_times):
        task1.append([clip_id, event, event_time[0], event_time[1]])

    # 1 if at least one surviving segment carries the label, else 0.
    music_present = int('Music' in events)
    speech_present = int('Speech' in events)
    task2.append([clip_id, music_present, speech_present])

In [9]:
# Emit the Task 1 table as comma-separated lines, header row first.
for row in task1:
    print(','.join(str(field) for field in row[:4]))

filename,event,onset,offset
test_sample-0,Speech,0.0,7.435897435897436
test_sample-0,Music,7.467948717948718,10.0
test_sample-1,Speech,0.0,7.371794871794872
test_sample-1,Music,7.403846153846154,9.903846153846153
test_sample-10,Speech,0.0,7.083333333333334
test_sample-11,Speech,0.0,7.147435897435898
test_sample-11,Music,7.17948717948718,9.775641025641026
test_sample-12,Speech,0.0,2.1474358974358974
test_sample-12,Music,2.467948717948718,6.4743589743589745
test_sample-12,Music,7.147435897435898,9.583333333333334
test_sample-13,Speech,1.9230769230769231,4.006410256410256
test_sample-13,Speech,4.935897435897436,7.019230769230768
test_sample-13,Speech,8.653846153846153,10.0
test_sample-14,Speech,0.0,2.0833333333333335
test_sample-14,Music,2.1153846153846154,5.128205128205128
test_sample-14,Speech,8.653846153846153,10.0
test_sample-15,Speech,0.0,7.371794871794872
test_sample-16,Speech,0.0,7.467948717948718
test_sample-16,Music,7.5,10.0
test_sample-17,Speech,0.0,1.1217948717948718
test_sampl

In [10]:
# Emit the Task 2 table as comma-separated lines, header row first.
for row in task2:
    print(','.join(str(field) for field in row[:3]))

filename,Music,Speech
test_sample-0,1,1
test_sample-1,1,1
test_sample-10,0,1
test_sample-11,1,1
test_sample-12,1,1
test_sample-13,0,1
test_sample-14,1,1
test_sample-15,0,1
test_sample-16,1,1
test_sample-17,1,1
test_sample-18,1,1
test_sample-19,1,1
test_sample-2,1,1
test_sample-20,1,1
test_sample-21,1,1
test_sample-22,1,1
test_sample-23,1,1
test_sample-24,1,1
test_sample-25,1,1
test_sample-26,1,1
test_sample-27,1,1
test_sample-28,1,1
test_sample-29,0,1
test_sample-3,1,1
test_sample-30,1,1
test_sample-31,1,1
test_sample-32,1,1
test_sample-33,1,1
test_sample-34,0,1
test_sample-35,0,1
test_sample-36,1,1
test_sample-37,1,1
test_sample-38,0,1
test_sample-39,1,1
test_sample-4,0,1
test_sample-40,1,1
test_sample-41,0,1
test_sample-42,1,1
test_sample-43,1,1
test_sample-44,1,1
test_sample-45,1,1
test_sample-46,1,1
test_sample-47,0,1
test_sample-48,1,1
test_sample-49,0,1
test_sample-5,1,1
test_sample-50,0,1
test_sample-51,0,1
test_sample-52,1,1
test_sample-53,1,1
test_sample-54,1,1
test_sample-55,