In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from tensorflow_core.python.keras.utils import np_utils
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
import librosa
import datetime
import random
random.seed(42)
%matplotlib inline

In [None]:
# Location of the saved Keras model (.h5). The value below is a placeholder
# and must be replaced with a real path before this cell is run.
MODEL_PATH = "my mobilenet_7.h5 path" # Replace this
# Load the model once; the inference loop further down calls model.predict().
model = load_model(MODEL_PATH)

In [None]:
%%bash

# Download video
# NOTE(review): the inference cell reads a WAV whose file name ends in
# "uFwnmh0GpBo", which is not the video ID downloaded here — confirm which
# video is intended and keep the two in sync.
youtube-dl "https://www.youtube.com/watch?v=O9uvBFovKj8"

# Download audio track
# (best available audio stream, extracted and converted to WAV at the highest
# quality setting)
youtube-dl -f bestaudio --extract-audio --audio-format wav --audio-quality 0 "https://www.youtube.com/watch?v=O9uvBFovKj8"

In [None]:
# Audio track that the inference loop below will classify.
WAVE_FILE_PATH = "Driving Fails Compilation - Episode #180 HD-uFwnmh0GpBo.wav"

# Feature-extraction settings. Keys are inserted in the same order as before;
# 'fmax' is derived from the sampling rate (Nyquist frequency).
conf = {
    'sampling_rate': 22050,
    'duration': 1,  # 4
    'hop_length': 128,
    'fmin': 20,
}
conf.update(
    fmax=conf['sampling_rate'] // 2,
    n_mels=128,  # 128
    n_fft=1024,
    audio_split='head',
)

# Class name -> class number, numbered in the order listed.
categories_dictionary = {
    name: number
    for number, name in enumerate(
        ['driving', 'crash', 'horn', 'music', 'talking', 'tire', 'scream']
    )
}
all_categories_list = categories_dictionary.keys()
# Reverse lookup: class number -> class name.
inv_map = dict(zip(categories_dictionary.values(), categories_dictionary.keys()))

# Sliding-window geometry, expressed in audio samples.
frames = 51
FRAMES_PER_SEGMENT = frames - 1
# 256 * 50 = 12800 samples, i.e. about 0.58 s at 22050 Hz.
WINDOW_SIZE = 256 * FRAMES_PER_SEGMENT
# Consecutive windows overlap by half a window.
STEP_SIZE = WINDOW_SIZE // 2

def get_X_from_signal(signal, conf):
    """Return the dB-scaled mel spectrogram of an audio segment as float32.

    Parameters
    ----------
    signal : array-like
        Audio samples, handed to librosa as ``y``.
    conf : dict
        Must provide 'sampling_rate', 'n_mels', 'fmin' and 'fmax'.
        'n_fft' and 'hop_length' are honoured when present and fall back to
        1024 and 128 (the values that used to be hard-coded here).

    Returns
    -------
    numpy.ndarray
        Output of ``librosa.power_to_db`` cast to ``np.float32``.
    """
    # `y` is passed by keyword: recent librosa releases no longer accept the
    # signal positionally, while the keyword form works on old releases too.
    # Original author's note: n_fft=1024, hop_length=128 -> 128x101 output.
    spectrogram = librosa.feature.melspectrogram(
        y=signal,
        sr=conf['sampling_rate'],
        n_mels=conf['n_mels'],
        n_fft=conf.get('n_fft', 1024),
        hop_length=conf.get('hop_length', 128),
        fmin=conf['fmin'],
        fmax=conf['fmax'],
    )
    spectrogram = librosa.power_to_db(spectrogram)
    return spectrogram.astype(np.float32)


def get_normalized_clip(filename):
    """Load an audio file, resample it and peak-normalise it.

    The clip is resampled to ``conf['sampling_rate']`` (module-level config)
    and scaled so that its largest absolute sample equals 1.

    Parameters
    ----------
    filename : str
        Path of the audio file, passed to ``librosa.load``.

    Returns
    -------
    numpy.ndarray
        The normalised samples. An empty or completely silent clip is
        returned unscaled, since there is no peak to normalise against.
    """
    new_clip, sr = librosa.load(filename)
    # Keyword arguments: newer librosa releases reject positional rates.
    new_clip = librosa.resample(new_clip, orig_sr=sr, target_sr=conf['sampling_rate'])

    # np.max raises on an empty array, so treat "no samples" as "no peak".
    peak = np.max(np.abs(new_clip)) if new_clip.size else 0
    if peak == 0:
        # Avoid dividing by zero, which would fill the clip with inf/nan.
        return new_clip
    normalization_factor = 1 / peak
    return new_clip * normalization_factor
    

# Per-sample standardisation applied to every window before prediction.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    samplewise_center=True,  # Set each sample mean to 0
    samplewise_std_normalization=True  # Divide each input by its std.
)
new_clip = get_normalized_clip(WAVE_FILE_PATH)

# Slide a WINDOW_SIZE-sample window over the clip in STEP_SIZE hops and
# classify each full window. Rows are collected in a plain list and turned
# into a DataFrame once at the end: DataFrame.append copied the whole frame on
# every call and no longer exists in pandas 2.x.
window_records = []
s = 0
while s * STEP_SIZE + WINDOW_SIZE <= len(new_clip):
    start = s * STEP_SIZE
    signal = new_clip[start:start + WINDOW_SIZE]

    mel = get_X_from_signal(signal, conf)
    new_x = np.expand_dims(mel, axis=-1)
    # Three channels: the spectrogram plus its first- and second-order deltas.
    X = np.dstack([
        mel,
        librosa.feature.delta(mel, order=1),
        librosa.feature.delta(mel, order=2),
    ])
    predictions = model.predict(np.expand_dims(datagen.standardize(X), axis=0))
    prediction = np.argmax(predictions)

    window_records.append({
        'X': new_x,
        'predictions': predictions,
        'prediction': prediction,
        # Start time of the window within the clip.
        'timestamp': str(datetime.timedelta(seconds=(start / conf['sampling_rate']))),
    })
    s = s + 1

# Explicit columns keep the schema stable even when the clip is shorter than
# one window and no rows were produced.
output_dataset = pd.DataFrame(
    window_records,
    columns=['X', 'predictions', 'prediction', 'timestamp'],
)

In [None]:
def _format_srt_timestamp(total_milliseconds):
    """Render a millisecond count as an SRT ``HH:MM:SS,mmm`` timestamp."""
    seconds, millis = divmod(int(total_milliseconds), 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:02d}:{:02d}:{:02d},{:03d}'.format(hours, minutes, seconds, millis)


def create_subtitle_file(output_dataset, output_subtitle_file, sampling_freq,
                         step_size=None, categories=None,
                         probability_threshold=0.6):
    """Write the per-window predictions as an SRT subtitle file.

    One cue is written per row. A cue starts at ``index * step_size`` samples
    and lasts ``step_size`` samples, where ``index`` is the row's index label.
    The cue text is the predicted class name (only when its probability
    reaches the threshold; class number 1 is rendered bold red) followed by a
    line listing every class with its probability.

    Parameters
    ----------
    output_dataset : pandas.DataFrame
        Needs a 'predictions' column whose cells hold the class probabilities
        at position ``[0]``, and a 'prediction' column with the class number.
    output_subtitle_file : str
        Destination path. The file is overwritten, so re-running the notebook
        does not pile duplicate cues onto an existing file.
    sampling_freq : int or float
        Sampling rate (Hz) used to convert sample offsets into time.
    step_size : int, optional
        Hop between consecutive windows, in samples. Defaults to the
        module-level ``STEP_SIZE``.
    categories : dict, optional
        Class name -> class number. Defaults to the module-level
        ``categories_dictionary``.
    probability_threshold : float, optional
        Minimum top probability for the predicted label to be shown.
    """
    if step_size is None:
        step_size = STEP_SIZE
    if categories is None:
        categories = categories_dictionary
    inv_map = {v: k for k, v in categories.items()}  # {ClassNumber: ClassName}

    with open(output_subtitle_file, "w", encoding="utf-8") as sub_file:
        for index, row in output_dataset.iterrows():
            probabilities = row['predictions'][0]
            max_probability = np.max(probabilities)

            # Work in whole milliseconds so the timestamp always has the
            # three-digit fraction SRT requires, even on exact seconds.
            start_ms = index * step_size * 1000 // sampling_freq
            end_ms = (index + 1) * step_size * 1000 // sampling_freq
            cue_lines = [
                '{}'.format(index + 1),
                '{} --> {}'.format(_format_srt_timestamp(start_ms),
                                   _format_srt_timestamp(end_ms)),
            ]

            # The label line is added only when confident. Nothing is emitted
            # otherwise, because a blank line would terminate the cue early.
            if max_probability >= probability_threshold:
                if row['prediction'] == 1:
                    cue_lines.append('<b><font color="#ff0000">{}</font></b>'.format(inv_map[row['prediction']]))
                else:
                    cue_lines.append('{}'.format(inv_map[row['prediction']]))

            # All class probabilities. enumerate() pairs each value with its
            # own position, so tied probabilities keep their own class names.
            prob_string = ['{}:{:.2f}'.format(inv_map[class_number], prob)
                           for class_number, prob in enumerate(probabilities)]
            cue_lines.append('<font size="12">{}</font>'.format(' '.join(prob_string)))

            # A blank line separates consecutive cues.
            sub_file.write('\n'.join(cue_lines) + '\n\n')

In [None]:
# Use the configured sampling rate rather than repeating the literal, so the
# subtitle timing stays in step with the rate the audio was resampled to.
# "subtitle.srt path" is a placeholder for the real output path.
create_subtitle_file(output_dataset, "subtitle.srt path", conf['sampling_rate'])