### 1. Input
#### video/audio, conversion
### 2. Conversion to AudioSet features
### 3. Repeat & Truncate to 9.60 seconds
### 4. Generating probabilties matrix & subtitles
### 5. Intergrating with video

In [19]:
import time                                          #1
print (time.strftime("%m/%d/%Y %H:%M:%S"))

11/02/2018 13:19:50


In [1]:
import moviepy.editor as mp
import numpy as np
import os
from scipy.io import wavfile
import vggish_input
import vggish_params
import vggish_postprocess
import vggish_slim
import tensorflow as tf
from keras.models import load_model
import datetime
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

Using TensorFlow backend.


In [2]:
def truncateAudioFs(data, fs):
    i = 0
    new_data=[]
    while (i+1)*fs<len(data):
        new_data.append(data[i*fs:i*fs + int(fs*0.96)])
        i = i + 1
    new_data.append(data[i*fs::])
    new_data = np.vstack(new_data)
    return new_data

In [3]:
def prepareAudio(WAV_FILE):
    fs, data = wavfile.read(WAV_FILE)
    data = truncateAudioFs(data, fs)
    data = data / 32768.0
    return vggish_input.waveform_to_examples(data, fs)

In [4]:
def genWave(videoName):
    clip = mp.VideoFileClip(videoName)
    if clip.audio is None:
        print('No audio to recognize in the video.')
        return 0
    audioName = videoName[:-4]+'.wav'
    clip.audio.write_audiofile(audioName, codec='pcm_s16le', verbose=1)# 16 bit 44100 fs PCM wav
    return audioName

In [5]:
def getMelSpecGram(fname):
    if not os.path.isfile(fname):
        print('File does not exists.')
        return 0
    dataType = fname[-3:]
    if dataType == 'mp4':
        audioName = genWave(fname)
        mels = prepareAudio(audioName)
    elif dataType == 'wav':
        mels = prepareAudio(fname)
    return mels

In [24]:
print (time.strftime("%m/%d/%Y %H:%M:%S"))
getMelSpecGram(FILE_NAME)
print (time.strftime("%m/%d/%Y %H:%M:%S"))


11/02/2018 13:28:13
11/02/2018 13:28:18


In [22]:
def getAudioSetFeatures(fname):
    print('Start loading PCA parameters:')
    print (time.strftime("%m/%d/%Y %H:%M:%S"))
    pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')
    print('Start getting Mels:')
    print (time.strftime("%m/%d/%Y %H:%M:%S"))
    mels = getMelSpecGram(fname)
    print('Start creating sessions:')

    print (time.strftime("%m/%d/%Y %H:%M:%S"))
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        print('Finish loading VGG model:')

        print (time.strftime("%m/%d/%Y %H:%M:%S"))
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        # Run inference and postprocessing.
        print (time.strftime("%m/%d/%Y %H:%M:%S"))
        [embedding_batch] = sess.run([embedding_tensor],
                                    feed_dict={features_tensor: mels})
        print('Finish feeding VGG:')

        postprocessed_batch = pproc.postprocess(embedding_batch)
        print (time.strftime("%m/%d/%Y %H:%M:%S"))

        sample = unit8_to_float32(postprocessed_batch)
    return sample

In [7]:
def unit8_to_float32(x):
    return (np.float32(x) - 128.) / 128.

In [8]:
def pad2Ten(features):
    length = features.shape[0]
    if length <= 10:
        repetition = 10 // length + 1
        padFeatures = np.zeros((30, 128))
        start = 0
        end = length
        while start < 10:
            padFeatures[start:end] = features
            start = start + length
            end = end + length
        return padFeatures[0:10]
    else:
        return features[0:10]

In [9]:
def tenSegModelPreds(features, modelName, left=3, right=0):
    model = load_model(modelName)
    length = features.shape[0]
    inputAll = np.zeros((length, 10, 128))
    for i in range(length):
        start = max(0, i - left)
        end = min(length, i + 1 + right)
        current = features[start:end]
        current = pad2Ten(features[start:end])
        inputAll[i] = current
    predictions = model.predict(inputAll)
    return predictions

In [10]:
def getStartEnd(num):
    START_TIME = datetime.datetime(100,1,1,0,0,0)
    start = START_TIME + datetime.timedelta(0,num*1+0.001)
    end = START_TIME + datetime.timedelta(0,(num+1)*1)
    return (str(start.time()).replace('.',',')[:-3], str(end.time())+",000")

In [11]:
def giveThres(pred, thres=0.015, showSpeechMusic=False):
    with open("class_labels_indices.csv", "r") as fh:
        allclasses = fh.read().splitlines()
    classes2displaynames={int(i.split(',')[0]):i.split(',')[2] for i in allclasses[1:]}
    
    sort = np.argsort(pred)
    sort = sort[::-1]
    for i in range(527):
        if pred[sort[i+1]] < thres: break
    sort = sort[0:i]
    if showSpeechMusic == True : names = [classes2displaynames[i] for i in sort]
    else: names = [classes2displaynames[i] for i in sort if i!=0 and i!=137]
    sent = '--'.join(names)
    return sent

In [12]:
def subWrite(f, num, sub):
    f.write(str(num+1))
    f.write("\n")
    (start, end) = getStartEnd(num)
    f.write(start)
    f.write(' --> ')
    f.write(end)
    f.write("\n")
    f.write(sub)
    f.write("\n")
    f.write("\n")

In [13]:
def genSubsThres(preds, fname, thres):
    srtName = fname[:-4] + '.srt'
    with open(srtName, "w") as f:
        for i in range(len(preds)):
            sub = giveThres(preds[i], thres)
            subWrite(f, i, sub)

In [14]:
def giveSumRes(preds, thres=0.015, showSpeechMusic=False):
    pred = np.average(preds,axis=0)
    print('Tagging for the whold clip.')
    with open("class_labels_indices.csv", "r") as fh:
        allclasses = fh.read().splitlines()
    classes2displaynames={int(i.split(',')[0]):i.split(',')[2] for i in allclasses[1:]}
    sort = np.argsort(pred)
    sort = sort[::-1]
    for i in range(527):
        if pred[sort[i+1]] < thres: break
    sort = sort[0:i]
    if not showSpeechMusic:
        sort = [index for index in sort if index!=0 and index!=137] 
    predSum = 0.0
    for i in range(len(sort)):
        predSum += pred[sort[i]]
    for index in sort:
        print(classes2displaynames[index])
        print('----------------' + str(round(pred[index]/predSum*100,2)))

In [15]:
def printHeatMap(preds):
    classes = pd.read_csv("/scratch/work/xuz2/Hackathon/class_labels_indices.csv")
    heatmap = go.Heatmap(z=np.transpose(preds), 
                     y=classes['display_name'],
                     colorscale='Earth')
                                  
    data = [heatmap]

    plot(data, filename= FILE_NAME[:-4] + '_LEFT_' + str(WINDOW_LEFT) + '_RIGHT_' + 
         str(WINDOW_RIGHT) + '.html')

In [16]:
FILE_NAME = 'Game of Thrones Season 7 Official Trailer (HBO).wav'# wav or mp4
MODEL_NAME = 'model_27000[0.344128].h5'
WINDOW_LEFT = 2
WINDOW_RIGHT = 0
THRESHOLD = 0.010
SHOW_MUSIC_SPEECH = False

In [26]:
model = load_model(MODEL_NAME)

In [27]:
features = getAudioSetFeatures(FILE_NAME)

Start loading PCA parameters:
11/02/2018 13:40:50
Start getting Mels:
11/02/2018 13:40:50
Start creating sessions:
11/02/2018 13:40:54
INFO:tensorflow:Restoring parameters from vggish_model.ckpt
Finish loading VGG model:
11/02/2018 13:40:58
11/02/2018 13:40:58
Finish feeding VGG:
11/02/2018 13:41:06


In [31]:
features[0].shape

(128,)

In [33]:
print (time.strftime("%m/%d/%Y %H:%M:%S"))
model.predict(features[0:10].reshape(1,10,128))

11/02/2018 13:42:08


array([[4.58846420e-01, 5.67859551e-03, 6.77480269e-03, 1.15341938e-03,
        5.72425639e-03, 8.07672832e-03, 1.59373216e-04, 3.98593163e-03,
        2.72445381e-04, 4.36173839e-04, 7.55041139e-04, 2.74223188e-04,
        3.19529063e-04, 6.35031145e-04, 4.55133413e-04, 2.29443912e-03,
        9.24944121e-04, 3.08028975e-04, 3.51197785e-04, 1.76251645e-03,
        2.97224760e-04, 1.40096049e-03, 3.02048778e-04, 6.32540090e-04,
        9.32678638e-04, 3.16262199e-03, 3.23922560e-03, 2.83069722e-03,
        1.43822294e-03, 3.32346564e-04, 2.08653603e-03, 2.38534668e-03,
        7.77767971e-04, 1.17997627e-03, 1.46389400e-04, 7.21471326e-04,
        8.29411380e-04, 3.23723361e-04, 6.98664982e-04, 4.07305517e-04,
        8.04728654e-04, 1.31022942e-03, 2.63036869e-04, 2.31712474e-04,
        1.47413788e-02, 1.60412237e-04, 1.28638381e-02, 4.53824672e-04,
        7.60044961e-04, 5.44729119e-04, 1.91173549e-05, 1.21389492e-03,
        5.91442513e-04, 3.47103528e-03, 1.70284894e-03, 1.171051

In [23]:
getAudioSetFeatures(FILE_NAME)

Start loading PCA parameters:
11/02/2018 13:23:08
Start getting Mels:
11/02/2018 13:23:08
Start creating sessions:
11/02/2018 13:23:12
INFO:tensorflow:Restoring parameters from vggish_model.ckpt
Finish loading VGG model:
11/02/2018 13:23:16
11/02/2018 13:23:16
Finish feeding VGG:
11/02/2018 13:23:22


array([[ 0.3515625, -0.921875 ,  0.1484375, ..., -0.9609375, -0.3671875,
         0.9921875],
       [ 0.359375 , -0.90625  ,  0.140625 , ..., -1.       ,  0.0390625,
         0.9921875],
       [ 0.2578125, -0.8671875,  0.21875  , ...,  0.046875 , -0.3515625,
         0.9921875],
       ...,
       [ 0.203125 , -0.90625  ,  0.28125  , ..., -0.609375 , -1.       ,
         0.9921875],
       [ 0.1953125, -0.9375   ,  0.1640625, ..., -1.       ,  0.328125 ,
         0.9921875],
       [ 0.1875   , -0.9140625,  0.296875 , ..., -0.9765625, -0.421875 ,
         0.9921875]], dtype=float32)

In [253]:
preds = tenSegModelPreds(getAudioSetFeatures(FILE_NAME), MODEL_NAME, WINDOW_LEFT, WINDOW_RIGHT)
genSubsThres(preds, FILE_NAME, THRESHOLD)

INFO:tensorflow:Restoring parameters from vggish_model.ckpt


INFO:tensorflow:Restoring parameters from vggish_model.ckpt


In [254]:
giveSumRes(preds, THRESHOLD, SHOW_MUSIC_SPEECH)


Tagging for the whold clip.
"Squish"
----------------13.47
"Vehicle"
----------------12.84
"Gasp"
----------------8.63
"Whispering"
----------------7.98
"Musical instrument"
----------------7.48
"Single-lens reflex camera"
----------------7.42
"Inside
----------------7.19
"Sound effect"
----------------6.69
"Bang"
----------------6.24
"Male speech
----------------5.79
"Narration
----------------5.56
"Echo"
----------------5.43
"Rain"
----------------5.28


In [230]:
printHeatMap(preds)

['Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu',
            'Reds', 'Blues', 'Picnic', 'Rainbow', 'Portland', 'Jet',
            'Hot', 'Blackbody', 'Earth', 'Electric', 'Viridis', 'Cividis']