## Video search in scala

This project aims to create a Flask-based REST API that can be used to query for scenes inside a video. First, we define a few helper functions needed for this functionality.
The first group consists of the tf-idf functions needed for a plot/subtitle-based search over the videos.

### tf-Idf and cosine similarity functions

In [2]:
import math
from collections import defaultdict

def term_frequency(document):
    """Return the normalized term frequency of every distinct word.

    Args:
        document: a sequence of (hashable) tokens.

    Returns:
        A dict mapping each distinct token to ``occurrences / len(document)``.
        An empty document yields an empty dict.
    """
    # Count in a single pass; calling document.count() once per distinct
    # word rescans the whole document each time (quadratic).
    occurrences = dict()
    for word in document:
        occurrences[word] = occurrences.get(word, 0) + 1
    total_words = float(len(document))
    # The comprehension body never runs for an empty document, so there is
    # no division by zero.
    return {word: count / total_words for word, count in occurrences.items()}

def inv_doc_frequency(documents):
    """Compute the smoothed inverse document frequency of every word.

    Args:
        documents: a sequence of token sequences.

    Returns:
        A ``defaultdict`` mapping each word seen in ``documents`` to
        ``1.0 + log(len(documents) / document_frequency)``. Looking up an
        unseen word yields 0.0 (and, as with any defaultdict, inserts it).
    """
    print('Number of documents is ', len(documents))
    # `float` as the default factory gives the same 0.0 default as a lambda
    # would, but keeps the returned table picklable.
    idf = defaultdict(float)
    for doc in documents:
        # set() so that a word counts once per document, not once per occurrence
        for word in set(doc):
            idf[word] += 1
    total_docs = float(len(documents))
    # Only values are replaced here, so iterating the keys is safe.
    for word in idf:
        idf[word] = 1.0 + math.log(total_docs / idf[word])
    return idf

def tf_idf(tf_list, idf):
    """Weight every term frequency by the term's inverse document frequency.

    Args:
        tf_list: mapping of document id -> {term: term frequency}.
        idf: mapping of term -> inverse document frequency.

    Returns:
        A dict of dicts with the same keys as ``tf_list`` whose values are
        ``tf * idf`` for each term.
    """
    return {
        doc_id: {term: frequency * idf[term]
                 for term, frequency in frequencies.items()}
        for doc_id, frequencies in tf_list.items()
    }

def cosine_similarity(query, idf, tfIdf, document_size):
    """Find the document whose tf-idf vector is most similar to the query.

    Args:
        query: sequence of query tokens; duplicates are ignored, so every
            distinct term gets the same frequency ``1 / len(set(query))``.
        idf: mapping of term -> inverse document frequency; terms missing
            from it are weighted 0.
        tfIdf: mapping of document index -> {term: tf-idf weight}; it is
            indexed with ``0 .. document_size - 1``.
        document_size: number of documents to score.

    Returns:
        ``(-1, -1)`` when the query has no terms; otherwise
        ``(max_similarity, max_doc)``, where ``max_doc`` is the index of the
        best-scoring document, or the string ``"None"`` when the best score
        is 0. On ties the document with the highest index wins.
    """
    query_set = set(query)                       # take the unique words in the query
    print('query is ', query_set)
    print('IDF is', idf)
    if len(query_set) == 0:
        return (-1, -1)
    term_freq = 1.0/len(query_set)               # the frequency of each term of the query is the same

    # The query vector does not depend on the document, so its weights and
    # magnitude are computed once instead of once per document.
    query_weights = [(term, (idf[term] if term in idf else 0) * term_freq)
                     for term in query_set]
    query_mag = 0
    for _, weight in query_weights:
        query_mag += math.pow(weight, 2)

    max_similarity = 0                           # initialization of similarity
    max_doc = "None"
    for doc in range(document_size):
        doc_weights = tfIdf[doc]
        dotproductsum = 0                        # dot product of query and document
        for term, weight in query_weights:
            dotproductsum += weight * (doc_weights[term] if term in doc_weights else 0)
        doc_mag = 0                              # squared magnitude of the document vector
        for word in doc_weights:
            doc_mag += math.pow(doc_weights[word], 2)
        # add 0.001 to the denominator to avoid 0/0 division
        cosine_sim = dotproductsum / (math.sqrt(query_mag * doc_mag) + 0.001)
        if cosine_sim >= max_similarity:         # >= means a later document wins ties
            max_similarity = cosine_sim
            max_doc = doc
    if max_similarity == 0:                      # nothing matched the query at all
        max_doc = "None"
    return (max_similarity, max_doc)

### Plot to shot utility functions

In [4]:
def plot_shot_helper(plot_sentences, subtitle_data, stamp_data):
    """Link every plot sentence to video shots by way of the subtitles.

    Plot sentences are first matched to subtitles, subtitles are mapped to
    shots, and the two mappings are then combined.

    Args:
        plot_sentences: tokenized plot sentences.
        subtitle_data: dict providing 'sub_text' and 'sub_stamps'.
        stamp_data: dict providing 'scene_stamps'.

    Returns:
        A dict with the keys 'plot_to_shot', 'idf', 'tfIdf' and 'tf_list'.
    """
    matching = plot_sub_assigner(plot_sentences, subtitle_data['sub_text'])
    subs_per_sentence = matching['plot_to_sub']
    term_frequencies = matching['tf_list']
    idf_table = matching['idf']
    tf_idf_table = matching['tfIdf']
    shot_per_sub = sub_shot_assigner(subtitle_data['sub_stamps'],
                                     stamp_data['scene_stamps'])
    return {
        'plot_to_shot': plot_shot_assigner(subs_per_sentence, shot_per_sub),
        'idf': idf_table,
        'tfIdf': tf_idf_table,
        'tf_list': term_frequencies,
    }

def plot_sub_assigner(plot_sentences, sub_text):  # used by sim. function 1
    """Attach every subtitle to the plot sentence it is most similar to.

    Args:
        plot_sentences: tokenized plot sentences (the tf-idf "documents").
        sub_text: tokenized subtitles; each one is used as a query.

    Returns:
        A dict with:
          'plot_to_sub': one list per plot sentence holding
              (subtitle index, similarity) pairs, highest similarity first;
          'idf', 'tfIdf', 'tf_list': the tf-idf tables built from the plot.
    """
    sentence_count = len(plot_sentences)
    # term frequencies of every plot sentence, keyed by sentence index
    tf_list = dict()
    for position, sentence in enumerate(plot_sentences):
        tf_list[position] = term_frequency(sentence)
    idf = inv_doc_frequency(plot_sentences)
    tfIdf = tf_idf(tf_list, idf)

    plot_to_sub = [[] for _ in range(sentence_count)]
    # results that mean "empty query" or "no plot sentence matched"
    unusable = ((-1, -1), (0, 'None'))
    for sub_index, sub_tokens in enumerate(sub_text):
        outcome = cosine_similarity(sub_tokens, idf, tfIdf, sentence_count)
        if outcome in unusable:
            continue
        plot_to_sub[outcome[1]].append((sub_index, outcome[0]))

    # order the subtitles of every plot sentence by descending similarity
    for matches in plot_to_sub:
        matches.sort(key=lambda pair: pair[1], reverse=True)
    return {'plot_to_sub': plot_to_sub, 'idf': idf, 'tfIdf': tfIdf, 'tf_list': tf_list}

def sub_shot_assigner(sub_stamps, scene_stamps):
    """Assign every subtitle to one shot.

    Args:
        sub_stamps: sequence of (start, end) times, one per subtitle.
        scene_stamps: sequence of (start, end) times, one per shot, in
            playback order.

    Returns:
        A list with one shot index per subtitle. A subtitle that starts and
        ends in the same shot gets that shot. Otherwise it gets whichever of
        its first and last shots has the larger fraction of its own duration
        covered by the subtitle (the last shot on a tie).
    """
    # A timestamp at or beyond the end of the final scene belongs to the
    # final scene. (max() keeps the index at 0 when there are no scenes.)
    last_scene = max(len(scene_stamps) - 1, 0)

    def shot_containing(moment):
        # index of the first scene that ends after `moment`
        for scene_index, scene in enumerate(scene_stamps):
            if moment < scene[1]:
                return scene_index
        return last_scene

    sub_to_shot = []
    for sub in sub_stamps:
        first_shot = shot_containing(sub[0])
        final_shot = shot_containing(sub[1])
        if first_shot == final_shot:    # subtitle starts and ends in the same shot
            sub_to_shot.append(first_shot)
            continue
        # Only the first and last shots compete for the subtitle; shots that
        # lie strictly between them are never assigned to it.
        head, tail = scene_stamps[first_shot], scene_stamps[final_shot]
        head_share = (head[1] - sub[0]) / float(head[1] - head[0])
        tail_share = (sub[1] - tail[0]) / float(tail[1] - tail[0])
        sub_to_shot.append(first_shot if head_share > tail_share else final_shot)
    return sub_to_shot

def plot_shot_assigner(plot_to_sub, sub_to_shot):
    """Translate the subtitles matched to each plot sentence into shots.

    Args:
        plot_to_sub: one list per plot sentence of (subtitle index,
            similarity) pairs, e.g.
            [(35, 0.2656571164563915), (604, 0.2658152134299805)].
        sub_to_shot: shot index of every subtitle, indexed by subtitle index.

    Returns:
        One list of shot indices per plot sentence, in the order of the
        matched subtitles and with repeated shots dropped.
    """
    plot_to_shot = []
    for matches in plot_to_sub:
        ordered_shots = []
        for match in matches:
            shot = sub_to_shot[match[0]]
            # manual de-duplication keeps first-seen order, unlike list(set())
            if shot not in ordered_shots:
                ordered_shots.append(shot)
        plot_to_shot.append(ordered_shots)
    return plot_to_shot

### Main functionality

In [None]:
# -*- coding: utf-8 -*-
# above line required to make non ascii characters work
# ffmpeg -i test.mp4 -vf select='gt(scene\,0.4)' -vsync vfr thumb%04d.png
# use above to get thumbnail output
import os, sys, re
import pdb
from pyspark import SparkContext
from pyspark import SparkConf
from nltk.tokenize import sent_tokenize, word_tokenize    # nltk sentence and word tokenizers
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()                  # initialization of the lemmatizer
from nltk.corpus import stopwords                         # get list of stop words
import subprocess, shlex
# Punctuation / markup tokens that tokenizer() drops from its token list.
symbols_list = ['...', "''",  '.', '!', '?', ',', '``', '--', '[', ']', '<', '>', '♪', '/i', '/', ';', '(', ')', '-', ':', '¡', '¿']
# Module-level SparkContext (master 'local[*]', application name 'PySpark').
# VideoFile reads all of its input files through it and calls sc.stop() at
# the end of VideoFile.__init__.
sc = SparkContext('local[*]', 'PySpark')

# tokenizer functionality common to all functions
def tokenizer(query):
    """Turn a piece of text into a list of normalized tokens.

    The text is lower-cased and word-tokenized, tokens listed in
    ``symbols_list`` are dropped, the rest are lemmatized, and lemmas that
    are English stop words are dropped.

    Args:
        query: the text to tokenize (a string).

    Returns:
        The list of remaining lemmas, in their original order.
    """
    # Load the stop-word list once per call instead of once per token, and
    # use a set so that each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    tokens = []
    for token in word_tokenize(query.lower()):  # needs to be lowered for sim. 2
        if token in symbols_list:
            continue
        lemma = wordnet_lemmatizer.lemmatize(token)
        if lemma not in stop_words:
            tokens.append(lemma)
    return tokens

class VideoFile(object):
    """Search index that maps plot sentences of one video to its shots.

    Construction reads the plot text, the scene list and the subtitle file
    through the module-level SparkContext ``sc``, builds the tf-idf tables
    and the plot-sentence -> shot mapping, and then stops ``sc``.
    ``search_shot`` answers queries against the prepared data.
    """

    # Scene list that get_stamps reads when no explicit path is supplied.
    DEFAULT_SCENES_CSV = 'hdfs://localhost:9000/user/arun/scenes.csv'

    def __init__(self, path_to_srt, path_to_plot, path_to_scenes=None):
        """Load and index one video's metadata.

        Args:
            path_to_srt: path of the subtitle (.srt) file.
            path_to_plot: path of the plain-text plot description.
            path_to_scenes: optional path of the scene-list CSV; defaults to
                ``DEFAULT_SCENES_CSV``.
        """
        self.plot_sentences = self.extract_plot_sentences(path_to_plot)
        self.stamp_data = self.get_stamps(path_to_scenes)
        self.subtitle_data = self.get_subtitle_data(path_to_srt)
        plot_shot_data = plot_shot_helper(self.plot_sentences, self.subtitle_data,
                                          self.stamp_data)
        self.plot_to_shot = plot_shot_data['plot_to_shot']
        self.idf = plot_shot_data['idf']
        self.tfIdf = plot_shot_data['tfIdf']
        self.tf_list = plot_shot_data['tf_list']
        print('Processing complete')
        sys.stderr.write('\nProcessing complete\n')
        # All Spark work is done; searching only uses the in-memory tables.
        sc.stop()

    def extract_plot_sentences(self, plot_txt_path):
        """Read the plot text and return its sentences as token lists."""
        sys.stderr.write('\nStarting plot sentence processing ...\n')
        plot_file = sc.textFile(plot_txt_path)
        plot_sentences = plot_file.flatMap(sent_tokenize).coalesce(1).collect()
        return [tokenizer(sentence) for sentence in plot_sentences]

    # preprocessing to get the scenes detected in the video
    # scene stamps define the scene boundaries
    def get_stamps(self, scenes_csv_path=None):
        """Read the scene-list CSV and derive the scene boundaries.

        Args:
            scenes_csv_path: optional path of the CSV; defaults to
                ``DEFAULT_SCENES_CSV``. Its first two lines are skipped.

        Returns:
            A dict with 'time_stamps' (0 followed by one timestamp per CSV
            row) and 'scene_stamps' (consecutive (start, end) pairs).
        """
        if scenes_csv_path is None:
            scenes_csv_path = self.DEFAULT_SCENES_CSV
        # (message text kept as-is; this step reads the scene list)
        sys.stderr.write('\nSubtitles processing started ...')
        # collect() returns the rows of every partition in order; taking only
        # element 0 of glom().collect() would silently drop all rows that
        # Spark placed in other partitions.
        rows = (sc.textFile(scenes_csv_path)
                .zipWithIndex()
                .filter(lambda x: x[1] > 1)
                .map(lambda line: line[0].split(','))
                .collect())
        return self._stamps_from_rows(rows)

    @staticmethod
    def _stamps_from_rows(rows):
        """Build the time-stamp and scene-stamp lists from split CSV rows."""
        time_stamps, scene_stamps = [0], []
        for row in rows:
            time_stamps.append(float(row[3]))  # the timestamp is the fourth field (index 3)
            scene_stamps.append((time_stamps[-2], time_stamps[-1]))
        return {'time_stamps': time_stamps, 'scene_stamps': scene_stamps}

    def get_subtitle_data(self, sub_file_path):
        """Read an .srt file and return its timings and text.

        Returns:
            A dict with 'sub_stamps' ((start, end) in seconds per subtitle),
            'raw_sub_text' (subtitle text as written) and 'sub_text' (the
            same text tokenized).
        """
        lines = sc.newAPIHadoopFile(sub_file_path,
                                    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                                    "org.apache.hadoop.io.LongWritable",
                                    "org.apache.hadoop.io.Text",
            conf={"textinputformat.record.delimiter": '\n'}).map(lambda line: line[1]).collect()
        print('Processing subs ...')
        sub_stamps, raw_sub_text = self._parse_srt_records(lines)
        sub_text = [tokenizer(sub) for sub in raw_sub_text]
        return {'sub_stamps': sub_stamps, 'sub_text': sub_text, 'raw_sub_text': raw_sub_text}

    @staticmethod
    def _srt_time_to_seconds(stamp):
        """Convert an 'HH:MM:SS,mmm' time code into seconds."""
        fields = stamp.strip().split(':')
        # The hours field must be included, otherwise every time code past
        # the first hour wraps back to the start of the video.
        return (float(fields[0]) * 3600
                + float(fields[1]) * 60
                + float(fields[2].replace(',', '.')))

    @staticmethod
    def _parse_srt_records(lines):
        """Split the lines of an .srt file into timings and text.

        Records are runs of non-blank lines: a counter line, a timing line
        ('start --> end') and the text lines.

        Returns:
            ``(sub_stamps, raw_sub_text)`` with one entry per record.
        """
        sub_stamps, raw_sub_text, buf = [], [], []
        last_index = len(lines) - 1
        for index, line in enumerate(lines):
            stripped = line.strip()
            if stripped:
                buf.append(stripped)
            if not stripped or index == last_index:
                # Repeated or trailing blank lines leave nothing buffered;
                # there is no timing line to read in that case.
                if len(buf) >= 2:
                    timing = re.split(' --> ', buf[1])
                    sub_stamps.append((VideoFile._srt_time_to_seconds(timing[0]),
                                       VideoFile._srt_time_to_seconds(timing[1])))
                    raw_sub_text.append(' '.join(buf[2:]))
                buf = []
        return sub_stamps, raw_sub_text

    def search_shot(self, query):
        """Find the shots of the plot sentence that best matches a query.

        Args:
            query: the search text (a string).

        Returns:
            ``{'shot_timestamps': [...], 'max_sim': similarity}`` for a
            match, or ``{'shot_timestamps': -1, 'max_sim': -1}`` when the
            query has no usable terms or matches no plot sentence.
        """
        processed_query = tokenizer(query)            # tokenizer accepts a string
        max_sim, max_sim_sentence = cosine_similarity(processed_query,
                                                      self.idf, self.tfIdf,
                                                      len(self.plot_sentences))
        sys.stderr.write('\nSearch shot started\n')
        print('\nIdf is', self.idf)
        # 'None' means no sentence matched; -1 is what cosine_similarity
        # returns for a query with no terms. Without the -1 check such a
        # query would index plot_to_shot[-1], i.e. the last plot sentence.
        if max_sim_sentence == 'None' or max_sim_sentence == -1:
            sys.stderr.write('\nMax is none\n')
            return {'shot_timestamps': -1, 'max_sim': -1}
        sys.stderr.write('\nSearch is valid\n')
        shots_list = self.plot_to_shot[max_sim_sentence]
        shot_timestamps = [self.stamp_data['time_stamps'][shot] for shot in shots_list]
        return {'shot_timestamps': shot_timestamps, 'max_sim': max_sim}

if __name__ == '__main__':
    # Command-line entry point: index the test video, then answer one query.
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    video = VideoFile('hdfs://localhost:9000/user/arun/metadata/testvideo.srt', 'hdfs://localhost:9000/user/arun/metadata/plot.txt')
    query = read_line('Enter a search query : ')
    search_results = video.search_shot(query)
    print(search_results)
