## Model Results Comparison

Objectives:

    1. Establish clear concepts / methodology for assessment
    2. Assess Vendor Performance using definitions above
    3. Assess Model performance by using same metrics

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import os

In [2]:
# Test Dataset information
videos = glob('/datadrive/test_dataset/**')
VIDEOS_LIST = os.path.abspath('syria_videos.txt')
FRAME_LEVEL_SAVE_FOLDER = os.path.abspath('/datadrive/intermediate/frame_level')
VIDEO_LEVEL_SAVE_FOLDER = '/datadrive/intermediate/video_level'
VIDEO_SIGNATURES_SAVE_FOLDER = '/datadrive/intermediate/video_signatures'

In [3]:
len(videos)

3843

In [4]:
vendorquery_df = pd.read_excel('Benetech_Deduping Report_25Jan2019.xlsx',sheet_name='Queries')

In [5]:
vendorquery_df['filename'] = vendorquery_df['Q ID'].apply(lambda x:x.split('/')[-1])
vendorquery_df.head()

Unnamed: 0,Q ID,filename
0,s3.videocitestest/_5g95MUIieM-7673d38ccc6e268b...,_5g95MUIieM-7673d38ccc6e268bf6d79e61f388d37b.mp4
1,s3.videocitestest/_a8srUndP8Y-48644a14cbb89129...,_a8srUndP8Y-48644a14cbb89129dccdcde0f5715de3.mp4
2,s3.videocitestest/_afTSrDroHs-06d2f1b68aabc513...,_afTSrDroHs-06d2f1b68aabc513424f74184663caa9.mp4
3,s3.videocitestest/_bja8VCDY_w-129f9c4b34cfe622...,_bja8VCDY_w-129f9c4b34cfe6227d4311231e08ee3f.mp4
4,s3.videocitestest/_cmXD8JT2hY-2b0c77a29384fe93...,_cmXD8JT2hY-2b0c77a29384fe9387b37408888ea96e.mp4


In [6]:
vendorquery_df['filename'].value_counts()

358ef46f0c2d4125945bf0d547e7ff0a.mp4                 1
0dbdb8c12bfa4e75880b606f3bb4a43b.webm                1
B68yHSlYjgo-15999884aee1a49a926462368fc798aa.mp4     1
5865784a2a654f36a98bf8a808a16a63.webm                1
74f788b680e24a789e5820e7e5e4b26f.ogv                 1
13146ea8523a4b24b111df7727bc9380.webm                1
b3a4d1f77cda451dabd8d4be294c4958.mp4                 1
X1zpH3AH-Q0-fc0726cb43f191159c53ac96ecd56b8b.mp4     1
RU7-9l71zWs-e2a2099fd673e8874930816fb8fe387f.mp4     1
a3aa0f40d50645b0b7fda69ea05bf172.mp4                 1
6975a5a021794433ac22918eff8458cc.ogv                 1
6bb05f5ac21b44de9566c1bec4e9230e.mp4                 1
69865809e5d34b4dab7d8cb9937b68de.webm                1
9720ef42cece4d818db2f12c01a11718.mp4                 1
d2c49de30ec5495fbc8bd60163f1360c.webm                1
06708214fe254a72aacb2ee15cea4546.mp4                 1
49fbb4a3fc254702974717b464b80173.ogv                 1
d26468ddc9eb4436906535812429566b.mp4                 1
65bdfeb5b0

## Match Query summary

In [8]:
# Vendor Results spreadsheet
vendor_df = pd.read_excel('Benetech_Deduping Report_25Jan2019.xlsx')
# Additional filename fields for both Query / Match Videos
vendor_df['q_filename'] = vendor_df['Q ID'].apply(lambda x:x.split('/')[-1])
vendor_df['m_filename'] = vendor_df['M ID'].apply(lambda x:x.split('/')[-1])
# 
unique_qs = vendor_df['q_filename'].value_counts().index
vendor_df.loc[vendor_df['q_filename'] == unique_qs[0],:]

Unnamed: 0,Q ID,Q URL,Q Duration,Q Width,Q Height,Q FPS,M ID,M URL,M Duration,M Status,q_filename,m_filename
1,s3.videocitestest/QGFBGnpXMqE-07d9e6754f49d3eb...,https://videocitestest.s3.amazonaws.com/QGFBGn...,81,640,360,30.0,s3.videocitestest/6YmKwon4X0c-f518e04bd71707dd...,https://videocitestest.s3.amazonaws.com/6YmKwo...,179,match,QGFBGnpXMqE-07d9e6754f49d3eb87d95767bff7e696.mp4,6YmKwon4X0c-f518e04bd71707ddd985f89f03cc7784.mp4
71,s3.videocitestest/QGFBGnpXMqE-07d9e6754f49d3eb...,https://videocitestest.s3.amazonaws.com/QGFBGn...,81,640,360,30.0,s3.videocitestest/sW02MYKmUaM-f8c6ef6974e0d853...,https://videocitestest.s3.amazonaws.com/sW02MY...,64,match,QGFBGnpXMqE-07d9e6754f49d3eb87d95767bff7e696.mp4,sW02MYKmUaM-f8c6ef6974e0d853df113f45e1242628.mp4
77,s3.videocitestest/QGFBGnpXMqE-07d9e6754f49d3eb...,https://videocitestest.s3.amazonaws.com/QGFBGn...,81,640,360,30.0,s3.videocitestest/018MsV21q00-3419176ec77eea4d...,https://videocitestest.s3.amazonaws.com/018MsV...,81,match,QGFBGnpXMqE-07d9e6754f49d3eb87d95767bff7e696.mp4,018MsV21q00-3419176ec77eea4da5161cf6faa05a3d.mp4
163,s3.videocitestest/QGFBGnpXMqE-07d9e6754f49d3eb...,https://videocitestest.s3.amazonaws.com/QGFBGn...,81,640,360,30.0,s3.videocitestest/4fbf84566d434628bc6954a36e77...,https://videocitestest.s3.amazonaws.com/4fbf84...,79,match,QGFBGnpXMqE-07d9e6754f49d3eb87d95767bff7e696.mp4,4fbf84566d434628bc6954a36e7722d5.webm
169,s3.videocitestest/QGFBGnpXMqE-07d9e6754f49d3eb...,https://videocitestest.s3.amazonaws.com/QGFBGn...,81,640,360,30.0,s3.videocitestest/7faf034173f4451398b05e6f1686...,https://videocitestest.s3.amazonaws.com/7faf03...,83,match,QGFBGnpXMqE-07d9e6754f49d3eb87d95767bff7e696.mp4,7faf034173f4451398b05e6f1686a896.webm
175,s3.videocitestest/QGFBGnpXMqE-07d9e6754f49d3eb...,https://videocitestest.s3.amazonaws.com/QGFBGn...,81,640,360,30.0,s3.videocitestest/4214d032b0924870a216958e84a7...,https://videocitestest.s3.amazonaws.com/4214d0...,86,match,QGFBGnpXMqE-07d9e6754f49d3eb87d95767bff7e696.mp4,4214d032b0924870a216958e84a7544a.ogv


In [26]:
vendor_df.shape

(303, 14)

In [9]:
vendor_df['q_filename'].value_counts()[:20]

QGFBGnpXMqE-07d9e6754f49d3eb87d95767bff7e696.mp4     6
550c6ee08ac64e31849862c60b810cb9.ogv                 5
4pcIA0Vx_Ok-a6ff21a834134a75701b950030cca07c.mp4     4
a11fa902d2df4e7fb623d8f15dbbc907.webm                4
4ac1ad9c38934eaab7fa0146141c9354.mp4                 4
9b223626726c4e8db8d9fe683cb64c20.webm                4
743d8aef543e49f0901853f8ec239978.webm                3
jU1MgcbSpwA-6a91a8bcbb42936aa2585d0f893fe0a3.mp4     3
20ef78f27905452db356123f68a38382.ogv                 3
04da0835050b472689abff6fbb47515c.ogv                 3
8885400fcf264e67babb46fbe1b8d290.mp4                 3
ba2dac8a2a4b44659c53671358b6980a.ogv                 3
GyaFEZujpbY-e2e5d448e968fd988bf891592ef744af.mp4     3
d037ae1a2c8a43938139252e660dec04.webm                3
7157713dfa9e45ea8e5a02d7eedc9bc8.mp4                 2
7472a034c4954c289f9e3064b0bbbbce.ogv                 2
412af188b7c64b43a51b870134a53b7b.webm                2
GhmiBQWoT0Y-9ba30d949fae62dfc38da4dffe6563a3.webm    2
c5bcecc23e

Assumption: Each query on the Queries sheet, was performed against the remaining dataset. This explains the reason why there are multiple queries for a given filename.

In [10]:
augmentation_manifesto = pd.read_csv('/home/felipeb/augmenter/augmented_dataset_manifest')

In [11]:
augmentation_manifesto['original_basename'] = augmentation_manifesto['file_path'].apply(lambda x:os.path.basename(x))
test_dataset = [os.path.basename(x) for x in videos]
used = augmentation_manifesto.loc[augmentation_manifesto['new_filename'].isin(test_dataset),:]
used['isAugmented'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [12]:
augmented_files = used['new_filename'].values

In [13]:
vendor_df['augmented_q'] = False
vendor_df['augmented_m'] = False

vendor_df.loc[vendor_df['q_filename'].isin(augmented_files),'augmented_q'] = True
vendor_df.loc[vendor_df['m_filename'].isin(augmented_files),'augmented_m'] = True

In order to assess performance, we'll we'll restrain the problem to the files that we actually have the answer (augmented files). Even though there might be additional files for which the vendor has reported the right answer but the only way to verify it is to perform manual validation.

In [14]:
vendorquery_df.head()

Unnamed: 0,Q ID,filename
0,s3.videocitestest/_5g95MUIieM-7673d38ccc6e268b...,_5g95MUIieM-7673d38ccc6e268bf6d79e61f388d37b.mp4
1,s3.videocitestest/_a8srUndP8Y-48644a14cbb89129...,_a8srUndP8Y-48644a14cbb89129dccdcde0f5715de3.mp4
2,s3.videocitestest/_afTSrDroHs-06d2f1b68aabc513...,_afTSrDroHs-06d2f1b68aabc513424f74184663caa9.mp4
3,s3.videocitestest/_bja8VCDY_w-129f9c4b34cfe622...,_bja8VCDY_w-129f9c4b34cfe6227d4311231e08ee3f.mp4
4,s3.videocitestest/_cmXD8JT2hY-2b0c77a29384fe93...,_cmXD8JT2hY-2b0c77a29384fe9387b37408888ea96e.mp4


In [15]:
m_q = dict(zip(vendor_df['m_filename'],vendor_df['q_filename']))

In [16]:
def matching(x,d):
    try:
        return d[x]
    except:
        return x

In [17]:
vendorquery_df['match_filename'] = vendorquery_df['filename'].apply(lambda x:matching(x,m_q))

In [18]:
vendorquery_df['match_filename'].nunique()

2356

In [19]:
vendorquery_df['isUnique'] = vendorquery_df.apply(lambda x: x['filename'] == x['match_filename'],axis=1)

In [20]:
vendorquery_df.isUnique.value_counts()

True     2246
False     254
Name: isUnique, dtype: int64

Unless there are two files from the same source, or the source itself, we cannot make any assumptions about a file being unique. Now we need to implement a few helper functions to evaluate the results in a way to account for these details.

In [21]:
augmentation_manifesto.head()

Unnamed: 0.1,Unnamed: 0,banner,bitrate,color_intensity,crop,extension,file_path,intensity,logo,offset,shaky,shaky_after_logo,speed_change,filename,original_extension,new_filename,original_basename
0,0,,32k,0.42,,,/datadrive/videos/BVx0nR366sk-3b0933b1902d9a66...,-0.09,,,False,True,-0.09,BVx0nR366sk-3b0933b1902d9a66730d447258c425ec,webm,16560611413745a1b33748cad5cdb726.webm,BVx0nR366sk-3b0933b1902d9a66730d447258c425ec.webm
1,1,,64k,0.24,,,/datadrive/videos/6Y1uTg7wCV8-b068dba553b7529e...,-0.05,logos/jazeera.png,6.0,False,True,,6Y1uTg7wCV8-b068dba553b7529eb2c2c4727c47fed9,webm,4fe31156621142de9be93e3c6a4a8bd9.webm,6Y1uTg7wCV8-b068dba553b7529eb2c2c4727c47fed9.webm
2,2,False,,,,,/datadrive/videos/VuW-M9TB7tM-73472a9187f950d4...,-0.12,logos/fox.png,2.0,False,False,,VuW-M9TB7tM-73472a9187f950d4812582d25154f1d4,mp4,49f397b693eb41a9b7c8a95b0147c6a0.mp4,VuW-M9TB7tM-73472a9187f950d4812582d25154f1d4.mp4
3,3,,,,[0.11 0.15],,/datadrive/videos/gMz2bJykKaY-31b60bdf2a061902...,,,3.0,False,True,0.34,gMz2bJykKaY-31b60bdf2a061902a1c004eab2ced596,webm,e366f4367c2148c3bb4e88a6bf3b0835.webm,gMz2bJykKaY-31b60bdf2a061902a1c004eab2ced596.webm
4,4,,,,[0.04 0.03],,/datadrive/videos/6QWIAsPOlxM-907ba6ac2932e487...,-0.19,logos/sbt.gif,3.0,False,False,0.31,6QWIAsPOlxM-907ba6ac2932e487c215c9b364d06e14,webm,7611fe4baab94392a82f5a8a28bdeba2.webm,6QWIAsPOlxM-907ba6ac2932e487c215c9b364d06e14.webm


In [22]:
# Match Augmented to their counterparts
a_o = dict(zip(augmentation_manifesto['new_filename'],augmentation_manifesto['original_basename']))
vendorquery_df['original_basename_q'] = vendorquery_df['filename'].apply(lambda x:matching(x,a_o))
vendorquery_df['original_basename_m'] = vendorquery_df['match_filename'].apply(lambda x:matching(x,a_o))

In [23]:
per_a = vendorquery_df['original_basename_q'].value_counts()
msk = per_a[per_a > 1].index.values

In [24]:
restricted = vendorquery_df.loc[vendorquery_df['original_basename_q'].isin(msk),:]
# restricted = restricted.loc[restricted.isUnique == False,:]
restricted.shape

(1585, 6)

In [25]:
sum(restricted['original_basename_m'] == restricted['original_basename_q']) / restricted.shape[0]

0.9570977917981073

In [26]:
sum(vendorquery_df['original_basename_m'] == vendorquery_df['original_basename_q'])

2387

## Analysing Results from our model

In [27]:
from winnow.feature_extraction import SimilarityModel


sm = SimilarityModel()

video_signatures = sm.predict(VIDEO_LEVEL_SAVE_FOLDER)
# Save Signatires
video_signatures.shape

(3843, 4096)


(3843, 500)

In [28]:
video_signatures = np.nan_to_num(video_signatures)

In [29]:
filenames = [x.split('_vgg')[0].split('/')[-1] for x in  sm.index]

In [30]:
in_q = vendorquery_df['filename'].apply(lambda x:x.split('.')[0])

In [31]:
video_space = video_signatures[np.isin(filenames,in_q)]
labels = np.array(filenames)[np.isin(filenames,in_q)]

In [32]:
from sklearn.cluster import DBSCAN

In [33]:
clusters = DBSCAN(metric='cosine',eps=0.1,min_samples=1).fit_predict(video_space)

In [34]:
model_results = pd.DataFrame({'filename':labels,'cluster':clusters})

In [35]:
filename_original = a_o = dict(zip(augmentation_manifesto['new_filename'].apply(lambda x:x.split('.')[0]),augmentation_manifesto['original_basename']))

In [36]:
model_results['original_basename_q'] = model_results['filename'].apply(lambda x:matching(x,filename_original))
model_results['original_basename_q'] = model_results['original_basename_q'].apply(lambda x:x.split('.')[0])

In [37]:
model_results ['original_basename_m'] = 'NA'

In [38]:
for c in model_results.cluster.unique():
    
    msk = model_results.cluster == c
    prevailing = model_results.loc[msk,:].original_basename_q.value_counts().index.values[0]
    model_results.loc[msk,'original_basename_m'] = prevailing

    
    
    
    

In [39]:
sum(model_results['original_basename_m'] == model_results['original_basename_q'])

2486

In [44]:
model_results['filename'].nunique()

2500

In [47]:
model_results['isUnique'] = model_results['filename'] == model_results['original_basename_q']

In [50]:
model_results.isUnique.value_counts()

False    1551
True      949
Name: isUnique, dtype: int64

In [56]:
per_a = model_results['original_basename_q'].value_counts()
msk = per_a[per_a > 1].index.values
restricted = model_results.loc[restricted['original_basename_q'].isin(msk),:]

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match

In [54]:
msk

array(['7LYTx0IJtPY-983ff03d543f14a7fdf9f8e84e55f6b6',
       'cW51HiyRXTk-102421b7f08102d49fa0d2ba8e792151',
       '_IeeP6GgqAA-6d13b4ca9335abe139f9041d41ee24a3',
       'vjSxK0g8E_w-3dbf4e5565cb6c549ce8707105cfecc3',
       'xAIq2HtX_lk-7ddb270bece191e9e4dcab56fa46391e',
       'epjbvDTIdF0-0eca92847d6dbf683f9afea77a544124',
       'fOqdfZdmNEQ-4060ff05b68d5c33f74d426cb619c905',
       'E6B3Hwsd5mM-f743130db139c5722a3a610dccf88b7d',
       '8Rvu-jlsaOU-5b988b26edd0096ea42d3b804621d297',
       'D3FpFnEvBTk-fe5d058c03547270eb52a229497b5256',
       'AA82DdkVh5s-db5b2e3d870f1005da5e88c3e6b256f4',
       '94g4ysN4qVs-657067b21447935a88b4da5a1a74eeac',
       'FHF8jy9U694-416457c565713a90334d032a75d3ff93',
       'jr6oGoqDEfI-8842ae65a03a5bdb8c23e78509adb4b1',
       'fJa7D3Uzaas-7f59def97c7a906c066e5dec84bed753',
       'VaO-OpFPg2A-0eee60c766905dd04230864eee939e21',
       'LFnMl6Lj3co-11cf15e9ee5ae44575f2743de9812ee5',
       'N1_SdQE1XME-09e54af56f7e9e22f4653052ebbb8054',
       '6o