In [2]:
import faiss
import numpy as np
import pandas as pd
import scipy
from scipy.spatial.distance import cosine
import random
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, zero_one_loss
%matplotlib inline

In [3]:
from util import *

In [4]:
# Common path prefix shared by the two pre-computed feature CSVs.
DATASET = 'data/LOCALHIST_CORRUPT'
# Table loaded into `train_df` below (one row per video frame).
TRAIN_CSV = f'{DATASET}_df.csv'
# Table loaded into `subclips_df` below (one row per sub-clip frame).
SUBCLIPS_CSV = f'{DATASET}_subclips.csv'

In [5]:
# Load the training table (video_path, frame_time and the x_* feature
# columns) and preview its first rows.
train_df = pd.read_csv(TRAIN_CSV)
train_df.head(n=5)

Unnamed: 0,video_path,frame_time,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,...,x_890,x_891,x_892,x_893,x_894,x_895,x_896,x_897,x_898,x_899
0,../data/1943 - Victory Through Air Power.avi,0.033367,0.0,0.0,0.0,0.0,0.0,0.140162,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,../data/1943 - Victory Through Air Power.avi,2.035369,0.000108,0.0,0.0,0.0,4.4e-05,0.007993,0.0,0.000355,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,../data/1943 - Victory Through Air Power.avi,2.068735,5.8e-05,0.0,0.0,0.000105,3.6e-05,0.009339,8e-06,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,../data/1943 - Victory Through Air Power.avi,2.102102,1.7e-05,0.0,0.0,0.000147,0.000103,0.008468,1.9e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,../data/1943 - Victory Through Air Power.avi,2.135469,6e-06,0.0,0.0,0.000203,5e-05,0.008313,1.1e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Load the sub-clip table; besides the x_* features each row carries the
# clip it belongs to (clip_path) and the video it was cut from (video_path).
subclips_df = pd.read_csv(SUBCLIPS_CSV)
subclips_df.head(n=5)

Unnamed: 0,clip_path,frame_time,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,...,x_891,x_892,x_893,x_894,x_895,x_896,x_897,x_898,x_899,video_path
0,./tmp/1943 - Victory Through Air Power.avi_sub...,0.0,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/1943 - Victory Through Air Power.avi
1,./tmp/1943 - Victory Through Air Power.avi_sub...,7.173841,0.00038,0.001302,0.000158,0.000158,0.0,0.0,0.000278,0.001946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/1943 - Victory Through Air Power.avi
2,./tmp/1943 - Victory Through Air Power.avi_sub...,7.207207,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/1943 - Victory Through Air Power.avi
3,./tmp/1943 - Victory Through Air Power.avi_sub...,7.807808,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/1943 - Victory Through Air Power.avi
4,./tmp/1943 - Victory Through Air Power.avi_sub...,7.874541,3e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,../data/1943 - Victory Through Air Power.avi


In [7]:
# How many sub-clip rows were extracted from each source video.
subclips_df['video_path'].value_counts()

../data/1948 - Melody Time.avi                               244
../data/1943 - Victory Through Air Power.avi                 184
../data/1945 - The Three Caballeros.avi                      135
../data/1940 - Pinocchio.avi                                  50
../data/1948 - So Dear to My Heart.mp4                        46
../data/1947 - Fun and Fancy Free.avi                         43
../data/1937 - Snow White and the Seven Dwarves.avi           41
../data/1949 - The Adventures Of Ichabod And Mr. Toad.m4v     37
../data/The.Young.Pope.S01E02.HDTVRip.Jaskier.avi             24
../data/1941 - Dumbo.avi                                      22
../data/The.Young.Pope.S01E01.HDTVRip.Jaskier.avi             22
../data/1942 - Bambi.avi                                      19
../data/The.Young.Pope.S01E04.HDTVRip.Jaskier.avi             19
../data/1946 - Make Mine Music.avi                            18
../data/The.Young.Pope.S01E03.HDTVRip.Jaskier.avi             18
../data/1928 - Mickey Mou

In [8]:
# Split the training table into timestamps, class labels and features.
frame_times = train_df['frame_time']
labels = train_df['video_path'].values
# Everything except the two metadata columns is treated as a feature.  The
# matrix is made C-contiguous float32 in one step (presumably the layout
# the FAISS-backed classifier needs -- confirm in util).
frame_vectors = np.ascontiguousarray(
    train_df.drop(['frame_time', 'video_path'], axis='columns').values,
    dtype='float32',
)
frame_vectors

array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.0824373e-04, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [5.8285088e-05, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [5.7452440e-04, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [0.0000000e+00, 8.3264413e-06, 8.3264413e-06, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.9428362e-04, 0.0000000e+00, 5.8285088e-05, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]], dtype=float32)

In [9]:
# Build the classifier from every training frame and its video label.
# NOTE(review): judging by the dists/votes printed a few cells below, a
# frame scoring under `treshold` appears to vote 'miss' -- confirm against
# FaissVideoClassifier's implementation.
clf = FaissVideoClassifier(
    frame_vectors,
    labels,
    treshold=0.7,  # keyword spelled as in the original call
)

In [10]:
# Classify every sub-clip and collect ground truth vs. prediction.
#
# Rows are grouped by clip in a single pass instead of re-filtering the
# whole frame with a boolean mask once per clip.  sort=False keeps clips in
# order of first appearance, i.e. the order `clip_path.unique()` yields.
unique_subclips = list(subclips_df.clip_path.unique())  # kept in the namespace for other cells
y_true = []
y_pred = []
for clip, subclip_df in subclips_df.groupby('clip_path', sort=False):
    # Only the feature columns are passed to the classifier.
    subclip_vectors = subclip_df.drop(['frame_time', 'video_path', 'clip_path'], axis=1).values
    # The label is read from the clip's first row; reading it from the
    # column avoids building a full row Series.
    true_label = subclip_df.video_path.iloc[0]
    y_true.append(true_label)
    # votes / dists / indices keep the values of the last clip after the
    # loop ends; the following cell prints them.
    predicted_label, votes, dists, indices = clf.classify(subclip_vectors)
    y_pred.append(predicted_label)

In [11]:
# Debug view: `dists` and `votes` are whatever the evaluation loop assigned
# on its final iteration, so this shows the last sub-clip only
# (presumably one entry per frame of that clip -- confirm in util).
print(dists, votes)

[0.9341675  0.62598324 0.76554847] ['../data/The.Young.Pope.S01E01.HDTVRip.Jaskier.avi' 'miss'
 '../data/The.Young.Pope.S01E04.HDTVRip.Jaskier.avi']


In [12]:
# Per-video precision / recall / F1 over all evaluated sub-clips.
print(classification_report(y_true=y_true, y_pred=y_pred))

                                                           precision    recall  f1-score   support

      ../data/1928 - Mickey Mouse -  Steamboat Willie.mp4       1.00      0.20      0.33        10
            ../data/1929 - Mickey Mouse - Plane Crazy.avi       1.00      1.00      1.00        10
      ../data/1937 - Snow White and the Seven Dwarves.avi       1.00      0.70      0.82        10
                              ../data/1940 - Fantasia.avi       1.00      0.67      0.80         3
                             ../data/1940 - Pinocchio.avi       0.71      1.00      0.83        10
                                 ../data/1941 - Dumbo.avi       1.00      0.90      0.95        10
                  ../data/1941 - The Reluctant Dragon.avi       0.67      0.50      0.57         8
                                 ../data/1942 - Bambi.avi       0.89      0.89      0.89         9
             ../data/1943 - Victory Through Air Power.avi       1.00      0.80      0.89        10
         

In [13]:
# Overall share of correctly / incorrectly classified sub-clips.
for metric_name, metric_fn in (('Accuracy', accuracy_score), ('Error rate', zero_one_loss)):
    print(metric_name, metric_fn(y_true, y_pred))

Accuracy 0.7593582887700535
Error rate 0.2406417112299465


In [14]:
# Removing ten randomly chosen videos from the training set, so that we can see how the algorithm behaves on out-of-sample data

In [15]:
# Hold out a random subset of videos so that their sub-clips become
# out-of-sample queries for the classifier.
N_HELD_OUT = 10    # number of videos removed from the training set
HELD_OUT_SEED = 0  # fixed seed: the same videos are held out on every run

# A private Random instance makes the sample reproducible without
# touching the global `random` state used elsewhere.
to_remove = random.Random(HELD_OUT_SEED).sample(list(train_df.video_path.unique()), N_HELD_OUT)
cut_df = train_df[~train_df.video_path.isin(to_remove)]

frame_times = cut_df.frame_time
labels = cut_df.video_path.values
# Same C-contiguous float32 conversion as used for the full training set,
# so both classifiers are built from identically prepared matrices.
frame_vectors = np.ascontiguousarray(cut_df.drop(['frame_time', 'video_path'], axis=1).values).astype('float32')

In [16]:
# Rebuild the classifier from the reduced training set (held-out videos
# removed), with the same threshold as the first classifier.
clf = FaissVideoClassifier(
    frame_vectors,
    labels,
    treshold=0.7,  # keyword spelled as in the original call
)

In [17]:
# Re-run the sub-clip evaluation against the reduced training set.  Clips
# cut from a held-out video have the expected label 'miss'.
targets = cut_df.video_path.unique()
known_videos = set(targets)  # constant-time membership test per clip
unique_subclips = list(subclips_df.clip_path.unique())  # kept in the namespace for other cells
y_true = []
y_pred = []
# One grouped pass instead of re-filtering the whole frame per clip;
# sort=False keeps clips in order of first appearance, i.e. the order
# `clip_path.unique()` yields.
for clip, subclip_df in subclips_df.groupby('clip_path', sort=False):
    # Only the feature columns are passed to the classifier.
    subclip_vectors = subclip_df.drop(['frame_time', 'video_path', 'clip_path'], axis=1).values
    # The source video is read from the clip's first row.
    source_video = subclip_df.video_path.iloc[0]
    true_label = source_video if source_video in known_videos else 'miss'
    y_true.append(true_label)
    predicted_label, votes, dists, indices = clf.classify(subclip_vectors)
    y_pred.append(predicted_label)
print(classification_report(y_true, y_pred))
print('Accuracy', accuracy_score(y_true, y_pred))
print('Error rate', zero_one_loss(y_true, y_pred))

                                                           precision    recall  f1-score   support

      ../data/1928 - Mickey Mouse -  Steamboat Willie.mp4       1.00      0.40      0.57        10
                              ../data/1940 - Fantasia.avi       0.38      1.00      0.55         3
                                 ../data/1942 - Bambi.avi       0.58      0.78      0.67         9
                  ../data/1945 - The Three Caballeros.avi       0.83      1.00      0.91        10
                       ../data/1946 - Make Mine Music.avi       0.90      0.90      0.90        10
                    ../data/1947 - Fun and Fancy Free.avi       0.37      1.00      0.54        10
../data/1949 - The Adventures Of Ichabod And Mr. Toad.m4v       0.69      0.90      0.78        10
        ../data/The.Young.Pope.S01E02.HDTVRip.Jaskier.avi       0.44      0.70      0.54        10
        ../data/The.Young.Pope.S01E03.HDTVRip.Jaskier.avi       0.56      1.00      0.72         9
        .