# Instructions


Check that the frames in the `data/*vid_name*/` folders match the rows of the `data/labels.csv` file.

You need a 1:1 mapping between frame files and rows of `labels.csv`, or the rest of the code in this package won't run.

A mismatch might be caused by a difference in FPS in labels vs extracted frames, an issue in frame extraction or an issue in your labels file.

This notebook will do the check and help debug a mismatch between frame files and `labels.csv`

Note: Your labels might be out by 1 or 2 frames due to the granularity of your labels vs FPS - the easiest solution is to delete those extra frames manually

# Setup

In [2]:
import numpy as np
import cv2
from time import time as timer
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline

In [3]:
# setup paths
# The project root is the working directory, or its parent when the notebook is
# run from inside the "notebooks" folder. Only a trailing "notebooks" component
# is stripped, so a project path that merely contains the word is left intact.
cwd = os.getcwd()
if os.path.basename(cwd) == "notebooks":
    cwd = os.path.dirname(cwd)

# os.path.join(..., "") guarantees a trailing separator, which the string
# concatenations below (and in later cells) rely on
pwd = os.path.join(cwd, "")
path_videos = pwd + 'videos/'
path_data = pwd + 'data/'

In [4]:
# read frames paths
# Each video has its own folder of frames inside the data directory. Keep only
# directories: plain files (labels.csv, .DS_Store, any other stray file) are
# not video folders and would make os.listdir() fail in the checks below.
paths_videos = sorted(
    path_data + v + '/'
    for v in os.listdir(path_data)
    if v not in ('.DS_Store', 'labels.csv') and os.path.isdir(path_data + v)
)

# Check we have a 1:1 mapping between labels and frames

In [5]:
# Warn, and show an example of the expected layout, when the labels file is absent
labels_csv_present = os.path.exists(pwd + 'data/labels.csv')
if not labels_csv_present:
    sample_labels = """video\t\t\tframe \t\t\t\t label
    20160801_no9_1\t\t20160801_no9_1_00001.jpeg	search
    20160801_no9_1\t\t20160801_no9_1_00002.jpeg	search
    ..."""
    # one print per line; empty strings produce the blank separator lines
    for line in (
        "ERROR: labels.csv missing - please copy labels.csv to /data/labels.csv",
        "",
        "Sample label file below:",
        sample_labels,
        "",
        "Note you also need a 'split' column that assigns videos to train/valid/test splits - can use /notebooks/helper_add_train_valid_test_splits_to_labels.ipynb to add splits",
    ):
        print(line)

In [6]:
# read the labels table that sits alongside the per-video frame folders
path_labels = path_data + 'labels.csv'
labels = pd.read_csv(path_labels)

In [7]:
# Rewrite '.jpeg' as '.jpg' in the labelled frame names.
# regex=False treats '.jpeg' as literal text. Older pandas versions default to
# regex matching, where the unescaped '.' would match any character, so the
# result would otherwise depend on the installed pandas version.
labels['frame'] = labels['frame'].str.replace('.jpeg', '.jpg', regex=False)

In [8]:
# preview the first five label rows
labels.head(n=5)

Unnamed: 0,video,frame,label,split
0,s1-218,s1-218-00001.jpg,noseal,valid
1,s1-218,s1-218-00002.jpg,noseal,valid
2,s1-218,s1-218-00003.jpg,noseal,valid
3,s1-218,s1-218-00004.jpg,noseal,valid
4,s1-218,s1-218-00005.jpg,noseal,valid


In [10]:
# per-video result messages, collected so they can be printed grouped by outcome
vids_error, vids_ok = [], []

for path_video in paths_videos:

    # the video name is the last path component (each path ends with a trailing "/")
    vid_name = path_video[:-1].rsplit("/", 1)[-1]

    # frame files found on disk for this video, as a one-column dataframe
    paths_frames = pd.DataFrame(
        [f for f in os.listdir(path_video) if f != '.DS_Store'],
        columns = ['frame_file'],
    )

    # rows of the labels table that belong to this video
    labels_vid = labels[labels['video'] == vid_name]

    # compare the two counts and file the video under the matching outcome
    n_labels, n_frames = len(labels_vid), len(paths_frames)
    if n_labels == n_frames:
        vids_ok.append("{} .::. Same number of labels and frames for vid".format(vid_name))
    else:
        vids_error.append("{} .::. Different number of labels ({}) than frames ({}) ... DIFF: {} ".format(vid_name, n_labels, n_frames, n_labels - n_frames))

# mismatching videos first ...
print("ERRORS: {} VIDS WITH #LABELS != #FRAMES".format(len(vids_error)))
if vids_error:
    print("\n".join(vids_error))

# ... then the videos whose counts agree
print("\n")
print("OK: {} VIDS WITH #LABELS == #FRAMES".format(len(vids_ok)))
if vids_ok:
    print("\n".join(vids_ok))

ERRORS: 0 VIDS WITH #LABELS != #FRAMES


OK: 46 VIDS WITH #LABELS == #FRAMES
s1-218 .::. Same number of labels and frames for vid
s10-6558 .::. Same number of labels and frames for vid
s11-7363 .::. Same number of labels and frames for vid
s12-3465 .::. Same number of labels and frames for vid
s13-14 .::. Same number of labels and frames for vid
s14-1705 .::. Same number of labels and frames for vid
s15-2589 .::. Same number of labels and frames for vid
s16-0 .::. Same number of labels and frames for vid
s17-2973 .::. Same number of labels and frames for vid
s18-630 .::. Same number of labels and frames for vid
s19-672 .::. Same number of labels and frames for vid
s2-1133 .::. Same number of labels and frames for vid
s20-842 .::. Same number of labels and frames for vid
s21-919 .::. Same number of labels and frames for vid
s22-3733 .::. Same number of labels and frames for vid
s23-4847 .::. Same number of labels and frames for vid
s24-5851 .::. Same number of labels and frames for vid


In [11]:
# the labels table and the data directory must contain the same number of videos
n_label_videos = labels['video'].nunique()
n_frame_folders = len(paths_videos)
assert n_label_videos == n_frame_folders, "Different number of videos in labels file than /data/"

# Find missing frames for a video

> For each video flagged in the check above, this cell reports how many rows in `labels.csv` have no matching frame file on disk

In [12]:
# Debug every video flagged in `vids_error`, reporting mismatches in both
# directions: label rows with no frame file, and frame files with no label row.
max_examples = 10  # cap on the example names printed per video

for vid_error in vids_error:

    # each error message starts with "<vid_name> .::.", so recover the name from it
    vid_name = vid_error.split(" .::.")[0]
    print(vid_name)

    # frame files on disk for this video, as a sorted one-column dataframe
    frame_files = sorted(f for f in os.listdir(path_data + vid_name) if f != '.DS_Store')
    paths_frames = pd.DataFrame(frame_files, columns = ['frame_file'])

    # left-join the labels onto the frame files: label rows whose frame_file is
    # null after the merge refer to a frame that does not exist on disk
    labels_vid = labels[labels['video'] == vid_name]
    labels_vid = pd.merge(labels_vid, paths_frames, left_on='frame', right_on='frame_file', how='left')
    labels_without_frame = labels_vid.loc[labels_vid['frame_file'].isnull(), 'frame']

    # reverse direction: frame files on disk that no label row refers to
    # (the left join above cannot see these)
    has_label = paths_frames['frame_file'].isin(labels_vid['frame'])
    frames_without_label = paths_frames.loc[~has_label, 'frame_file']

    print("labels without a frame file: {}".format(len(labels_without_frame)))
    if len(labels_without_frame) > 0:
        print("  e.g. {}".format(labels_without_frame.head(max_examples).tolist()))

    print("frame files without a label: {}".format(len(frames_without_label)))
    if len(frames_without_label) > 0:
        print("  e.g. {}".format(frames_without_label.head(max_examples).tolist()))