# Instructions


Check frames in `data/*vid_name*/` folders match `data/labels.csv` file.

You need 1:1 mapping between frame files and rows of `labels.csv` or the rest of the code in this package won't run. 

A mismatch might be caused by a difference in FPS in labels vs extracted frames or an issue in frame extraction or an issue in your labels file.

This notebook will do the check and help debug a mismatch between frame files and `labels.csv`

Note: Your labels might be out by 1 or 2 frames due to the granularity of your labels vs FPS - the easiest solution is to delete those extra frames (manually or modify this notebook to do it using `os.remove()` to delete those extra frames)...

# Setup

In [224]:
import numpy as np
import cv2
from time import time as timer
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline

In [225]:
# setup paths
pwd = os.getcwd().replace("notebooks","")
path_videos = pwd + 'videos/'
path_data = pwd + 'data/'

In [226]:
# read video paths
paths = os.listdir(path_data)
paths = [path_data + v for v in paths if v != '.DS_Store']

In [227]:
# read frames paths
paths_videos = os.listdir(path_data)
paths_videos = [path_data + v + '/' for v in paths_videos if v != '.DS_Store' and v != 'labels.csv']

# Verify Frames Match Labels.csv

In [209]:
if not os.path.exists(pwd + 'data/labels.csv'):
    print("ERROR: labels.csv missing - please copy labels.csv to /data/labels.csv")
    print()
    print("Sample label file below:")    
    print("""video\t\t\tframe \t\t\t\t label
    20160801_no9_1\t\t20160801_no9_1_00001.jpeg	search
    20160801_no9_1\t\t20160801_no9_1_00002.jpeg	search
    ...""")
    print()
    print("Note you also need a 'split' column that assigns videos to train/valid/test splits - can use /notebooks/helper_add_train_valid_test_splits_to_labels.ipynb to add splits")

In [228]:
# load labels
labels = pd.read_csv(path_data + 'labels.csv')

In [229]:
labels['frame'] = labels['frame'].str.replace('.jpeg','.jpg')

In [230]:
labels.head()

Unnamed: 0,video,frame,label
0,20150807_no8B_1,20150807_no8B_1_000000.jpg,shallow
1,20150807_no8B_1,20150807_no8B_1_000001.jpg,shallow
2,20150807_no8B_1,20150807_no8B_1_000002.jpg,shallow
3,20150807_no8B_1,20150807_no8B_1_000003.jpg,shallow
4,20150807_no8B_1,20150807_no8B_1_000004.jpg,shallow


In [231]:
vids_error = []
vids_ok = []

# for path_video in paths_videos:    
for path_video in paths_videos:

    # get vid name from path
    vid_name = path_video[:-1].split("/")[-1]

    # convert list of frame files to dataframe
    paths_frames = os.listdir(path_video)
    paths_frames = [f for f in paths_frames if f != '.DS_Store']
    paths_frames = pd.DataFrame(paths_frames, columns = ['frame_file'])

    # subset labels to just this vid
    labels_vid = labels[labels['video'] == vid_name]

    # collect errors so can print grouped
    if not len(labels_vid) == len(paths_frames):
        vids_error.append("{} .::. Different number of labels ({}) than frames ({}) ... DIFF: {} ".format(vid_name, len(labels_vid),len(paths_frames), len(labels_vid) - len(paths_frames)))
    else:
        vids_ok.append("{} .::. Same number of labels and frames for vid".format(vid_name))
        
# print errors
print("ERRORS: {} VIDS WITH #LABELS != #FRAMES".format(len(vids_error)))
for msg in vids_error:
    print(msg)

print("\n")
print("OK: {} VIDS WITH #LABELS != #FRAMES".format(len(vids_ok)))
for msg in vids_ok:
    print(msg)

ERRORS: 11 VIDS WITH #LABELS != #FRAMES
20160930_no8_4 .::. Different number of labels (2817) than frames (2816) ... DIFF: 1 
20160929_no9_3 .::. Different number of labels (3543) than frames (3600) ... DIFF: -57 
20160929_no9_2 .::. Different number of labels (563) than frames (602) ... DIFF: -39 
20160819_no9_2 .::. Different number of labels (577) than frames (558) ... DIFF: 19 
20150820_no8B_1 .::. Different number of labels (3699) than frames (3600) ... DIFF: 99 
20150827_no8B_2 .::. Different number of labels (581) than frames (566) ... DIFF: 15 
20160929_no9_4 .::. Different number of labels (3583) than frames (2816) ... DIFF: 767 
20161005_no9_4 .::. Different number of labels (4047) than frames (3600) ... DIFF: 447 
20161005_no9_2 .::. Different number of labels (579) than frames (560) ... DIFF: 19 
20160930_no8_3 .::. Different number of labels (3601) than frames (3600) ... DIFF: 1 
20160930_no8_1 .::. Different number of labels (3531) than frames (3600) ... DIFF: -69 


OK: 

# Find missing frames for a video

In [90]:
# set video you want to visualize with labels - take this from the error output above
vid_name = "20160930_no8_4"

In [234]:
# for path_video in paths_videos:    
for vid_error in vids_error:

    # get vid name from path
    vid_name = vid_error.split(" .::.")[0]
    print(vid_name)

    # convert list of frame files to dataframe
    paths_frames = os.listdir(path_data+vid_name)
    paths_frames = [f for f in paths_frames if f != '.DS_Store']
    paths_frames = pd.DataFrame(paths_frames, columns = ['frame_file'])
    paths_frames.sort_values('frame_file', inplace=True)

    # figure out which frames we're missing
    labels_vid = labels[labels['video'] == vid_name]

    labels_vid = pd.merge(labels_vid,paths_frames,left_on='frame', right_on='frame_file',how='left')

    print(len(labels_vid[labels_vid['frame_file'].isnull()]))
    
#     print(labels_vid[labels_vid['frame_file'].isnull()])

20160930_no8_4
1
20160929_no9_3
0
20160929_no9_2
0
20160819_no9_2
19
20150820_no8B_1
99
20150827_no8B_2
15
20160929_no9_4
767
20161005_no9_4
447
20161005_no9_2
19
20160930_no8_3
1
20160930_no8_1
0


# Plot frames with labels for a video

In [235]:
def plot_frames_with_labels(vid_name, index_to_start, index_to_end):
    """
    Plot frames of video with label from index_to_start to index_to_end
    
    :video_name: name of video (matches folder containing video's frames in /data/)
    :index_to_start: frame to start plotting from
    :index_to_end: frame to end on
    """

    assert (index_to_start<index_to_end), "ERROR: must input index_to_end > index_to_start"

    path_video = path_data + vid_name + '/'

    # convert list of frame files to dataframe
    print(path_video)
    paths_frames = os.listdir(path_video)
    paths_frames = [f for f in paths_frames if f != '.DS_Store']
    paths_frames = pd.DataFrame(paths_frames, columns = ['frame_file'])
    paths_frames.sort_values('frame_file', inplace=True)
    
    assert (index_to_end <= len(paths_frames)), "ERROR: index_to_end greater than number of frames"
    
    # get labels for this vid
    labels_vid = labels[labels['video'] == vid_name]
    
    for i,row in enumerate(labels_vid.values):
        if i >= index_to_start and i <= index_to_end:
            # get dict for df row
            rr = dict(zip(labels_vid.columns, row))

            # print label with plotted image
            print(rr['label'], '\t\t', rr['frame'])
            plt.imshow(Image.open(path_data + rr['video'] + '/' + rr['frame']))
            plt.show()

In [None]:
plot_frames_with_labels("20160930_no8_1",1,100)

# Clean up labels

## remove a video from labels

In [204]:
labels.shape

(88031, 3)

In [361]:
# uncomment and run this cell to remove a video from labels and update labels.csv
print(labels.shape)
video_to_remove_from_labels = '20160929_no9_3'
labels = labels[labels['video'] != video_to_remove_from_labels]
labels.to_csv(path_data + 'labels.csv', index=False)
print(labels.shape)

(74507, 3)
(70964, 3)


In [362]:
# reload labels file
labels = pd.read_csv(path_data + 'labels.csv')
labels.head()
print(labels.shape)

(70964, 3)


## Delete chunk of frames for a video

In [297]:
labels.shape

(83051, 3)

In [304]:
labels['frameid'] = labels['frame'].str.split(".").str.get(0).str.split("_").str.get(-1).astype(int)

In [305]:
labels = labels[~((labels['video'] == '20160819_no9_2') & (labels['frameid'] > 557))]

In [306]:
del labels['frameid']

In [307]:
labels.to_csv(path_data + 'labels.csv', index=False)

In [393]:
labels.shape

(70964, 3)

In [394]:
labels['label'].value_counts()

search        28261
shallow       11675
ascent         9135
descent        8839
subsurface     7225
bottom         5452
breath          377
Name: label, dtype: int64