# Instructions


Check frames in `data/*vid_name*/` folders match `data/labels.csv` file.

You need a 1:1 mapping between frame files and rows of `labels.csv`, or the rest of the code in this package won't run.

A mismatch might be caused by a difference in FPS between the labels and the extracted frames, an issue in frame extraction, or an issue in your labels file.

This notebook will do the check and help debug a mismatch between frame files and `labels.csv`

Note: Your labels might be off by 1 or 2 frames due to the granularity of your labels vs. the FPS. The easiest solution is to delete the extra frames (manually, or by modifying this notebook to remove them with `os.remove()`).

# Setup

In [139]:
import numpy as np
import cv2
from time import time as timer
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline

In [145]:
# setup paths
# Resolve the project root: when the notebook is run from the `notebooks/` folder the
# root is its parent directory, otherwise the current working directory is used as-is.
# (A plain str.replace("notebooks", "") would also mangle any other path component that
# contains "notebooks", and would leave the root without a trailing slash when the
# notebook is run from the project root itself.)
_cwd = os.path.normpath(os.getcwd())
if os.path.basename(_cwd) == "notebooks":
    _cwd = os.path.dirname(_cwd)

# the rest of the notebook builds paths by string concatenation, so `pwd` must end with '/'
pwd = _cwd if _cwd.endswith('/') else _cwd + '/'
path_videos = pwd + 'videos/'
path_data = pwd + 'data/'

In [146]:
# read frames paths
# Each video's frames live in their own sub-folder of `path_data`. Keep only visible
# directories so stray files (labels.csv, .DS_Store, exported csvs, ...) and hidden
# folders (e.g. .ipynb_checkpoints) are never mistaken for a video folder, and sort so
# the videos are always processed in the same order.
paths_videos = sorted(
    path_data + v + '/'
    for v in os.listdir(path_data)
    if not v.startswith('.') and os.path.isdir(path_data + v)
)

# Verify Frames Match Labels.csv

In [147]:
# Warn (without raising) when labels.csv is absent and show the layout the file should have.
if not os.path.exists(pwd + 'data/labels.csv'):
    sample_label_file = """video\t\t\tframe \t\t\t\t label
    20160801_no9_1\t\t20160801_no9_1_00001.jpeg	search
    20160801_no9_1\t\t20160801_no9_1_00002.jpeg	search
    ..."""
    # one print call; the empty strings produce the blank separator lines
    print(
        "ERROR: labels.csv missing - please copy labels.csv to /data/labels.csv",
        "",
        "Sample label file below:",
        sample_label_file,
        "",
        "Note you also need a 'split' column that assigns videos to train/valid/test splits - can use /notebooks/helper_add_train_valid_test_splits_to_labels.ipynb to add splits",
        sep="\n",
    )

In [148]:
# read the labels table stored alongside the per-video frame folders
labels = pd.read_csv(os.path.join(path_data, 'labels.csv'))

In [149]:
# Rewrite the '.jpeg' extension in the `frame` column to '.jpg'.
# regex=False makes the pattern a literal string: with regex matching (the default in
# pandas < 2.0) the '.' in '.jpeg' would match ANY character followed by "jpeg".
labels['frame'] = labels['frame'].str.replace('.jpeg', '.jpg', regex=False)

In [150]:
# preview the first rows of the labels table
labels.head()

Unnamed: 0,video,frame,label
0,20150807_no8B_1,20150807_no8B_1_000000.jpg,shallow
1,20150807_no8B_1,20150807_no8B_1_000001.jpg,shallow
2,20150807_no8B_1,20150807_no8B_1_000002.jpg,shallow
3,20150807_no8B_1,20150807_no8B_1_000003.jpg,shallow
4,20150807_no8B_1,20150807_no8B_1_000004.jpg,shallow


In [151]:
# Compare, for every video folder, the number of frame files on disk with the number
# of rows that video has in `labels`. Messages are collected so they can be printed grouped.
vids_error = []
vids_ok = []

for path_video in paths_videos:

    # get vid name from path (paths end with '/', so drop it before taking the last component)
    vid_name = path_video[:-1].split("/")[-1]

    # convert list of frame files to dataframe
    paths_frames = [f for f in os.listdir(path_video) if f != '.DS_Store']
    paths_frames = pd.DataFrame(paths_frames, columns = ['frame_file'])

    # subset labels to just this vid
    labels_vid = labels[labels['video'] == vid_name]

    # NOTE: the " .::." separator is parsed by the "Find missing frames" cell to recover
    # the video name from the message, so keep the message format stable
    n_labels, n_frames = len(labels_vid), len(paths_frames)
    if n_labels != n_frames:
        vids_error.append("{} .::. Different number of labels ({}) than frames ({}) ... DIFF: {} ".format(vid_name, n_labels, n_frames, n_labels - n_frames))
    else:
        vids_ok.append("{} .::. Same number of labels and frames for vid".format(vid_name))

# print errors
print("ERRORS: {} VIDS WITH #LABELS != #FRAMES".format(len(vids_error)))
for msg in vids_error:
    print(msg)

print("\n")
# these are the videos whose counts MATCH, hence '=='
print("OK: {} VIDS WITH #LABELS == #FRAMES".format(len(vids_ok)))
for msg in vids_ok:
    print(msg)

ERRORS: 23 VIDS WITH #LABELS != #FRAMES
20150807_no8B_1 .::. Different number of labels (3139) than frames (101) ... DIFF: 3038 
20150820_no8B_2 .::. Different number of labels (570) than frames (101) ... DIFF: 469 
20150820_no8B_3 .::. Different number of labels (3407) than frames (101) ... DIFF: 3306 
20150820_no9W_3 .::. Different number of labels (2306) than frames (101) ... DIFF: 2205 
20160801_no9_1 .::. Different number of labels (3600) than frames (101) ... DIFF: 3499 
20160801_no9_2 .::. Different number of labels (3600) than frames (101) ... DIFF: 3499 
20160802_no8_1 .::. Different number of labels (3600) than frames (101) ... DIFF: 3499 
20160802_no8_3 .::. Different number of labels (2703) than frames (101) ... DIFF: 2602 
20160812_no9_1 .::. Different number of labels (3600) than frames (101) ... DIFF: 3499 
20160812_no9_2 .::. Different number of labels (563) than frames (101) ... DIFF: 462 
20160812_no9_3 .::. Different number of labels (3600) than frames (101) ... DIFF

# Find missing frames for a video

In [152]:
# For every video flagged in `vids_error`, print its name followed by the number of
# label rows whose frame file is not present in the video's folder.
for vid_error in vids_error:

    # the video name is the part of the error message before the " .::." separator
    vid_name = vid_error.split(" .::.")[0]
    print(vid_name)

    # sorted frame files found on disk for this video, as a one-column dataframe
    frame_files = sorted(f for f in os.listdir(path_data+vid_name) if f != '.DS_Store')
    paths_frames = pd.DataFrame(frame_files, columns=['frame_file'])

    # left-join this video's labels onto the files: unmatched labels get a null frame_file
    labels_vid = labels[labels['video'] == vid_name].merge(
        paths_frames, left_on='frame', right_on='frame_file', how='left'
    )

    print(int(labels_vid['frame_file'].isnull().sum()))

20150807_no8B_1
3038
20150820_no8B_2
469
20150820_no8B_3
3306
20150820_no9W_3
2205
20160801_no9_1
3499
20160801_no9_2
3499
20160802_no8_1
3499
20160802_no8_3
2602
20160812_no9_1
3499
20160812_no9_2
462
20160812_no9_3
3499
20160819_no9_1
3499
20160819_no9_2
457
20160819_no9_3
3499
20160819_no9_4
3499
20160819_no9_5
667
20160930_no8_1
3431
20160930_no8_3
3500
20160930_no8_4
2715
20161005_no9_1
3499
20161005_no9_2
459
20161005_no9_3
3499
20161005_no9_4
3499


# Plot frames with labels for a video

In [153]:
def plot_frames_with_labels(vid_name, index_to_start, index_to_end):
    """
    Plot frames of a video, each preceded by its printed label, for the rows of
    `labels` belonging to the video at positions index_to_start..index_to_end
    (0-based, both ends inclusive).

    :vid_name: name of video (matches folder containing video's frames in /data/)
    :index_to_start: position of the first label row / frame to plot
    :index_to_end: position of the last label row / frame to plot (inclusive)

    Raises AssertionError if index_to_start >= index_to_end or if index_to_end is
    greater than the number of frame files found in the video's folder.
    """

    assert (index_to_start<index_to_end), "ERROR: must input index_to_end > index_to_start"

    path_video = path_data + vid_name + '/'

    # list the frame files on disk (only their count is needed, for the bounds check)
    print(path_video)
    frame_files = [f for f in os.listdir(path_video) if f != '.DS_Store']

    assert (index_to_end <= len(frame_files)), "ERROR: index_to_end greater than number of frames"

    # get labels for this vid
    labels_vid = labels[labels['video'] == vid_name]

    # Select only the requested rows instead of scanning every label of the video.
    # Bounds are clamped at 0 so negative inputs select what an index-by-index
    # comparison would (never wrapping around from the end).
    first = max(index_to_start, 0)
    stop = max(index_to_end + 1, 0)
    selected = labels_vid.iloc[first:stop]

    for video, frame, label in zip(selected['video'], selected['frame'], selected['label']):
        # print label with plotted image
        print(label, '\t\t', frame)
        # the context manager closes the image file once it has been drawn
        with Image.open(path_data + video + '/' + frame) as img:
            plt.imshow(img)
            plt.show()

## plot first and last 3 frames from each video

In [155]:
# Plot the first 3 and the last 3 labelled frames of every video.
for path_video in paths_videos:
    vid_name = path_video[:-1].split("/")[-1]

    # derive the last indices from the actual number of frame files instead of
    # assuming every video has exactly 101 frames
    n_frames = len([f for f in os.listdir(path_video) if f != '.DS_Store'])

    print("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    print(vid_name)
    print("XXXX FIRST 3")
    plot_frames_with_labels(vid_name,0,2)
    print("XXXX LAST 3")
    # indices are inclusive: n_frames-3 .. n_frames-1 (98..100 for a 101-frame video)
    plot_frames_with_labels(vid_name, max(n_frames - 3, 0), n_frames - 1)

# Checksum frames

In [160]:
def checksum_frames(vid_name, frame_start=0, num_frames=3):
    """
    Check the sum of pixel values for a block of frames in a video. Useful to help debug frame extraction in the case it's done remotely and may not seem correct compared to a local extraction.

    :vid_name: name of video to run checksum on
    :frame_start: position (in the sorted list of frame files) to start checksum on
    :num_frames: number of frames to include in the checksum
    :return: sum of all pixel values over the selected frames, as a Python int

    Raises IndexError if frame_start + num_frames runs past the available frame files.
    """
    video_dir = path_data + vid_name + '/'

    # sorted list of the frame files in the video's folder
    frame_files = sorted(f for f in os.listdir(video_dir) if f != '.DS_Store')

    checksum = 0
    for i in range(0,num_frames):
        # load image and add sum of pixels to checksum; the context manager closes the
        # file handle, and an explicit 64-bit accumulator avoids overflow on platforms
        # where the default integer used for summing is 32 bits wide
        with Image.open(video_dir + frame_files[frame_start+i]) as img:
            checksum += int(np.array(img).sum(dtype=np.int64))

    return checksum

In [157]:
# video names and their checksums (first 3 frames of each video), kept index-aligned
vids = [video_path[:-1].split("/")[-1] for video_path in paths_videos]
checksums = [checksum_frames(name, 0, 3) for name in vids]

In [158]:
# build a two-column table (video, checksum) from the aligned lists;
# note that this rebinds `checksums` from a list to a DataFrame
checksums = pd.DataFrame(dict(video=vids, checksum=checksums))

In [159]:
# save the checksum table as checksums.csv under `pwd` (the DataFrame index is written too)
checksums.to_csv(pwd + "checksums.csv")

# Clean up labels

## remove a video from labels

In [204]:
# (rows, columns) of the labels table
labels.shape

(88031, 3)

In [361]:
# WARNING: this cell is live - running it drops every row of the video named below from
# `labels` and overwrites labels.csv on disk. The shape is printed before and after.
video_to_remove_from_labels = '20160929_no9_3'
print(labels.shape)
labels = labels.loc[labels['video'].ne(video_to_remove_from_labels)]
labels.to_csv(path_data + 'labels.csv', index=False)
print(labels.shape)

(74507, 3)
(70964, 3)


In [362]:
# reload labels file from disk and print its shape
# (a bare `labels.head()` in the middle of a cell displays nothing, so it was dropped)
labels = pd.read_csv(path_data + 'labels.csv')
print(labels.shape)

(70964, 3)


## Delete chunk of frames for a video

In [297]:
# (rows, columns) of the labels table before deleting a chunk of frames
labels.shape

(83051, 3)

In [304]:
# temporary numeric frame id: the text before the first '.', then the piece after the last '_'
labels['frameid'] = (
    labels['frame']
    .str.split(".").str[0]
    .str.split("_").str[-1]
    .astype(int)
)

In [305]:
# keep every row except those of video 20160819_no9_2 whose frame id is above 557
labels = labels[(labels['video'] != '20160819_no9_2') | (labels['frameid'] <= 557)]

In [306]:
# drop the helper column again so it is not written to labels.csv
del labels['frameid']

In [307]:
# overwrite labels.csv with the cleaned table (index is not written)
labels.to_csv(os.path.join(path_data, 'labels.csv'), index=False)

In [393]:
# (rows, columns) of the labels table after clean-up
labels.shape

(70964, 3)

In [394]:
# number of rows per label value
labels['label'].value_counts()

search        28261
shallow       11675
ascent         9135
descent        8839
subsurface     7225
bottom         5452
breath          377
Name: label, dtype: int64