# Instructions

Create `data/*vid_name*/` folder with frames for each video in videos and verify the created frames match your `data/labels.csv` file.

e.g. `/videos/video_1.MOV` will be converted to `/data/video_1_000001.jpg`, etc.

Note: depending on chosen `FPS_OUTPUT`, you may have to extend the number 
of digits in filename from 06 to something bigger like 09, just update the line in this script that says `_%06d.jpg`

Your `labels.csv` file must have frame filenames at the same FPS as the frame files in `/data/`.
You can use `notebooks/helper_convert_timestamps_file_to_labels.ipynb` to assist in converting `video,timestamp1,timestamp2,label` style data into `labels.csv`

You can use `notebooks/helper_check_frames_against_labels.ipynb` to more deeply investigate a mismatch between frame files and `labels.csv`...

(You could alternatively use the command line tool `ffmpeg` to extract frames instead of this notebook)

# Setup

In [5]:
# define FPS to extract frames at
FPS_OUTPUT = 2

In [1]:
import numpy as np
import cv2
from time import time as timer
import sys
import os
import pandas as pd

In [2]:
# setup paths
pwd = os.getcwd().replace("notebooks","")
path_videos = pwd + 'videos/'
path_data = pwd + 'data/'

In [3]:
# read video paths
paths = os.listdir(path_videos)
paths = [path_videos + v for v in paths if v != '.DS_Store']
paths

['/mnt/seals/videos/20161014_no8_3.MOV',
 '/mnt/seals/videos/20161014_no8_1.MOV',
 '/mnt/seals/videos/20161014_no8_2.MOV']

# Extract Frames from Videos

In [6]:
# create data folder
if not os.path.exists(path_data):
    os.makedirs(path_data)

for c, path in enumerate(paths):

    # extract video filename from path
    filename = path.split("/")[-1].split(".")[0]
    print(filename)

    if not os.path.exists(path_data + filename):

        print("Extracting frames from video {}/{}: {}".format(c+1,len(paths),filename))

        # create directory for this video's frames
        os.makedirs(path_data + filename)

        # open video
        vidcap = cv2.VideoCapture(path)

        # get fps
        print("video FPS {}".format(vidcap.get(cv2.CAP_PROP_FPS)))
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        fps_savecheck = fps/FPS_OUTPUT

        # read frames and save to images at fps_save
        success,image = vidcap.read()
        count_frame = 0
        count_saved = 0
        success = True

        fps_savecheck_progress = 0

        while success:
            # save frame at desired framerate, indexing filename from 0
            if count_frame > fps_savecheck_progress:
                cv2.imwrite(path_data + filename + '/' + filename + "_%06d.jpg" % count_saved, image)
                fps_savecheck_progress += fps_savecheck
                count_saved += 1
            success,image = vidcap.read()
            count_frame += 1

        # close video file
        vidcap.release()
        
    else:
        print("Frames already extracted from video {}/{}".format(c+1,len(paths)))

# Done
print("Done extracting frames from {} videos".format(len(paths)))

20161014_no8_3
Frames already extracted from video 1/3
20161014_no8_1
Extracting frames from video 2/3: 20161014_no8_1
video FPS 29.818863446461272
20161014_no8_2
Extracting frames from video 3/3: 20161014_no8_2
video FPS 29.8181915800353
Done extracting frames from 3 videos


# Verify Frames Match Labels.csv

In [11]:
if not os.path.exists(pwd + 'data/labels.csv'):
    print("ERROR: labels.csv missing - please copy labels.csv to /data/labels.csv")
    print()
    print("Sample label file below:")    
    print("""video\t\t\tframe \t\t\t\t label
    20160801_no9_1\t\t20160801_no9_1_00001.jpeg	search
    20160801_no9_1\t\t20160801_no9_1_00002.jpeg	search
    ...""")
    print()
    print("Note you also need a 'split' column that assigns videos to train/valid/test splits - can use /notebooks/helper_add_train_valid_test_splits_to_labels.ipynb to add splits")

In [12]:
# load labels
labels = pd.read_csv(path_data + 'labels.csv')

In [13]:
labels.head()

Unnamed: 0,video,frame,label
0,20160801_no9_1,20160801_no9_1_00001.jpeg,search
1,20160801_no9_1,20160801_no9_1_00002.jpeg,search
2,20160801_no9_1,20160801_no9_1_00003.jpeg,search
3,20160801_no9_1,20160801_no9_1_00004.jpeg,search
4,20160801_no9_1,20160801_no9_1_00005.jpeg,search


In [14]:
# read frames paths
paths_videos = os.listdir(path_data)
paths_videos = [path_data + v + '/' for v in paths_videos if v != '.DS_Store' and v != 'labels.csv']

vids_error = []
vids_ok = []

# for path_video in paths_videos:    
for path_video in paths_videos:

    # get vid name from path
    vid_name = path_video[:-1].split("/")[-1]

    # convert list of frame files to dataframe
    paths_frames = os.listdir(path_video)
    paths_frames = [f for f in paths_frames if f != '.DS_Store']
    paths_frames = pd.DataFrame(paths_frames, columns = ['frame_file'])

    # subset labels to just this vid
    labels_vid = labels[labels['video'] == vid_name]

    # collect errors so can print grouped
    if not len(labels_vid) == len(paths_frames):
        vids_error.append("{} - Different number of labels ({}) than frames ({}) ... DIFF: {} ".format(vid_name, len(labels_vid),len(paths_frames), len(labels_vid) - len(paths_frames)))
    else:
        vids_ok.append("{} - Same number of labels and frames for vid".format(vid_name))
        
# print errors
print("ERRORS: {} VIDS WITH #LABELS != #FRAMES".format(len(vids_error)))
for msg in vids_error:
    print(msg)

print("\n")
print("OK: {} VIDS WITH #LABELS != #FRAMES".format(len(vids_ok)))
for msg in vids_ok:
    print(msg)

ERRORS: 21 VIDS WITH #LABELS != #FRAMES
20160930_no8_4 - Different number of labels (2814) than frames (2816) ... DIFF: -2 
20160929_no9_3 - Different number of labels (3634) than frames (3600) ... DIFF: 34 
20160819_no9_3 - Different number of labels (3598) than frames (3600) ... DIFF: -2 
20160812_no9_2 - Different number of labels (548) than frames (563) ... DIFF: -15 
20160929_no9_2 - Different number of labels (260) than frames (602) ... DIFF: -342 
20150827_no8B_3 - Different number of labels (3240) than frames (3241) ... DIFF: -1 
20160819_no9_2 - Different number of labels (576) than frames (558) ... DIFF: 18 
20150820_no8B_1 - Different number of labels (3698) than frames (3600) ... DIFF: 98 
20160802_no8_3 - Different number of labels (2702) than frames (2703) ... DIFF: -1 
20150827_no8B_2 - Different number of labels (580) than frames (566) ... DIFF: 14 
20160929_no9_4 - Different number of labels (3548) than frames (2816) ... DIFF: 732 
20160802_no8_2 - Different number of 

> see `/notebooks/helper_check_frames_against_labels.ipynb` for ways to more deeply investigate mismatch between frames and `labels.csv`