In [None]:
# Adrian Marinovich
# Springboard - Data Science Career Track 
# Capstone Project #2
# Data Wrangling

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import csv
import cv2
import glob
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import os.path
import pickle
import random
import shutil
from subprocess import call

from scipy.stats import reciprocal, uniform
from scipy.misc import imsave

import tensorflow as tf

# for reproducibility:
np.random.seed(41)
tf.reset_default_graph()
tf.set_random_seed(51)
random.seed(61)
os.environ['PYTHONHASHSEED'] = '0'
    
from keras.models import model_from_yaml
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import SGD
from keras.utils import to_categorical

from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator

from keras import backend as K

from sklearn.svm import LinearSVC 
from sklearn.svm import SVC 

from sklearn.datasets import fetch_mldata
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier

# setup plots
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [None]:
# (Adapted from:
#    https://github.com/harvitronix/five-video-classification-methods
#    https://github.com/wushidonguc/two-stream-action-recognition-keras )

In [None]:
# Obtain data
#
# bash command line in data folder:
# $ wget http://crcv.ucf.edu/data/UCF101/UCF101.rar
# $ unrar e UCF101.rar
# $ mkdir train && mkdir test && mkdir sequences && mkdir checkpoints
# 
# Get the following 2 files from:
#     https://github.com/harvitronix/five-video-classification-methods/tree/master/data/ucfTrainTestlist
#
#   trainlist01.txt
#   testlist01.txt
#
#   ...and put in /home/adrian01/ucf101_lists
#
# Alternate train-test split lists can be obtained by downloading:
#     http://crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip

In [None]:
#        ***** Only run this cell once *****
#
# (adapted from:
#   https://github.com/harvitronix/five-video-classification-methods/blob/master/data/1_move_files.py )

# This moves all the video files into the appropriate train or test/validation
#   directories, nested within subdirectories named after their classes.
#
#        ***** Only run this cell once *****
#

def get_train_test_lists(version='01',
                         lists_dir="/home/adrian01/ucf101_lists",
                         data_dir="/home/adrian01/ucf101"):
    """Read the UCF train/test split lists and return them as a dictionary.

    Parameters
    ----------
    version : str
        Split number as it appears in the list file names, e.g. '01' selects
        ``trainlist01.txt`` and ``testlist01.txt``.
    lists_dir : str
        Directory that holds the split list files.
    data_dir : str
        Directory to switch into before returning. The function has always
        left the process working directory here, so that is preserved.

    Returns
    -------
    dict
        ``{'train': [...], 'test': [...]}`` where every entry is the list-file
        path of one video with surrounding whitespace removed. Train entries
        additionally have the trailing class index removed.

    Raises
    ------
    OSError
        If a list file cannot be opened or ``data_dir`` cannot be entered.
    """
    # Build the file names from `version` instead of hard-coding split 01.
    test_file = os.path.join(lists_dir, 'testlist' + version + '.txt')
    train_file = os.path.join(lists_dir, 'trainlist' + version + '.txt')

    # Build the test list. Blank lines are skipped so they cannot turn into
    # empty entries downstream.
    with open(test_file) as fin:
        test_list = [row.strip() for row in fin if row.strip()]

    # Build the train list. Each line is "<path> <class index>"; keep only
    # the path.
    with open(train_file) as fin:
        train_list = [row.split()[0] for row in fin if row.strip()]

    # Keep the historical side effect of ending up in the data directory.
    os.chdir(data_dir)

    # Set the groups in a dictionary.
    return {
        'train': train_list,
        'test': test_list
    }

def move_files(file_groups, data_dir="/home/adrian01/ucf101"):
    """Move each listed video from the data directory into group/class folders.

    Parameters
    ----------
    file_groups : dict
        Maps a group name (used as the top-level folder name) to a list of
        entries of the form ``"<classname>/<filename>"``.
    data_dir : str
        Directory that currently holds the flat video files. The function
        switches the working directory to it and creates the group/class
        folders inside it.

    Notes
    -----
    A class folder is created even when the video itself is missing. Videos
    that are not found in ``data_dir`` are reported and skipped, which makes
    a second run harmless for files that were already moved.
    """
    os.chdir(data_dir)

    # Do each of our groups.
    for group, videos in file_groups.items():

        # Do each of our videos.
        for video in videos:

            # The list entries are written with '/' on every platform, so
            # split on that rather than on the local path separator.
            parts = video.split('/')
            if len(parts) < 2:
                # Blank or malformed entry: nothing sensible to move.
                print("Skipping malformed entry %r." % (video,))
                continue
            classname = parts[0]
            filename = parts[1]

            # Check if this class exists.
            class_dir = os.path.join(group, classname)
            if not os.path.exists(class_dir):
                print("Creating folder for %s/%s" % (group, classname))
                os.makedirs(class_dir)

            # Check if we have already moved this file, or at least that it
            # exists to move.
            if not os.path.exists(filename):
                print("Can't find %s to move. Skipping." % (filename))
                continue

            # Move it.
            dest = os.path.join(class_dir, filename)
            print("Moving %s to %s" % (filename, dest))
            os.rename(filename, dest)

    print("Done.")

# Run the one-time reorganisation.

# Step 1: read the train/test split lists into a {'train': ..., 'test': ...} dict.
group_lists = get_train_test_lists()

# Step 2: relocate every listed video into its group/class folder.
move_files(file_groups=group_lists)

In [None]:
#        ***** Only run this cell once *****
#
# (adapted from:
#   https://github.com/harvitronix/five-video-classification-methods/blob/master/data/2_extract_files.py )

# This extracts still images (frames) from the videos, and also
#   creates the data_file.csv video metadata list that is used later.
#
#        ***** Only run this cell once *****
#

# Work from the dataset root; the extraction code below uses paths relative
# to it ('train/...', 'test/...', 'data_file.csv').
os.chdir("/home/adrian01/ucf101")

def extract_files():
    """Extract JPEG frames from every train/test video and write data_file.csv.

    For each ``*.avi`` under ``train/<class>/`` and ``test/<class>/`` the
    frames are written next to the video as ``<name>-0001.jpg``,
    ``<name>-0002.jpg``, ... using the equivalent of
    ``ffmpeg -i video.avi video-%04d.jpg``. Videos whose first frame already
    exists are not extracted again.

    One row per video is written to ``data_file.csv`` in the data directory:
    ``[train|test, class, filename without extension, number of frames]``.
    """
    os.chdir("/home/adrian01/ucf101")

    data_file = []
    folders = ['train', 'test']

    for folder in folders:
        # glob returns entries in arbitrary order; sort so that the rows of
        # data_file.csv come out in the same order on every run.
        class_folders = sorted(glob.glob(os.path.join(folder, '*')))

        for vid_class in class_folders:
            class_files = sorted(glob.glob(os.path.join(vid_class, '*.avi')))

            for video_path in class_files:
                # Get the parts of the file.
                video_parts = get_video_parts(video_path)

                train_or_test, classname, filename_no_ext, filename = video_parts

                # Only extract if we haven't done it yet. Otherwise, just get
                # the info.
                if not check_already_extracted(video_parts):
                    # Now extract it.
                    src = os.path.join(train_or_test, classname, filename)
                    dest = os.path.join(train_or_test, classname,
                        filename_no_ext + '-%04d.jpg')
                    status = call(["ffmpeg", "-i", src, dest])
                    if status != 0:
                        # Surface failures instead of silently recording a
                        # video with zero (or partial) frames.
                        print("ffmpeg exited with status %d for %s" % (status, src))

                # Now get how many frames it is.
                nb_frames = get_nb_frames_for_video(video_parts)

                data_file.append([train_or_test, classname, filename_no_ext, nb_frames])

                print("Generated %d frames for %s" % (nb_frames, filename_no_ext))

    # newline='' lets the csv module control line endings itself, as the csv
    # documentation requires for files passed to csv.writer.
    with open('data_file.csv', 'w', newline='') as fout:
        writer = csv.writer(fout)
        writer.writerows(data_file)

    print("Extracted and wrote %d video files." % (len(data_file)))

def get_nb_frames_for_video(video_parts, data_dir="/home/adrian01/ucf101"):
    """Return how many extracted frame images exist for one video.

    Parameters
    ----------
    video_parts : tuple
        ``(train_or_test, classname, filename_no_ext, filename)``; only the
        first three items are used.
    data_dir : str
        Directory the relative ``train_or_test/classname`` path is resolved
        against. The working directory is switched to it.

    Returns
    -------
    int
        Number of files matching ``<filename_no_ext>-*.jpg`` in the class
        folder; 0 when nothing has been extracted.
    """
    os.chdir(data_dir)

    train_or_test, classname, filename_no_ext, _ = video_parts

    # Frames are named "<video>-NNNN.jpg". Anchoring on the '-' keeps a video
    # from also counting the frames of another video whose name merely starts
    # with the same text, and escaping keeps glob metacharacters in the name
    # literal.
    stem = glob.escape(os.path.join(train_or_test, classname, filename_no_ext))
    generated_files = glob.glob(stem + '-*.jpg')
    return len(generated_files)

def get_video_parts(video_path):
    """Split a video path of the form ``.../<group>/<class>/<file>`` into parts.

    Parameters
    ----------
    video_path : str
        Path whose last three components are the train/test group, the class
        name and the video file name.

    Returns
    -------
    tuple
        ``(train_or_test, classname, filename_no_ext, filename)``.

    Raises
    ------
    IndexError
        If the path has fewer than three components.

    Notes
    -----
    This is pure string handling, so it no longer changes the working
    directory.
    """
    parts = video_path.split(os.path.sep)

    # Index from the end so a leading prefix (e.g. './' or an absolute root)
    # does not shift the components.
    filename = parts[-1]
    classname = parts[-2]
    train_or_test = parts[-3]

    # Strip only the final extension so names containing dots stay intact.
    filename_no_ext = os.path.splitext(filename)[0]

    return train_or_test, classname, filename_no_ext, filename

def check_already_extracted(video_parts, data_dir="/home/adrian01/ucf101"):
    """Tell whether the first frame image of a video already exists.

    Parameters
    ----------
    video_parts : tuple
        ``(train_or_test, classname, filename_no_ext, filename)``; only the
        first three items are used.
    data_dir : str
        Directory the relative ``train_or_test/classname`` path is resolved
        against. The working directory is switched to it.

    Returns
    -------
    bool
        True when ``<train_or_test>/<classname>/<filename_no_ext>-0001.jpg``
        exists, otherwise False.
    """
    os.chdir(data_dir)

    train_or_test, classname, filename_no_ext, _ = video_parts

    # Only the -0001 frame is checked; its presence is taken to mean the
    # video was extracted before.
    first_frame = os.path.join(train_or_test, classname,
                               filename_no_ext + '-0001.jpg')
    return os.path.exists(first_frame)

# Run the extraction for every video under train/ and test/.
# Per the original note, with the full dataset in place the final message
# should report extracting and writing from 13320 video files.
extract_files()