In [1]:
### TODO
# * 3-d CNN
# * concat dense model
# * fit_models create_architectures_list (append mode)
# * fit_models worker if experiment id last digit in os environment var

# * collect garbage between fitting models

# * refactor custom_model_name and model_weights_path to instead use trained model id

In [2]:
# whether to log each feature and sequence status
verbose = True

In [40]:
import multiprocessing

# get number of processors for multiprocessing fit generators
num_workers = multiprocessing.cpu_count()

In [3]:
import gc
import os
import pandas as pd
import numpy as np
from PIL import Image
import json
import cv2
import sys
sys.path.append('..')

In [4]:
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, TensorBoard
from keras.layers import Dense, Flatten, Dropout, ZeroPadding3D, Input
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.wrappers import TimeDistributed
from keras.layers.convolutional import Conv2D, MaxPooling3D, Conv3D, MaxPooling2D, Convolution1D, Convolution3D, MaxPooling3D, ZeroPadding3D
from keras.models import Sequential, Model, load_model
from keras.optimizers import Adam, RMSprop
from keras.preprocessing.image import img_to_array

Using TensorFlow backend.


In [5]:
# setup paths
pwd = os.getcwd().replace("deepvideoclassification","")
path_cache = pwd + 'cache/'
path_data = pwd + 'data/'

In [6]:
# setup logging
# any explicit log messages or uncaught errors to stdout and file /logs.log
import logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s",
    handlers=[
        logging.FileHandler("{0}/{1}.log".format(pwd, "logs")),
        logging.StreamHandler()
    ])
# init logger
logger = logging.getLogger()
# make logger aware of any uncaught exceptions
def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
sys.excepthook = handle_exception

# Pretrained_CNNs

In [7]:
# pretrained model shapes
pretrained_model_len_features = {}
#
pretrained_model_len_features['vgg16'] = 512
#
pretrained_model_len_features['mobilenetv2_1.00_224'] = 1280
#
pretrained_model_len_features['inception_resnet_v2'] = 1536
#
pretrained_model_len_features['resnet50'] = 2048
pretrained_model_len_features['xception'] = 2048
pretrained_model_len_features['inception_v3'] = 2048

In [35]:
# pretrained model shapes
pretrained_model_sizes = {}
#
pretrained_model_sizes['vgg16'] = (224,224)
pretrained_model_sizes['resnet50'] = (224,224)
pretrained_model_sizes['mobilenetv2_1.00_224'] = (224,224)
#
pretrained_model_sizes['xception'] = (299,299)
pretrained_model_sizes['inception_v3'] = (299,299)
pretrained_model_sizes['inception_resnet_v2'] = (299,299)

In [9]:
pretrained_model_names = ["inception_resnet_v2", "inception_v3", "mobilenetv2_1.00_224", "resnet50", "vgg16", "xception"]
poolings = ['max','avg']

In [10]:
def load_pretrained_model(pretrained_model_name, pooling, model_weights_path = None):
    """ Load pretrained model with given pooling applied
    
    Args:
        pretrained_model: name of pretrained model ["Xception", "VGG16", "ResNet50", "InceptionV3", "InceptionResNetV2", "MobileNetV2"]
        pooling: pooling strategy for final pretrained model layer ["max","avg"]
        :model_weights_path: path to custom model weights if we want to load CNN model we've fine-tuned to produce features (e.g. for LRCNN)
    
    Returns:
        Pretrained model object (excluding dense softmax 1000 ImageNet classes layer)
    """
    
    # initialize output
    model = None
    
    pretrained_model_name = pretrained_model_name.lower()
    
    ###########################
    ### import pretrained model
    ###########################
    if pretrained_model_name == "xception":   
        from keras.applications.xception import Xception
        model = Xception(include_top=False, weights='imagenet', pooling=pooling)
    elif pretrained_model_name == "vgg16":   
        from keras.applications.vgg16 import VGG16
        model = VGG16(include_top=False, weights='imagenet', pooling=pooling)
    elif pretrained_model_name == "resnet50":   
        from keras.applications.resnet50 import ResNet50
        model = ResNet50(include_top=False, weights='imagenet', pooling=pooling)
    elif pretrained_model_name == "inception_v3":   
        from keras.applications.inception_v3 import InceptionV3
        model = InceptionV3(include_top=False, weights='imagenet', pooling=pooling)
    elif pretrained_model_name == "inception_resnet_v2":   
        from keras.applications.inception_resnet_v2 import InceptionResNetV2
        model = InceptionResNetV2(include_top=False, weights='imagenet', pooling=pooling)
    elif pretrained_model_name == "mobilenetv2_1.00_224":   
        from keras.applications.mobilenet_v2 import MobileNetV2
        model = MobileNetV2(include_top=False, weights='imagenet', pooling=pooling)
    else:
        raise NameError('Invalid pretrained model name - must be one of ["Xception", "VGG16", "ResNet50", "InceptionV3", "InceptionResNetV2", "MobileNetV2"]')
    
    if model_weights_path is not None:
        if os.path.exists(model_weights_path):
            model.load_weights(model_weights_path)
        else:
            raise NameError('pretrained model weights not found')
    
    return model

In [11]:
def load_pretrained_model_preprocessor(pretrained_model_name):
    """
    Return preprocessing function for a given pretrained model
    """

    preprocess_input = None

    pretrained_model_name = pretrained_model_name.lower()
        
    if pretrained_model_name == "xception":   
        from keras.applications.xception import preprocess_input
    elif pretrained_model_name == "vgg16":   
        from keras.applications.vgg16 import preprocess_input
    elif pretrained_model_name == "resnet50":   
        from keras.applications.resnet50 import preprocess_input
    elif pretrained_model_name == "inception_v3":   
        from keras.applications.inception_v3 import preprocess_input
    elif pretrained_model_name == "inception_resnet_v2":   
        from keras.applications.inception_resnet_v2 import preprocess_input
    elif pretrained_model_name == "mobilenetv2_1.00_224":   
        from keras.applications.mobilenet_v2 import preprocess_input
    else:
        raise NameError('Invalid pretrained model name - must be one of ["Xception", "VGG16", "ResNet50", "InceptionV3", "InceptionResNetV2", "MobileNetV2"]')
        
    return preprocess_input

In [12]:
def precompute_CNN_features(pretrained_model_name, pooling, model_weights_path = None, custom_model_name = None):
    """ 
    Save pretrained features array computed over all frames of each video 
    using given pretrained model and pooling method
    
    :pretrained_model_name: pretrained model object loaded using `load_pretrained_model`
    :pooling: pooling method used with pretrained model
    :model_weights_path: path to custom model weights if we want to load CNN model we've fine-tuned to produce features (e.g. for LRCNN)
    :custom_model_name: custom output name to append to pretrained model name

    """
    
    pretrained_model_name = pretrained_model_name.lower()
    
    # setup path to save features
    path_features = path_cache + 'features/' + pretrained_model_name + "/" + pooling + '/'
    
    # store in custom directory if custom model name given (for when loading weights from fine-tuned CNN and precomputing features from that model)
    if custom_model_name is not None and model_weights_path is not None:
        path_features = path_cache + 'features/' + pretrained_model_name + "__" + custom_model_name + "/" + pooling + '/'
    
    if not os.path.exists(path_features):
        
        os.makedirs(path_features)
        
        # load pretrained model
        pretrained_model = load_pretrained_model(pretrained_model_name, pooling, model_weights_path)

        # load preprocessing function
        preprocess_input = load_pretrained_model_preprocessor(pretrained_model_name)

        # lookup pretrained model input shape
        model_input_size = pretrained_model_sizes[pretrained_model_name]
        
        # precompute features for each video using pretrained model
        from deepvideoclassification.data import get_video_paths
        path_videos = get_video_paths()

        for c, path_video in enumerate(path_videos):

            if verbose:
                logging.info("Computing pretrained model features for video {}/{} using pretrained model: {}, pooling: {}".format(c+1,len(path_videos),pretrained_model_name, pooling))

            # get video name from video path
            video_name = path_video.split("/")[-2]

            # build output path
            path_output = path_features + video_name

            try:
                if not os.path.exists(path_output + '.npy'):

                    path_frames = path_data + video_name + "/"

                    # initialize features list
                    features = []

                    frame_paths = os.listdir(path_frames)
                    frame_paths = [path_frames + f for f in frame_paths if f != '.DS_Store']

                    # sort paths in sequence (they were created with incrementing filenames through time)
                    frame_paths.sort()

                    # load each frame in vid and get features
                    for j, frame_path in enumerate(frame_paths):

                        # load image & preprocess
                        image = cv2.imread(frame_path, cv2.COLOR_BGR2RGB)
                        img = cv2.resize(image, model_input_size, interpolation=cv2.INTER_AREA)
                        img = img_to_array(img)
                        img = np.expand_dims(img, axis=0)
                        img = preprocess_input(img)

                        # get features from pretrained model
                        feature = pretrained_model.predict(img).ravel()
                        features.append(feature)

                    # convert to arrays
                    features = np.array(features)

                    np.save(path_output, features)
                else:
                    if verbose:
                        logger.info("Features already cached: {}".format(path_output))

            except Exception as e:
                logging.error("Error precomputing features {} / {},{}".format(video_namepretrained_model_name, pooling))
                logging.fatal(e, exc_info=True)
                
    else:
        if verbose:
            logger.info("Features already cached: {}".format(path_features))

# Architecture class (contains keras model object and train/evaluate method, writes training results to /models/)

In [36]:
class Architecture(object):
    
    def __init__(self, model_id, architecture, sequence_length, num_classes, frame_size = None, 
                pretrained_model_name = None, pooling = None,
                sequence_model = None, sequence_model_layers = 1,
                layer_1_size = 0, layer_2_size = 0, layer_3_size = 0, 
                dropout = 0, convolution_kernel_size = 3, model_weights_path = None):
        """
        Model object constructor. Contains Keras model object and training/evaluation methods. Writes model results to /models/_id_ folder
        
        Architecture can be one of: 
        image_MLP_frozen, image_MLP_trainable, video_MLP_concat, video_LRCNN_frozen, video_LRCNN_trainable, 3DCNN
        
        :model_id: integer identifier for this model e.g. 1337
        
        :architecture: architecture of model in [image_MLP_frozen, image_MLP_trainable, video_MLP_concat, video_LRCNN_frozen, video_LRCNN_trainable, 3DCNN]
        
        :sequence_length: number of frames in sequence to be returned by Data object
        :num_classes: number of classes to predict
        :frame_size: size that frames are resized to (different models / architectures accept different input sizes)

        :pretrained_model_name: name of pretrained model (or None if not using pretrained model e.g. for 3D-CNN)
        :pooling: name of pooling variant (or None if not using pretrained model e.g. for 3D-CNN or if fitting more non-dense layers on top of pretrained model base)
        
        :sequence_model: sequence model in [LSTM, SimpleRNN, GRU, Convolution1D]
        :sequence_model_layers: default to 1, can be stacked 2 or 3 (but less than 4) layer sequence model (assume always stacking the same sequence model, not mixing LSTM and GRU, for example)
        
        :layer_1_size: number of neurons in layer 1
        :layer_2_size: number of neurons in layer 2
        :layer_3_size: number of neurons in layer 3 
        
        :dropout: amount of dropout to add (same applied throughout model - good default is 0.2) 
        
        :convolution_kernel_size: size of 1-D convolutional kernel for 1-d conv sequence models (good default is 3)
        
        :model_weights_path: path to .h5 weights file to be loaded for pretrained CNN in LRCNN-trainable and in 3d-CNN. Can use custom CNN for other models but need to save features first then load them in data
        """
    
        # required params
        self.sequence_length = sequence_length
        self.frame_size = frame_size
        self.num_classes = num_classes
        
        # model architecture params
        self.architecture = architecture
        self.pretrained_model_name = pretrained_model_name
        self.pooling = pooling
        self.sequence_model = sequence_model
        self.sequence_model_layers = sequence_model_layers
        #
        self.layer_1_size = layer_1_size
        self.layer_2_size = layer_2_size
        self.layer_3_size = layer_3_size
        #
        self.dropout = dropout
        #
        self.convolution_kernel_size = convolution_kernel_size
        #
        self.model_weights_path = model_weights_path
        
        # fix case sensitivity
        if type(self.architecture) == str:
            self.architecture = self.architecture.lower()
        if type(self.pretrained_model_name) == str:
            self.pretrained_model_name = self.pretrained_model_name.lower()
        if type(self.pooling) == str:
            self.pooling = self.pooling.lower()
        
        # read num features from pretrained model
        if pretrained_model_name is not None:
            self.num_features = pretrained_model_len_features[pretrained_model_name]
            self.frame_size = pretrained_model_sizes[pretrained_model_name]
        
        # check one of pretrained model and frame size is specified
        assert (self.pretrained_model_name is not None and self.frame_size is not None), "Must specify one of pretrained_model_name or frame_size"
            
        model = None
        
        if architecture == "image_MLP_frozen":
            
            ####################
            ### image_MLP_frozen
            ####################
            # image classification (single frame)
            # train MLP on top of weights extracted from pretrained CNN with no fine-tuning
            
            # check inputs
            assert self.sequence_length == 1, "image_MLP_frozen requires sequence length of 1"
            assert self.pretrained_model_name is not None, "image_MLP_frozen requires a pretrained_model_name input" 
            assert self.pooling is not None, "image_MLP_frozen requires a pooling input" 
            
            # init model
            model = Sequential()

            # 1st layer group
            if self.layer_1_size > 0:
                model.add(Dense(self.layer_1_size, activation='relu', input_shape=(self.num_features,)))
                if self.dropout > 0:
                    model.add(Dropout(self.dropout))
                
            # 2nd layer group
            if self.layer_2_size > 0 and self.layer_1_size > 0:
                model.add(Dense(self.layer_2_size, activation='relu'))
                if self.dropout > 0:
                    model.add(Dropout(self.dropout))

            # 3rd layer group
            if self.layer_3_size > 0 and self.layer_2_size > 0 and self.layer_1_size > 0:
                model.add(Dense(self.layer_3_size, activation='relu'))
                if dropout > 0:
                    model.add(Dropout(self.dropout))

            # classifier layer
            model.add(Dense(self.num_classes, activation='softmax'))

        elif architecture == "image_MLP_trainable":
            
            #######################
            ### image_MLP_trainable
            #######################
            # image classification (single frame)
            # fine-tune pretrained CNN with MLP on top
            #
            # start off freezing base CNN layers then will unfreeze 
            # after each training round
            #
            # we will ultimately want to compare our best fine-tuned 
            # CNN as a feature extractor vs fixed ImageNet pretrained CNN features
            
            # check inputs
            assert self.sequence_length == 1, "image_MLP_trainable requires sequence length of 1"
            assert self.pretrained_model_name is not None, "image_MLP_trainable requires a pretrained_model_name input" 
            assert self.pooling is not None, "image_MLP_trainable requires a pooling input" 
            
            
            # create the base pre-trained model
            model_base = load_pretrained_model(self.pretrained_model_name, pooling=self.pooling)
            

            # freeze base model layers (will unfreeze after train top)
            for l in model_base.layers:
                l.trainable=False

            # use Keras functional API
            model_top = model_base.output

            # note layer names are there so we can exclude those layers 
            # when setting base model layers to trainable

            # 1st layer group
            if self.layer_1_size > 0:
                model_top = Dense(self.layer_1_size, activation="relu", name='top_a')(model_top)
                if self.dropout > 0:
                    model_top = Dropout(self.dropout, name='top_b')(model_top)

            # 2nd layer group
            if self.layer_2_size > 0 and self.layer_1_size > 0:
                model_top = Dense(self.layer_2_size, activation="relu", name='top_c')(model_top)
                if self.dropout > 0:
                    model_top = Dropout(self.dropout, name='top_d')(model_top)

            # 3rd layer group
            if self.layer_3_size > 0 and self.layer_2_size > 0 and self.layer_1_size > 0:
                model_top = Dense(self.layer_3_size, activation="relu", name='top_e')(model_top)
                if self.dropout > 0:
                    model_top = Dropout(self.dropout, name='top_f')(model_top)

            # classifier layer
            model_predictions = Dense(self.num_classes, activation="softmax", name='top_g')(model_top)

            # combine base and top models into single model object
            model = Model(inputs=model_base.input, outputs=model_predictions)
                
        elif architecture == "video_MLP_concat":

            ####################
            ### video_MLP_concat
            ####################
            
            # video classification
            # concatenate all frames in sequence and train MLP on top of concatenated frame input
            
            print('TODO')
            
        elif architecture == "video_LRCNN_frozen":

            ######################
            ### video_LRCNN_frozen
            ######################
            
            # Implement:
            # “Long-Term Recurrent Convolutional Networks for Visual Recognition and Description.”
            # Donahue, Jeff, Lisa Anne Hendricks, Marcus Rohrbach, Subhashini Venugopalan, 
            # Sergio Guadarrama, Kate Saenko, and Trevor Darrell.  
            # Proceedings of the IEEE Computer Society Conference on Computer Vision and 
            # Pattern Recognition, 2015, 2625–34.
            #
            # Essentially they extract features with fine-tuned CNN then fit recurrent models on top
            # in the paper they only use LSTM but we will also try RNN, GRU and 1-D CNN
            # 
            # note: no fine-tuning of CNN in this frozen LRCNN architecture
            # 
            # implementation inspired by:
            # https://github.com/sagarvegad/Video-Classification-CNN-and-LSTM-/blob/master/train_CNN_RNN.py

            
            # check inputs
            assert self.sequence_length > 1, "video_LRCNN_frozen requires sequence length > 1"
            assert self.layer_1_size > 0, "video_LRCNN_frozen requires a layer_1_size > 0" 
            assert self.pretrained_model_name is not None, "video_LRCNN_frozen requires a pretrained_model_name input" 
            assert self.pooling is not None, "video_LRCNN_frozen requires a pooling input" 
            assert self.sequence_model_layers >= 1, "video_LRCNN_frozen requires sequence_model_layers >= 1" 
            assert self.sequence_model_layers < 4, "video_LRCNN_frozen requires sequence_model_layers <= 3" 
            assert self.sequence_model is not None, "video_LRCNN_frozen requires a sequence_model" 
            if self.sequence_model == 'Convolution1D':
                assert self.convolution_kernel_size > 0, "Convolution1D sequence model requires convolution_kernel_size parameter > 0"
                assert self.convolution_kernel_size < self.sequence_length, "convolution_kernel_size must be less than sequence_length"

            # set whether to return sequences for stacked sequence models
            return_sequences_1, return_sequences_2 = False, False
            if sequence_model_layers > 1 and layer_2_size > 0:
                return_sequences_1 = True
            if sequence_model_layers >= 2 and layer_3_size > 0 and layer_2_size > 0:
                return_sequences_2 = True
                
            print(return_sequences_1, return_sequences_2)
                
            #LSTM, SimpleRNN, GRU, Convolution1D
            
            # init model
            model = Sequential()

            # layer 1 (sequence layer)
            if sequence_model == "LSTM":
                model.add(LSTM(self.layer_1_size, return_sequences=return_sequences_1, dropout=self.dropout, 
                               input_shape=(self.sequence_length, self.num_features)))
            elif sequence_model == "SimpleRNN":
                model.add(SimpleRNN(self.layer_1_size, return_sequences=return_sequences_1, dropout=self.dropout, 
                               input_shape=(self.sequence_length, self.num_features)))
            elif sequence_model == "GRU":
                model.add(GRU(self.layer_1_size, return_sequences=return_sequences_1, dropout=self.dropout, 
                               input_shape=(self.sequence_length, self.num_features)))
            elif sequence_model == "Convolution1D":
                model.add(Convolution1D(self.layer_1_size, kernel_size = self.convolution_kernel_size, padding = 'valid', 
                               input_shape=(self.sequence_length, self.num_features)))
                if layer_2_size == 0 or sequence_model_layers == 1:
                    model.add(Flatten())
            else:
                raise NameError('Invalid sequence_model - must be one of [LSTM, SimpleRNN, GRU, Convolution1D]')    

            # layer 2 (sequential or dense)
            if layer_2_size > 0:
                if return_sequences_1 == False:
                    model.add(Dense(self.layer_2_size, activation='relu'))
                    model.add(Dropout(self.dropout))
                else:
                    if sequence_model == "LSTM":
                        model.add(LSTM(self.layer_2_size, return_sequences=return_sequences_2, dropout=self.dropout))
                    elif sequence_model == "SimpleRNN":
                        model.add(SimpleRNN(self.layer_2_size, return_sequences=return_sequences_2, dropout=self.dropout))
                    elif sequence_model == "GRU":
                        model.add(GRU(self.layer_2_size, return_sequences=return_sequences_2, dropout=self.dropout))
                    elif sequence_model == "Convolution1D":
                        model.add(Convolution1D(self.layer_2_size, kernel_size = self.convolution_kernel_size, padding = 'valid'))
                    else:
                        raise NameError('Invalid sequence_model - must be one of [LSTM, SimpleRNN, GRU, Convolution1D]') 

            # layer 3 (sequential or dense)
            if layer_3_size > 0:
                if sequence_model_layers < 3:
                    if sequence_model_layers == 2:
                        model.add(Flatten())
                    model.add(Dense(self.layer_3_size, activation='relu'))
                    model.add(Dropout(self.dropout))
                else:
                    if sequence_model == "LSTM":
                        model.add(LSTM(self.layer_3_size, return_sequences=False, dropout=self.dropout))
                        model.add(Flatten())
                    elif sequence_model == "SimpleRNN":
                        model.add(SimpleRNN(self.layer_3_size, return_sequences=False, dropout=self.dropout))
                        model.add(Flatten())
                    elif sequence_model == "GRU":
                        model.add(GRU(self.layer_3_size, return_sequences=False, dropout=self.dropout))
                        model.add(Flatten())
                    elif sequence_model == "Convolution1D":
                        model.add(Convolution1D(self.layer_3_size, kernel_size = self.convolution_kernel_size, padding = 'valid'))
                        model.add(Flatten())
                    else:
                        raise NameError('Invalid sequence_model - must be one of [LSTM, SimpleRNN, GRU, Convolution1D]') 
            else:
                if return_sequences_2 == True: 
                    model.add(Flatten())

            # classifier layer
            if self.dropout > 0:
                model.add(Dropout(self.dropout))
            model.add(Dense(self.num_classes, activation='softmax'))

        elif architecture == "video_LRCNN_trainable":
            
            #########################
            ### video_LRCNN_trainable
            #########################
            
            # Same as above:
            # “Long-Term Recurrent Convolutional Networks for Visual Recognition and Description.”
            # Donahue, Jeff, Lisa Anne Hendricks, Marcus Rohrbach, Subhashini Venugopalan, 
            # Sergio Guadarrama, Kate Saenko, and Trevor Darrell.  
            # Proceedings of the IEEE Computer Society Conference on Computer Vision and 
            # Pattern Recognition, 2015, 2625–34.
            #
            # But with fine-tuning of the CNNs that are input into the recurrent models
            # 
            # note: will take long because not precomputing the CNN part so re-computed 
            # on each training pass

            # implementation inspired by https://stackoverflow.com/questions/49535488/lstm-on-top-of-a-pre-trained-cnn

            model_cnn = load_pretrained_model(self.pretrained_model_name, pooling=self.pooling)

            # optionally load weights for pretrained architecture
            # (will likely be better to first train CNN then load weights in LRCNN vs. use pretrained ImageNet CNN)
            if self.model_weights_path is not None:
                model_base.load_weights(self.model_weights_path)
            
            # freeze model_cnn layers (will unfreeze later after sequence model trained a while)
            for l in model_cnn.layers:
                l.trainable = False

            # sequential component on top of CNN
            frames = Input(shape=(self.sequence_length, self.frame_size[0], self.frame_size[1], 3))
            x = TimeDistributed(model_cnn)(frames)
            x = TimeDistributed(Flatten())(x)

            # layer 1 sequence model
            x = LSTM(self.layer_1_size, dropout=dropout)(x)

            # classifier layer
            out = Dense(self.num_classes)(x)

            # join cnn frame model and LSTM top
            model = Model(inputs=frames, outputs=out)
            

            
        elif architecture == "3DCNN":
            
            #########
            ### 3DCNN
            #########
            
            # Implement:
            
            # “3D Convolutional Neural Networks for Human Action Recognition.” 
            # Ji, Shuiwang, Wei Xu, Ming Yang, and Kai Yu. 
            # IEEE Transactions on Pattern Analysis and Machine Intelligence 
            # 35, no. 1 (2013): 221–31. doi:10.1109/TPAMI.2012.59.
            #
            # They fit a 3-D convolutional model on top of stacked frame volumes
            
            # Implementation from: 
            # https://gist.github.com/albertomontesg/d8b21a179c1e6cca0480ebdf292c34d2
            # note example has input shape as (channels, sequence_length, frame_size_1, frame_size_2)
            
            # init model
            model = Sequential()
            
            # 1st layer group
            model.add(Convolution3D(64, 3, 3, 3, activation='relu',  border_mode='same', name='conv1', subsample=(1, 1, 1), input_shape=(3, 16, 112, 112)))
            model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), border_mode='valid', name='pool1'))

            # 2nd layer group
            model.add(Convolution3D(128, 3, 3, 3, activation='relu',border_mode='same', name='conv2', subsample=(1, 1, 1)))
            model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),  border_mode='valid', name='pool2'))

            # 3rd layer group
            model.add(Convolution3D(256, 3, 3, 3, activation='relu',border_mode='same', name='conv3a', subsample=(1, 1, 1)))
            model.add(Convolution3D(256, 3, 3, 3, activation='relu',  border_mode='same', name='conv3b', subsample=(1, 1, 1)))
            model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), border_mode='valid', name='pool3'))

            # 4th layer group
            model.add(Convolution3D(512, 3, 3, 3, activation='relu',  border_mode='same', name='conv4a', subsample=(1, 1, 1)))
            model.add(Convolution3D(512, 3, 3, 3, activation='relu',  border_mode='same', name='conv4b', subsample=(1, 1, 1)))
            model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), border_mode='valid', name='pool4'))

            # 5th layer group
            model.add(Convolution3D(512, 3, 3, 3, activation='relu',border_mode='same', name='conv5a', subsample=(1, 1, 1)))
            model.add(Convolution3D(512, 3, 3, 3, activation='relu', border_mode='same', name='conv5b', subsample=(1, 1, 1)))
            model.add(ZeroPadding3D(padding=(0, 1, 1)))
            model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), border_mode='valid', name='pool5'))
            model.add(Flatten())
            
            # FC layers group
            model.add(Dense(4096, activation='relu', name='fc6'))
            model.add(Dropout(.5))
            model.add(Dense(4096, activation='relu', name='fc7'))
            model.add(Dropout(.5))
            
            
            # if load weights from Sports1M model then need to load with 487 class classifier then pop it and add our own
            if self.model_weights_path is not None:
                model.add(Dense(487, activation='softmax', name='fc8'))
                model.load_weights(self.model_weights_path)
                model.layers.pop()
                model.add(Dense(self.num_classes, activation='softmax'))
            else:
                model.add(Dense(self.num_classes, activation='softmax', name='fc8'))
            
        else:
            raise NameError('Invalid architecture - must be one of [image_MLP_frozen, image_MLP_trainable, video_MLP_concat, video_LRCNN_frozen, video_LRCNN_trainable, 3DCNN]')    
        
        # set class model to constructed model
        self.model = model

In [17]:
def make_last_layers_trainable(model, num_layers):
    """
    Set the last *num_layers* non-trainable layers to trainable  
    
    NB to be used with model_base and assumes name = "top_xxx" added to each top layer to know 
    to ignore that layer when looping through layers from top backwards
    
    :num_layers: number of layers from end of model (that are currently not trainable) to be set as trainable
    """
    
    # get index of last non-trainable layer
    # (the layers we added on top of model_base are already trainable=True)
    # ...
    # need to find last layer of base model and set that (and previous num_layers)
    # to trainable=True via this method
    
    # find last non-trainable layer index
    idx_non_trainable = 0
    for i, l in enumerate(model.layers):
        if "top" not in l.name:
            idx_non_trainable = i
                
    # set last non-trainable layer and num_layers prior to trainable=True
    for i in reversed(range(idx_not_trainable-num_layers+1, idx_not_trainable+1)):
        model.layers[i].trainable = True
        print(idx_not_trainable, num_layers, i)
        
    return model

In [18]:
def train(model, data, path_model, learning_rate = 0.001, epochs = 20, batch_size = 32, patience=10, verbose = verbose):
    """
    Compile and fit model for *epochs* rounds of training, dividing learning rate by 10 after each round
    
    Fitting will stop if val_acc does not improve for at least patience epochs
    
    Only the best weights will be kept
    
    Good practice is to decrease the learning rate by a factor of 10 after each plateau and train some more 
    (after first re-loading best weights from previous training round)...

    for example:
        fit_history = train(model, data, path_model = pwd+'models/', learning_rate = 0.001, epochs = 30)
        model.load_weights(path_model + "model.h5")
        fit_history = train(model, data, path_model = pwd+'models/', learning_rate = 0.0001, epochs = 30)
    
    :model: model object to train
    :data: data object
    :path_model: path to save fit logs and model snapshot
    :learning_rate: learning rate parameter for Adam optimizer (default is 0.001)
    
    :epochs: number of training epochs per fit round (subject to patience)
    :batch_size: number of samples in each batch
    :patience: how many epochs without val_acc improvement before stopping fit round
    :verbose: print progress
    
    """

    # TODO: refactor
    path_model = pwd + 'models/'
    
    # create optimizer with given learning rate 
    opt = Adam(lr = learning_rate)
    
    # compile model
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    
    # setup training callbacks
    callback_stopper = EarlyStopping(monitor='val_acc', patience=patience, verbose=0)
    callback_csvlogger = CSVLogger(path_model + 'training.log')
    callback_checkpointer = ModelCheckpoint(path_model +  'model.h5', monitor='val_acc', save_best_only=True, verbose=verbose)
    callbacks = [callback_stopper, callback_checkpointer, callback_csvlogger]
    
    return model.fit(data.x_train, data.y_train, 
              validation_data=(data.x_valid, data.y_valid),
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              shuffle=True,
              verbose=verbose)

In [19]:
# fit_history = train(model, data, path_model = pwd+'models/', learning_rate = 0.001)

# Debug

In [20]:
from deepvideoclassification.data import Data

In [29]:
pretrained_model_name = "vgg16"
pooling="max"
sequence_length = 2

layer_1_size = 128
layer_2_size = 64
layer_3_size = 32
dropout=0.20

data = Data(sequence_length = 1, 
            return_CNN_features = False, 
            pretrained_model_name=pretrained_model_name,
            pooling = pooling,
            return_generator=True,
            batch_size=32)

num_classes = data.num_classes 
frame_size = data.frame_size
num_features = pretrained_model_len_features[pretrained_model_name]

2019-01-18 21:27:52,884 [MainThread  ] [INFO ]  Loading frames into memory [may take a few minutes]
2019-01-18 21:27:52,887 [MainThread  ] [INFO ]  Loading frames into memory: 1/25
2019-01-18 21:27:57,614 [MainThread  ] [INFO ]  Loading frames into memory: 2/25
2019-01-18 21:28:01,082 [MainThread  ] [INFO ]  Loading frames into memory: 3/25
2019-01-18 21:28:04,870 [MainThread  ] [INFO ]  Loading frames into memory: 4/25
2019-01-18 21:28:08,398 [MainThread  ] [INFO ]  Loading frames into memory: 5/25
2019-01-18 21:28:11,371 [MainThread  ] [INFO ]  Loading frames into memory: 6/25
2019-01-18 21:28:12,060 [MainThread  ] [INFO ]  Loading frames into memory: 7/25
2019-01-18 21:28:16,066 [MainThread  ] [INFO ]  Loading frames into memory: 8/25
2019-01-18 21:28:20,522 [MainThread  ] [INFO ]  Loading frames into memory: 9/25
2019-01-18 21:28:25,181 [MainThread  ] [INFO ]  Loading frames into memory: 10/25
2019-01-18 21:28:25,859 [MainThread  ] [INFO ]  Loading frames into memory: 11/25
2019-01

In [37]:
architecture = Architecture(model_id = 1, 
                            architecture = 'image_MLP_trainable',
                            sequence_length = 1, 
                            num_classes = num_classes, 
                            pretrained_model_name = pretrained_model_name, 
                            pooling = 'max', 
                            layer_1_size=128,
                            layer_2_size=0, 
                            layer_3_size=0,
                            dropout=0.2)

In [38]:
# create optimizer and compile
opt = Adam()
architecture.model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [43]:
data.

True

In [41]:
# Train model on dataset
architecture.model.fit_generator(generator=data.generator_train,
                    validation_data=data.generator_valid,
                    use_multiprocessing=True,
                    workers=num_workers)

AttributeError: 'Data' object has no attribute 'generator_train'

## frozen MLP

In [28]:
pretrained_model_name = "vgg16"
pooling="max"

In [29]:
layer_1_size = 128
layer_2_size = 64
layer_3_size = 32
dropout=0.20

In [30]:
data = Data(sequence_length = 1, 
            return_CNN_features = True, 
            pretrained_model_name=pretrained_model_name,
            pooling = pooling)

2019-01-09 22:06:08,363 [MainThread  ] [INFO ]  Features already cached: /mnt/seals/cache/features/vgg16/max/


In [31]:
num_classes = data.num_classes 
frame_size = data.frame_size
num_features = pretrained_model_len_features[pretrained_model_name]

In [32]:
# init model
model = Sequential()

# 1st layer group
if layer_1_size > 0:
    model.add(Dense(layer_1_size, activation='relu', input_shape=(num_features,)))
    if dropout > 0:
        model.add(Dropout(dropout))

# 2nd layer group
if layer_2_size > 0 and layer_1_size > 0:
    model.add(Dense(layer_2_size, activation='relu'))
    if dropout > 0:
        model.add(Dropout(dropout))

# 3rd layer group
if layer_3_size > 0 and layer_2_size > 0 and layer_1_size > 0:
    model.add(Dense(layer_3_size, activation='relu'))
    if dropout > 0:
        model.add(Dropout(dropout))

# classifier layer
model.add(Dense(num_classes, activation='softmax'))


In [33]:
# train model
fit_history = train(model, data, path_model = pwd+'models/', learning_rate = 0.001, epochs = 10)

Train on 10775 samples, validate on 1380 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.56449, saving model to /mnt/seals/models/model.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.56449 to 0.87319, saving model to /mnt/seals/models/model.h5
Epoch 3/10

Epoch 00003: val_acc improved from 0.87319 to 0.88768, saving model to /mnt/seals/models/model.h5
Epoch 4/10

Epoch 00004: val_acc improved from 0.88768 to 0.92174, saving model to /mnt/seals/models/model.h5
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.92174
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.92174
Epoch 7/10

Epoch 00007: val_acc did not improve from 0.92174
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.92174
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.92174
Epoch 10/10

Epoch 00010: val_acc did not improve from 0.92174


## concat frames

In [92]:
pretrained_model_name = "vgg16"
pooling="max"
sequence_length = 2

In [93]:
layer_1_size = 128
layer_2_size = 64
layer_3_size = 32
dropout=0.20

In [94]:
data = Data(sequence_length = sequence_length, 
            return_CNN_features = True, 
            pretrained_model_name=pretrained_model_name,
            pooling = pooling)

2019-01-09 22:12:17,962 [MainThread  ] [INFO ]  Features already cached: /mnt/seals/cache/features/vgg16/max/


In [95]:
data.x_train.shape

(10736, 2, 512)

In [96]:
data.y_train.mean(axis=0)[0], data.y_valid.mean(axis=0)[0], data.y_test.mean(axis=0)[0]

(0.643628912071535, 0.704, 0.36212624584717606)

In [97]:
num_classes = data.num_classes 
frame_size = data.frame_size
num_features = pretrained_model_len_features[pretrained_model_name]

In [98]:
# init model
model = Sequential()

model.add(Flatten(input_shape=(sequence_length, num_features)))

# 1st layer group
if layer_1_size > 0:
    model.add(Dense(layer_1_size, activation='relu', input_shape=(num_features,)))
    if dropout > 0:
        model.add(Dropout(dropout))

# 2nd layer group
if layer_2_size > 0 and layer_1_size > 0:
    model.add(Dense(layer_2_size, activation='relu'))
    if dropout > 0:
        model.add(Dropout(dropout))

# 3rd layer group
if layer_3_size > 0 and layer_2_size > 0 and layer_1_size > 0:
    model.add(Dense(layer_3_size, activation='relu'))
    if dropout > 0:
        model.add(Dropout(dropout))

# classifier layer
model.add(Dense(num_classes, activation='softmax'))


In [99]:
# create optimizer with given learning rate 
opt = Adam(lr = 0.001)

# compile model
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

model.fit(data.x_train, data.y_train, 
          validation_data=(data.x_valid, data.y_valid),
          batch_size=32,
          epochs=10,
          shuffle=True,
          verbose=1)

Train on 10736 samples, validate on 1375 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f08de0e96d8>

## lrcnn frozen

In [106]:
pretrained_model_name = "vgg16"
pooling="max"
sequence_length = 5

In [114]:
layer_1_size = 128
layer_2_size = 64
layer_3_size = 32
dropout=0.20

In [134]:
data = Data(sequence_length = sequence_length, 
            return_CNN_features = True, 
            pretrained_model_name=pretrained_model_name,
            pooling = pooling)

2019-01-09 22:25:15,691 [MainThread  ] [INFO ]  Features already cached: /mnt/seals/cache/features/vgg16/max/


In [135]:
data.x_train.shape

(10619, 5, 512)

In [136]:
architecture = Architecture(architecture="video_LRCNN_frozen", 
                            sequence_model = 'LSTM',
                            sequence_model_layers = 1,
                            sequence_length = sequence_length,
                            num_classes = data.num_classes, 
                            frame_size = data.frame_size, 
                            pretrained_model_name='vgg16', 
                            pooling='max',
                            layer_1_size=64, 
                            layer_2_size=32, 
                            layer_3_size=8, 
                            dropout=0.2,
                            convolution_kernel_size=3)

False False


In [137]:
# train model
fit_history = train(architecture.model, data, path_model = pwd+'models/', learning_rate = 0.001, epochs = 10)

Train on 10619 samples, validate on 1360 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.70294, saving model to /mnt/seals/models/model.h5
Epoch 2/10

Epoch 00002: val_acc improved from 0.70294 to 0.81985, saving model to /mnt/seals/models/model.h5
Epoch 3/10

Epoch 00003: val_acc improved from 0.81985 to 0.86985, saving model to /mnt/seals/models/model.h5
Epoch 4/10

Epoch 00004: val_acc improved from 0.86985 to 0.88088, saving model to /mnt/seals/models/model.h5
Epoch 5/10

Epoch 00005: val_acc improved from 0.88088 to 0.88456, saving model to /mnt/seals/models/model.h5
Epoch 6/10

Epoch 00006: val_acc did not improve from 0.88456
Epoch 7/10

Epoch 00007: val_acc improved from 0.88456 to 0.88897, saving model to /mnt/seals/models/model.h5
Epoch 8/10

Epoch 00008: val_acc did not improve from 0.88897
Epoch 9/10

Epoch 00009: val_acc did not improve from 0.88897
Epoch 10/10

Epoch 00010: val_acc improved from 0.88897 to 0.92132, saving model to /mnt/seals/models/model.

## lrcnn trainable

In [None]:
## train with generator

data = Data(sequence_length = sequence_length, 
            return_CNN_features = False, 
            pretrained_model_name=pretrained_model_name,
            pooling = pooling,
            batch_size=32,
            return_generator=True)

from keras.optimizers import Adam

architecture = Architecture(architecture="video_LRCNN_trainable", 
                            sequence_model = 'LSTM',
                            sequence_model_layers = 1,
                            sequence_length = sequence_length,
                            num_classes = data.num_classes, 
                            frame_size = data.frame_size, 
                            pretrained_model_name='vgg16', 
                            pooling='max',
                            layer_1_size=64, 
                            layer_2_size=32, 
                            layer_3_size=8, 
                            dropout=0.2,
                            convolution_kernel_size=3)

In [None]:
# create optimizer and compile
opt = Adam()
architecture.model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train model on dataset
architecture.model.fit_generator(generator=data.generator_train,
                    validation_data=data.generator_valid,
                    use_multiprocessing=True,
                    workers=8)

## 3d-CNN

In [160]:
# 3d-CNN
data = Data(sequence_length = 16, 
            return_CNN_features = False, 
            frame_size = (112,112))

In [161]:
model = Sequential()
model.add(Conv3D(
    32, (3,3,3), activation='relu', input_shape=(sequence_length, 112, 112, 3)
))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(64, (3,3,3), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(128, (3,3,3), activation='relu'))
# model.add(Conv3D(128, (3,3,3), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))
model.add(Conv3D(256, (2,2,2), activation='relu'))
# model.add(Conv3D(256, (2,2,2), activation='relu'))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))

model.add(Flatten())
model.add(Dense(1024))
model.add(Dropout(0.5))
model.add(Dense(1024))
model.add(Dropout(0.5))
model.add(Dense(data.num_classes, activation='softmax'))

ValueError: Negative dimension size caused by subtracting 3 from 1 for 'conv3d_6/convolution' (op: 'Conv3D') with input shapes: [?,1,26,26,64], [3,3,3,64,128].

In [None]:
opt = Adam(lr = 0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(data.x_train, data.y_train, 
              validation_data=(data.x_valid, data.y_valid),
              batch_size=32,
              epochs=10,
              shuffle=True,
              verbose=verbose)

In [None]:
np.mean(data.y_valid[:,1])

In [None]:
np.mean(data.y_train[:,1])

In [None]:
# model = Sequential()
# # 1st layer group
# model.add(Convolution3D(64, 3, 3, 3, activation='relu', 
#                         border_mode='same', name='conv1',
#                         subsample=(1, 1, 1), 
#                         input_shape=(3, 16, 112, 112)))
# model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), 
#                        border_mode='valid', name='pool1'))
# # 2nd layer group
# model.add(Convolution3D(128, 3, 3, 3, activation='relu', 
#                         border_mode='same', name='conv2',
#                         subsample=(1, 1, 1)))
# model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), 
#                        border_mode='valid', name='pool2'))
# # 3rd layer group
# model.add(Convolution3D(256, 3, 3, 3, activation='relu', 
#                         border_mode='same', name='conv3a',
#                         subsample=(1, 1, 1)))
# model.add(Convolution3D(256, 3, 3, 3, activation='relu', 
#                         border_mode='same', name='conv3b',
#                         subsample=(1, 1, 1)))
# model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), 
#                        border_mode='valid', name='pool3'))
# # 4th layer group
# model.add(Convolution3D(512, 3, 3, 3, activation='relu', 
#                         border_mode='same', name='conv4a',
#                         subsample=(1, 1, 1)))
# model.add(Convolution3D(512, 3, 3, 3, activation='relu', 
#                         border_mode='same', name='conv4b',
#                         subsample=(1, 1, 1)))
# model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), 
#                        border_mode='valid', name='pool4'))
# # 5th layer group
# model.add(Convolution3D(512, 3, 3, 3, activation='relu', 
#                         border_mode='same', name='conv5a',
#                         subsample=(1, 1, 1)))
# model.add(Convolution3D(512, 3, 3, 3, activation='relu', 
#                         border_mode='same', name='conv5b',
#                         subsample=(1, 1, 1)))
# model.add(ZeroPadding3D(padding=(0, 1, 1)))
# model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2), 
#                        border_mode='valid', name='pool5'))
# model.add(Flatten())
# # FC layers group
# model.add(Dense(4096, activation='relu', name='fc6'))
# model.add(Dropout(.5))
# model.add(Dense(4096, activation='relu', name='fc7'))
# model.add(Dropout(.5))
# model.add(Dense(487, activation='softmax', name='fc8'))

In [None]:
data.label_map

In [None]:
# # reorder default tensorflow axis order for 3D-CNN architecture
# data.x_train = np.moveaxis(data.x_train,4,1)
# data.x_valid = np.moveaxis(data.x_valid,4,1)
# data.x_test = np.moveaxis(data.x_test,4,1)

In [None]:
# num_classes = data.num_classes 
# frame_size = data.frame_size
# num_features = pretrained_model_len_features[pretrained_model_name]

In [None]:
architecture = Architecture(architecture="3DCNN", 
                     sequence_length = 16,
                     num_classes =  data.num_classes,
                     frame_size = data.frame_size)

# Evaluate (move to model)

In [None]:
evaluate(architecture.model, data)

In [None]:
def evaluate(model, data):
    """
    Evaluate model on test set, returning percentage of classes exactly correct and dataframe with predicted probabilities and labels
    
    :model: trained model object
    :data: data object
    """
    #######################
    ### predict on test set
    #######################

    # calculate predictions on test set
    predictions = model.predict(data.x_test)

    # build dataframe and calculate test error
    pdf = pd.DataFrame(predictions, columns = list(data.label_map.values()))
    pdf['prediction'] = pdf.idxmax(axis=1)
    truth = pd.DataFrame(data.y_test, columns = list(data.label_map.values()))
    truth['label'] = truth.idxmax(axis=1)
    truth = truth[['label']]
    pdf = pd.concat([pdf, truth], axis=1)
    pdf['error'] = (pdf['prediction'] != pdf['label']).astype(int)
    test_acc = 1 - pdf['error'].mean()
    print("test_acc {}".format(test_acc))

    return pdf, test_acc

# analyze results

In [None]:
path_models = pwd + 'models/'

results = []

for folder, subs, files in os.walk(path_models):
    for filename in files:
        if 'results.json' in filename:
            with open(os.path.abspath(os.path.join(folder, filename))) as f:
                data = json.load(f)
            results.append(data)

results = pd.DataFrame(results)        
results.sort_values("fit_val_acc", inplace=True, ascending=False)

In [None]:
results.head(10)