In [1]:
WORKER_COUNT = 24

In [2]:
# whether to log each feature and sequence status
verbose = True

In [3]:
import gc
import os
import pandas as pd
pd.options.display.max_rows = 5000
import numpy as np
import json
import datetime
import matplotlib.pyplot as plt
import itertools
import sys
sys.path.append('..')

In [4]:
# setup paths
pwd = os.getcwd().replace("notebooks","")
path_cache = pwd + 'cache/'
path_data = pwd + 'data/'

In [5]:
# setup logging
# any explicit log messages or uncaught errors to stdout and file /logs.log
import logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s",
    handlers=[
        logging.FileHandler("{0}/{1}.log".format(pwd, "logs")),
        logging.StreamHandler()
    ])
# init logger
logger = logging.getLogger()
# make logger aware of any uncaught exceptions
def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
sys.excepthook = handle_exception

In [6]:
from deepvideoclassification.pretrained_CNNs import pretrained_model_names, pretrained_model_names_bucketed


Using TensorFlow backend.


# Create list of experiments to be run

* batch 1 = run frozen image MLP, LRCNNs and concat models on 1 of each pretrained_model_name in buckets (bucketed on feature sizes and limited to max sequence_length of 10)

* batch 2 = for best configurations from batch 1, run other pretrained models in buckets and run longer sequence lengths, maybe try different convolution kernel sizes

* batch 3 = run trainable MLP and LRCNN on best performing frozen variants

* batch 4 = run trainable but initializing with best CNN weights

* batch 5 = run C3D models

* batch 6 = analyze effect of dropout and pooling with best model

# Batch 1

In [None]:
experiment_batch_name = 'experiment_batch_1'

In [None]:
# init model id - need to make sure we pick up where we leave off don't overwrite it between batches
model_id_start = 0

In [None]:
# init list of experiments
experiments = []

In [None]:
pooling = 'max'
layer_sizes = [512, 256, 128, 0]
dropouts = [0.2]
sequence_lengths = [3,5,10]
sequence_models = ["LSTM", "SimpleRNN", "GRU", "Convolution1D"]
sequence_model_layer_counts = [1,2]

In [None]:
####################
### image_MLP_frozen 
####################

for pretrained_model_name in pretrained_model_names_bucketed:
    for layer_1_size in layer_sizes:
        for layer_2_size in layer_sizes:
            for layer_3_size in layer_sizes:
                for dropout in dropouts:

                    # build experiment parameters
                    experiment = {}
                    
                    experiment['architecture'] = 'image_MLP_frozen'
                    experiment['sequence_length'] = 1
                    experiment['pretrained_model_name'] = pretrained_model_name
                    experiment['layer_1_size'] = layer_1_size
                    experiment['layer_2_size'] = layer_2_size
                    experiment['layer_3_size'] = layer_3_size
                    experiment['dropout'] = dropout
                    experiment['pooling'] = 'max' # outperforms avg across all parameters
                    
                    # add to list of experiments
                    experiments.append(experiment)

In [None]:
####################
### video_MLP_concat
####################

for sequence_length in sequence_lengths:
    for pretrained_model_name in pretrained_model_names_bucketed:
        for layer_1_size in layer_sizes:
            for layer_2_size in layer_sizes:
                for layer_3_size in layer_sizes:
                    for dropout in dropouts:

                        # build experiment parameters
                        experiment = {}

                        experiment['architecture'] = 'video_MLP_concat'
                        experiment['pretrained_model_name'] = pretrained_model_name
                        experiment['layer_1_size'] = layer_1_size
                        experiment['layer_2_size'] = layer_2_size
                        experiment['layer_3_size'] = layer_3_size
                        experiment['dropout'] = dropout
                        experiment['pooling'] = 'max' # outperforms avg across all parameters
                        experiment['sequence_length'] = sequence_length

                        # add to list of experiments
                        experiments.append(experiment)

In [None]:
######################
### video_LRCNN_frozen
######################

for sequence_length in sequence_lengths:
    for pretrained_model_name in pretrained_model_names_bucketed:
        for layer_1_size in layer_sizes:
            for layer_2_size in layer_sizes:
                for layer_3_size in layer_sizes:
                    for dropout in dropouts:
                        for sequence_model in sequence_models:
                            for sequence_model_layers in sequence_model_layer_counts:

                                # build experiment parameters
                                experiment = {}

                                experiment['architecture'] = 'video_LRCNN_frozen'
                                experiment['pretrained_model_name'] = pretrained_model_name
                                experiment['layer_1_size'] = layer_1_size
                                experiment['layer_2_size'] = layer_2_size
                                experiment['layer_3_size'] = layer_3_size
                                experiment['dropout'] = dropout
                                experiment['pooling'] = 'max' # outperforms avg across all parameters
                                experiment['sequence_model'] = sequence_model
                                experiment['sequence_model_layers'] = sequence_model_layers
                                experiment['sequence_length'] = sequence_length

                                # add to list of experiments
                                experiments.append(experiment)

In [None]:
########################
### convert to dataframe
########################

experiments = pd.DataFrame(experiments)

### create model id column for this experiment batch
model_id_list = list(range(0,len(experiments)))
experiments['model_id'] = model_id_list

# assign to workers
experiments['WORKER'] = experiments['model_id'].apply(lambda x: x % WORKER_COUNT)

In [None]:
experiments.shape

In [None]:
############################################
### remove invalid experiment configurations
############################################

# Just won't run experiments for those model_ids - not an error that model ids not congituous count from 0!

# delete video experiments with 0 neurons in a layer with nonzero neurons in later layers
experiments = experiments[~((experiments['layer_1_size'] == 0) & (experiments['layer_2_size'] > 0))]
experiments = experiments[~((experiments['layer_1_size'] == 0) & (experiments['layer_3_size'] > 0))]
experiments = experiments[~((experiments['layer_2_size'] == 0) & (experiments['layer_3_size'] > 0))]

# delete video experiments where convolution_kernel_size > sequence_length (convolution_kernel_size defaults to 3 and not set in this batch)
experiments = experiments[~((experiments['sequence_model'] == 'Convolution1D') & (experiments['sequence_length']<=3))]

In [None]:
# delete LRCNN_frozen experiments with layer_1_size == 0
experiments = experiments[~((experiments['architecture'] == 'video_LRCNN_frozen') & (experiments['layer_1_size']==0))]

In [None]:
##################################
### output experiment batch to CSV
##################################
print(experiment_batch_name)
experiments.to_csv(pwd + "experiments/" + experiment_batch_name + '.csv', index=False)

In [None]:
print(experiments.shape)
experiments.tail().T

In [None]:
# upload to s3
response = os.system("aws s3 cp " + pwd + "experiments/" + experiment_batch_name + '.csv s3://thesisvids/penguins/' + experiment_batch_name + '.csv')
if response == 0:
    print("upload success")
else:
    print("upload error")

In [None]:
!aws s3 ls s3://thesisvids/penguins/

# Batch 2

In [None]:
# run other pretrained models for best configurations from batch 1
# and run longer sequence lengths

In [None]:
experiment_batch_name = 'experiment_batch_2'

In [None]:
# init model id - need to make sure we pick up where we leave off don't overwrite it between batches
model_id_start = pd.read_csv(pwd + "experiments/experiment_batch_1.csv")['model_id'].max() + 1

In [None]:
# init list of experiments
experiments = []

In [None]:
# TODO

In [None]:
########################
### convert to dataframe
########################

experiments = pd.DataFrame(experiments)

### create model id column for this experiment batch
model_id_list = list(range(0,len(experiments)))
experiments['model_id'] = model_id_list

# assign to workers
experiments['WORKER'] = experiments['model_id'].apply(lambda x: x % WORKER_COUNT)

In [None]:
##################################
### output experiment batch to CSV
##################################
print(experiment_batch_name)
experiments.to_csv(pwd + "experiments/" + experiment_batch_name + '.csv', index=False)

# Batch 3

In [None]:
# run trainable MLP and LRCNN on best performing frozen variants

In [None]:
#######################
### image_MLP_trainable
#######################

architecture = 'video_LRCNN_trainable'

In [None]:
#########################
### video_LRCNN_trainable
#########################

architecture = 'video_LRCNN_trainable'

# Batch 4

In [None]:
# run trainable but initializing with best CNN weights

# Batch 5

# Analyze results

## load results.json for all models into dataframe

In [7]:
path_models = pwd + 'models/'

results = []

for folder, subs, files in os.walk(path_models):
    for filename in files:
        if 'results.json' in filename:
            with open(os.path.abspath(os.path.join(folder, filename))) as f:
                data = json.load(f)
            results.append(data)

results = pd.DataFrame(results)        
results.sort_values("fit_val_acc", inplace=True, ascending=False)

In [28]:
results.sort_values("fit_test_acc", ascending=False).head().T

Unnamed: 0,354,1331,1713,1235,2127
architecture,video_lrcnn_frozen,video_mlp_concat,video_mlp_concat,video_mlp_concat,video_mlp_concat
batch_size,32,32,32,32,32
convolution_kernel_size,3,3,3,3,3
data_total_rows_test,3135,3130,3135,3135,3130
data_total_rows_train,60509,60399,60509,60509,60399
data_total_rows_valid,6408,6398,6408,6408,6398
dropout,0.2,0.2,0.2,0.2,0.2
fit_best_round,2,2,2,1,3
fit_dt_test_duration_seconds,24,1,0,0,1
fit_dt_test_end,2019-01-22 13:03:12,2019-01-21 00:06:24,2019-01-20 23:20:34,2019-01-20 23:13:01,2019-01-21 00:06:21


In [26]:
# results[results['model_id'].isin([362, 550, 162, 133, 3115, 3125])].T

In [22]:
results[results['architecture'] == 'video_mlp_concat'].head(5).T

Unnamed: 0,854,948,1362,2289,2475
architecture,video_mlp_concat,video_mlp_concat,video_mlp_concat,video_mlp_concat,video_mlp_concat
batch_size,32,32,32,32,32
convolution_kernel_size,3,3,3,3,3
data_total_rows_test,3137,3135,3137,3137,3137
data_total_rows_train,60553,60509,60553,60553,60553
data_total_rows_valid,6412,6408,6412,6412,6412
dropout,0.2,0.2,0.2,0.2,0.2
fit_best_round,3,3,2,1,2
fit_dt_test_duration_seconds,1,0,1,0,0
fit_dt_test_end,2019-01-20 21:15:03,2019-01-20 23:33:37,2019-01-20 21:37:34,2019-01-20 20:46:20,2019-01-20 21:17:15


In [21]:
results[results['architecture'] == 'image_mlp_frozen'].head(5).T

Unnamed: 0,1960,1931,951,2330,1765
architecture,image_mlp_frozen,image_mlp_frozen,image_mlp_frozen,image_mlp_frozen,image_mlp_frozen
batch_size,32,32,32,32,32
convolution_kernel_size,3,3,3,3,3
data_total_rows_test,3139,3139,3139,3139,3139
data_total_rows_train,60597,60597,60597,60597,60597
data_total_rows_valid,6416,6416,6416,6416,6416
dropout,0.2,0.2,0.2,0.2,0.2
fit_best_round,2,2,2,3,2
fit_dt_test_duration_seconds,2,1,1,1,1
fit_dt_test_end,2019-01-20 14:25:08,2019-01-20 14:01:02,2019-01-20 14:07:05,2019-01-20 13:54:52,2019-01-20 14:13:38


In [23]:
results[results['architecture'] == 'video_lrcnn_frozen'].head(5).T

Unnamed: 0,1241,2517,65,2273,621
architecture,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen
batch_size,32,32,32,32,32
convolution_kernel_size,3,3,3,3,3
data_total_rows_test,3135,3135,3135,3135,3135
data_total_rows_train,60509,60509,60509,60509,60509
data_total_rows_valid,6408,6408,6408,6408,6408
dropout,0.2,0.2,0.2,0.2,0.2
fit_best_round,2,3,3,2,1
fit_dt_test_duration_seconds,13,24,21,19,16
fit_dt_test_end,2019-01-22 06:16:20,2019-01-22 13:40:31,2019-01-22 12:00:10,2019-01-22 09:56:28,2019-01-22 07:30:45


In [8]:
results.head(10).T

Unnamed: 0,1241,2517,65,2273,621,490,374,557,1734,889
architecture,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen,video_lrcnn_frozen
batch_size,32,32,32,32,32,32,32,32,32,32
convolution_kernel_size,3,3,3,3,3,3,3,3,3,3
data_total_rows_test,3135,3135,3135,3135,3135,3135,3135,3135,3135,3135
data_total_rows_train,60509,60509,60509,60509,60509,60509,60509,60509,60509,60509
data_total_rows_valid,6408,6408,6408,6408,6408,6408,6408,6408,6408,6408
dropout,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
fit_best_round,2,3,3,2,1,2,2,2,2,2
fit_dt_test_duration_seconds,13,24,21,19,16,9,15,21,17,11
fit_dt_test_end,2019-01-22 06:16:20,2019-01-22 13:40:31,2019-01-22 12:00:10,2019-01-22 09:56:28,2019-01-22 07:30:45,2019-01-22 03:45:26,2019-01-22 07:35:15,2019-01-22 11:50:33,2019-01-22 08:08:19,2019-01-22 04:00:49


In [29]:
results.groupby("sequence_model").agg('max')['fit_val_acc']

sequence_model
Convolution1D    0.936084
GRU              0.951779
LSTM             0.949171
SimpleRNN        0.953295
Name: fit_val_acc, dtype: float64

In [31]:
results.groupby("sequence_length").agg('max')['fit_val_acc']

sequence_length
1.0     0.946495
3.0     0.949024
5.0     0.953295
10.0    0.945161
Name: fit_val_acc, dtype: float64

In [33]:
results.groupby("sequence_model").agg("max")['fit_val_acc']

sequence_model
Convolution1D    0.936084
GRU              0.951779
LSTM             0.949171
SimpleRNN        0.953295
Name: fit_val_acc, dtype: float64

## Merge done status onto experiments

In [10]:
experiment_batch_name = 'experiment_batch_1'

In [11]:
# load list of experiments
experiments = pd.read_csv(pwd + "experiments/" + experiment_batch_name + '.csv')

In [12]:
experiments = pd.merge(experiments, results[['model_id','fit_val_acc']], left_on='model_id', right_on='model_id', how='left')
experiments['done'] = (experiments['fit_val_acc']>0).astype(int)
del experiments['fit_val_acc']

In [13]:
experiments.head()

Unnamed: 0,architecture,dropout,layer_1_size,layer_2_size,layer_3_size,pooling,pretrained_model_name,sequence_length,sequence_model,sequence_model_layers,model_id,WORKER,done
0,image_MLP_frozen,0.2,512,512,512,max,inception_resnet_v2,1,,,0,0,1
1,image_MLP_frozen,0.2,512,512,256,max,inception_resnet_v2,1,,,1,1,1
2,image_MLP_frozen,0.2,512,512,128,max,inception_resnet_v2,1,,,2,2,1
3,image_MLP_frozen,0.2,512,512,0,max,inception_resnet_v2,1,,,3,3,1
4,image_MLP_frozen,0.2,512,256,512,max,inception_resnet_v2,1,,,4,4,1


In [15]:
print("{}/{} experiments done".format(experiments[experiments['done'] == 1].shape[0], len(experiments)))

2272/3054 experiments done


## total experiments, split by architecture

In [16]:
experiments['architecture'].value_counts()

video_LRCNN_frozen    2574
video_MLP_concat       360
image_MLP_frozen       120
Name: architecture, dtype: int64

## analyze remaining experiments, split on architecture

In [17]:
experiments[experiments['done']==0]['architecture'].value_counts()

video_LRCNN_frozen    782
Name: architecture, dtype: int64

In [18]:
experiments[experiments['architecture'] == 'video_MLP_concat']

Unnamed: 0,architecture,dropout,layer_1_size,layer_2_size,layer_3_size,pooling,pretrained_model_name,sequence_length,sequence_model,sequence_model_layers,model_id,WORKER,done
120,video_MLP_concat,0.2,512,512,512,max,inception_resnet_v2,3,,,192,0,1
121,video_MLP_concat,0.2,512,512,256,max,inception_resnet_v2,3,,,193,1,1
122,video_MLP_concat,0.2,512,512,128,max,inception_resnet_v2,3,,,194,2,1
123,video_MLP_concat,0.2,512,512,0,max,inception_resnet_v2,3,,,195,3,1
124,video_MLP_concat,0.2,512,256,512,max,inception_resnet_v2,3,,,196,4,1
125,video_MLP_concat,0.2,512,256,256,max,inception_resnet_v2,3,,,197,5,1
126,video_MLP_concat,0.2,512,256,128,max,inception_resnet_v2,3,,,198,6,1
127,video_MLP_concat,0.2,512,256,0,max,inception_resnet_v2,3,,,199,7,1
128,video_MLP_concat,0.2,512,128,512,max,inception_resnet_v2,3,,,200,8,1
129,video_MLP_concat,0.2,512,128,256,max,inception_resnet_v2,3,,,201,9,1


In [19]:
experiments[experiments['architecture'] == 'video_MLP_concat'].sort_values("sequence_length").head()

Unnamed: 0,architecture,dropout,layer_1_size,layer_2_size,layer_3_size,pooling,pretrained_model_name,sequence_length,sequence_model,sequence_model_layers,model_id,WORKER,done
120,video_MLP_concat,0.2,512,512,512,max,inception_resnet_v2,3,,,192,0,1
207,video_MLP_concat,0.2,512,256,0,max,resnet50,3,,,327,15,1
206,video_MLP_concat,0.2,512,256,128,max,resnet50,3,,,326,14,1
205,video_MLP_concat,0.2,512,256,256,max,resnet50,3,,,325,13,1
204,video_MLP_concat,0.2,512,256,512,max,resnet50,3,,,324,12,1


# Copy experiment files to s3

In [None]:
model_ids = list(results['model_id'])
model_ids.sort()

In [None]:
for i, model_id in enumerate(model_ids):
    
    path_model = pwd + '/models/' + str(model_id) + '/'

    # aws s3 ls on path returns 0 if it exists so check if doesn't exist, then sync
    if os.system("aws s3 ls s3://thesisvids/penguins/models/" + str(model_id) + "/") > 0:
        print("Synching {}/{} - model_id={}".format(i+1,len(model_ids),model_id))
        response = os.system("aws s3 sync " + path_model + " s3://thesisvids/penguins/models/" + str(model_id) + "/")
        if response != 0:
            print("ERROR syncing model_id = {}".format(model_id))
    else:
        print("Already synched {}/{} - model_id={}".format(i+1,len(model_ids),model_id))

# Copy experiment files from s3

In [None]:
# TODO 
# FIRST SYNCH TO S3 THEN DELETE MODELS FOLDER AND SYNC FROM S3

# Debug experiment worker

In [7]:
from deepvideoclassification.architectures import Architecture

In [8]:
WORKER_ID = 0
GPU_ID = 7
experiment_batch_name = 'experiment_batch_1'

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]=str(WORKER_ID)

In [9]:
# setup logging
# separate log file for each worker
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s, [%(levelname)-8s] [%(filename)s:%(lineno)d] %(message)s',
    handlers=[
        logging.FileHandler("{0}/{1}.log".format(pwd, "logs_" + str(WORKER_ID))),
        logging.StreamHandler()
    ])
# init logger - will pass this to our architecture
logger = logging.getLogger()

logger.info("Start worker {} (GPU={}) processing {}".format(WORKER_ID, GPU_ID, experiment_batch_name))

2019-01-21 18:07:24,402 [MainThread  ] [INFO ]  Start worker 0 (GPU=7) processing experiment_batch_1


In [10]:
# load list of experiments
experiments = pd.read_csv(pwd + "experiments/" + experiment_batch_name + '.csv')

In [11]:
experiments[experiments['model_id'].isin([2495, 2463])]

Unnamed: 0,architecture,dropout,layer_1_size,layer_2_size,layer_3_size,pooling,pretrained_model_name,sequence_length,sequence_model,sequence_model_layers,model_id,WORKER
1317,video_LRCNN_frozen,0.2,256,512,0,max,inception_resnet_v2,5,Convolution1D,2.0,2463,15
1349,video_LRCNN_frozen,0.2,256,256,0,max,inception_resnet_v2,5,Convolution1D,2.0,2495,23


In [12]:
# for row in experiments.values:
debug_model_id = 2519

row = list(experiments[experiments['model_id'] == debug_model_id].values[0])

# get experiment params from dataframe row
experiment = dict(zip(experiments.columns, row))

In [13]:
experiment

{'WORKER': 23,
 'architecture': 'video_LRCNN_frozen',
 'dropout': 0.2,
 'layer_1_size': 256,
 'layer_2_size': 128,
 'layer_3_size': 128,
 'model_id': 2519,
 'pooling': 'max',
 'pretrained_model_name': 'inception_resnet_v2',
 'sequence_length': 5,
 'sequence_model': 'Convolution1D',
 'sequence_model_layers': 2.0}

In [14]:
experiments[experiments['model_id'] == debug_model_id].T

Unnamed: 0,1373
architecture,video_LRCNN_frozen
dropout,0.2
layer_1_size,256
layer_2_size,128
layer_3_size,128
pooling,max
pretrained_model_name,inception_resnet_v2
sequence_length,5
sequence_model,Convolution1D
sequence_model_layers,2


In [15]:
print(str(experiment["model_id"]) + "   " + "X"*60)
logging.info("Begin experiment for model_id={} on GPU:{} ".format(experiment['model_id'], os.environ["CUDA_VISIBLE_DEVICES"]))
print(experiment)

architecture = Architecture(model_id = experiment['model_id'], 
                            architecture = experiment['architecture'], 
                            sequence_length = experiment['sequence_length'], 
                            pretrained_model_name = experiment['pretrained_model_name'],
                            pooling = experiment['pooling'],
                            sequence_model = experiment['sequence_model'],
                            sequence_model_layers = experiment['sequence_model_layers'],
                            layer_1_size = experiment['layer_1_size'],
                            layer_2_size = experiment['layer_2_size'],
                            layer_3_size = experiment['layer_3_size'],
                            dropout = experiment['dropout'],
                            verbose=True)

2019-01-21 18:07:26,400 [MainThread  ] [INFO ]  Begin experiment for model_id=2519 on GPU:0 
2019-01-21 18:07:26,403 [MainThread  ] [INFO ]  Model folder exists but no results found - potential error in previous model training
2019-01-21 18:07:26,404 [MainThread  ] [INFO ]  Loading data


2519   XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
{'pretrained_model_name': 'inception_resnet_v2', 'layer_2_size': 128, 'layer_3_size': 128, 'sequence_model': 'Convolution1D', 'sequence_model_layers': 2.0, 'layer_1_size': 256, 'WORKER': 23, 'architecture': 'video_LRCNN_frozen', 'dropout': 0.2, 'model_id': 2519, 'pooling': 'max', 'sequence_length': 5}


2019-01-21 18:07:27,351 [MainThread  ] [INFO ]  Features already cached: /mnt/seals/cache/features/inception_resnet_v2/max/
2019-01-21 18:07:27,352 [MainThread  ] [INFO ]  Loading features sequence data into memory [may take a few minutes]


Done initializing data with #samples: train=60509, valid=6408, test=3135


In [16]:
len(architecture.model.layers[-1].output_shape)

2

In [None]:
architecture.train_model()

Train on 60509 samples, validate on 6408 samples
Epoch 1/20


In [None]:
architecture.model.summary()