In [1]:
import os
# Select GPUs through environment variables, set here ahead of the
# `import tensorflow` below: devices are enumerated in PCI bus order and only
# the devices with ids 2 and 3 are exposed to this process.
# NOTE(review): presumably these must stay above the TensorFlow import to take
# effect -- confirm before reordering this cell.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2, 3"
from math import ceil

import time
import ipywidgets as widgets
from IPython.display import display

import numpy as np
import tensorflow as tf
from tensorflow.python.training.summary_io import SummaryWriterCache

# Project-local modules (expected to be importable from the notebook's directory).
import graph_manager
import net
import eval_utils
import loss_utils
import tf_inputs
import tf_utils
import viz

import logging
# Only let WARNING and above through from the "tensorflow" logger.
logging.getLogger("tensorflow").setLevel(logging.WARNING)

## Base Configuration

--- 

In [2]:
# Dataset preset to train on: 'vedai' or 'stanford'.
data = 'vedai'

# Base configuration shared by every experiment in this notebook. Later cells
# copy this dict and add experiment-specific keys.
configuration = {}
if data == 'vedai':
    configuration['setting'] = 'vedai'
    configuration['exp_name'] = 'vedai'
    configuration['save_summaries_steps'] = 100
    configuration['save_evaluation_steps'] = 250
    configuration['num_epochs'] = 1000
elif data == 'stanford':
    configuration['setting'] = 'sdd'
    configuration['exp_name'] = 'sdd'
    configuration['save_summaries_steps'] = 200
    configuration['save_evaluation_steps'] = 500
    configuration['num_epochs'] = 120
else:
    # Fail fast with a readable message: without this branch an unknown name
    # only surfaces below as a KeyError on configuration['setting'].
    raise ValueError("Unknown dataset %r (expected 'vedai' or 'stanford')" % (data,))

## Metadata
# Path template; '%s' is filled with the dataset setting ('vedai' or 'sdd').
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
tfrecords_path = '/home/aroyer/indolentDetect/Data/metadata_%s.txt'
metadata = graph_manager.load_metadata(tfrecords_path % configuration['setting'])
configuration.update(metadata)
# 'data_classes' is not set above, so it is presumably provided by the metadata file.
configuration['num_classes'] = len(configuration['data_classes'])

## GPUs
configuration['num_gpus'] = 2
configuration['gpu_mem_frac'] = 1.

## Inputs Pipeline
configuration['subset'] = -1
configuration['batch_size'] = 16
configuration['test_batch_size'] = 16
configuration['shuffle_buffer'] = 2000

## Training
configuration['learning_rate'] = 1e-3
configuration['centers_localization_loss_weight'] = 1.
configuration['scales_localization_loss_weight']  = 1.
configuration['confidence_loss_weight']  = 5.
configuration['noobj_confidence_loss_weight']  = 1.
configuration['offsets_loss_weight']  = 1.

## Evaluation
configuration['save_checkpoint_secs'] = 3600
configuration['retrieval_intersection_threshold'] = [0.25, 0.5, 0.75]

# The return value is not used, so `configuration` is presumably completed in place.
graph_manager.finalize_configuration(configuration)

39688 training steps
...which means 121 epochs
10476 training samples (328 iters)
2619 validation samples (82 iters)

[41mConfig:[0m
[96mbatch_size:[0m 16
[96mcenters_localization_loss_weight:[0m 1.0
[96mconfidence_loss_weight:[0m 5.0
[96mdata_classes:[0m ['Biker', 'Bus', 'Car', 'Cart', 'Pedestrian', 'Skater']
[96mexp_name:[0m sdd
[96mfeature_keys:[0m ['im_id', 'num_boxes', 'bounding_boxes', 'classes']
[96mgpu_mem_frac:[0m 1.0
[96mimage_folder:[0m /home/aroyer/Datasets/sdd_images
[96mlast_test_batch_size:[0m 27
[96mlearning_rate:[0m 0.0005
[96mnoobj_confidence_loss_weight:[0m 1.0
[96mnum_classes:[0m 6
[96mnum_epochs:[0m 120
[96mnum_gpus:[0m 2
[96mnum_steps:[0m 39688
[96moffsets_loss_weight:[0m 1.0
[96mretrieval_intersection_threshold:[0m [0.25, 0.5, 0.75]
[96msave_checkpoint_secs:[0m 3600
[96msave_evaluation_steps:[0m 500
[96msave_summaries_steps:[0m 200
[96mscales_localization_loss_weight:[0m 1.0
[96msetting:[0m sdd
[96mshuffle_buffer:[0

## Network
---

In [None]:
def forward_pass(inputs,
                 outputs,
                 configuration,
                 is_training=True,
                 reuse=False,
                 verbose=False,
                 scope_name='model'):
    """Run the detection network on `inputs["image"]` inside a variable scope.

    Args:
        inputs: mapping; only its "image" entry is read here.
        outputs: object handed to `net.get_detection_outputs`; presumably
            filled in place, since this function itself returns nothing.
        configuration: dict expanded as extra keyword arguments into both
            `net` calls.
        is_training: forwarded to `net.tiny_yolo_v2`.
        reuse: forwarded to the variable scope and to both `net` calls.
        verbose: forwarded to both `net` calls.
        scope_name: name of the enclosing `tf.variable_scope`.
    """
    variable_scope = tf.variable_scope(scope_name, reuse=reuse)
    with variable_scope:
        # Feature extractor.
        images = inputs["image"]
        activations = net.tiny_yolo_v2(images,
                                       is_training=is_training,
                                       reuse=reuse,
                                       verbose=verbose,
                                       **configuration)
        # Detection head working on the extractor's activations.
        net.get_detection_outputs(activations,
                                  outputs,
                                  reuse=reuse,
                                  verbose=verbose,
                                  **configuration)
            
            
def train_pass(inputs, configuration, is_chief=False):
    """Build the training forward pass and register its losses.

    Args:
        inputs: input mapping handed to `forward_pass` and to the loss builder.
        configuration: configuration dict; its 'base_name' entry (resolved via
            `graph_manager.get_defaults`) names the scopes.
        is_chief: when True, variables are created rather than reused and
            progress information is printed.

    Returns:
        The `outputs` dict passed through `forward_pass` and the loss builder.
    """
    defaults = graph_manager.get_defaults(configuration, ['base_name'], verbose=is_chief)
    base_name = defaults[0]
    outputs = {}
    if is_chief:
        print(' \033[34m%s:\033[0m' % base_name)

    # Network: only the chief creates variables, every other call reuses them.
    with tf.name_scope('%s/net' % base_name):
        forward_pass(inputs,
                     outputs,
                     configuration,
                     is_training=True,
                     reuse=not is_chief,
                     verbose=is_chief,
                     scope_name=base_name)

    # Losses
    with tf.name_scope('%s/loss' % base_name):
        graph_manager.add_losses_to_graph(loss_utils.get_standard_loss,
                                          inputs,
                                          outputs,
                                          configuration,
                                          is_chief=is_chief,
                                          verbose=is_chief)

    if not is_chief:
        return outputs

    # Chief only: one line per output with its static shape and dtype.
    report = []
    for key, value in outputs.items():
        shape = value.get_shape().as_list()
        report.append("    \033[32m%s\033[0m: shape=%s, dtype=%s" % (key, shape, value.dtype))
    print('\n'.join(report))
    return outputs
        
    
def eval_pass(inputs, configuration, metrics_to_norms, clear_metrics_op, update_metrics_op,
              device=0, is_chief=False):
    """Build the evaluation forward pass and register its metric operations.

    Args:
        inputs: input mapping handed to `forward_pass` and the metric builders.
        configuration: configuration dict; its 'base_name' entry (resolved via
            `graph_manager.get_defaults`) names the scopes.
        metrics_to_norms, clear_metrics_op, update_metrics_op: containers
            forwarded untouched to `graph_manager.add_metrics_to_graph`;
            presumably extended in place by it.
        device: forwarded to `graph_manager.add_metrics_to_graph`.
        is_chief: when True, progress information is printed.

    Returns:
        The `outputs` dict passed through `forward_pass` and the metric builders.
    """
    defaults = graph_manager.get_defaults(configuration, ['base_name'], verbose=is_chief)
    base_name = defaults[0]
    outputs = {}
    if is_chief:
        print(' \033[34m%s:\033[0m' % base_name)

    # Network: evaluation always reuses the variables created by the train graph.
    with tf.name_scope('%s/net' % base_name):
        forward_pass(inputs,
                     outputs,
                     configuration,
                     is_training=False,
                     reuse=True,
                     verbose=is_chief,
                     scope_name=base_name)

    with tf.name_scope('%s/eval' % base_name):
        # First the running sample counters, then the standard metrics.
        metric_builders = (eval_utils.get_samples_running_counters,
                           eval_utils.get_standard_eval)
        for metric_builder in metric_builders:
            graph_manager.add_metrics_to_graph(metric_builder,
                                               inputs,
                                               outputs,
                                               metrics_to_norms,
                                               clear_metrics_op,
                                               update_metrics_op,
                                               configuration,
                                               device=device,
                                               verbose=is_chief)
    return outputs

## Train

---


In [None]:
########################################################################## Config
# Experiment-specific settings live in a copy, so the shared base
# `configuration` dict keeps its own top-level entries.
vanilla_configuration = dict(configuration)
vanilla_configuration.update(
    base_name='tinyyolov2',
    # Resolution parameters [I, J] and K
    image_size=1024,
    num_boxes=1,
)

# Finalize: tag the experiment name with the input resolution, then complete
# the grid-related entries (this is what provides 'retrieval_top_n' below,
# presumably among others).
vanilla_configuration['exp_name'] = '%s/yolov2_%d' % (
    vanilla_configuration['exp_name'], vanilla_configuration['image_size'])
graph_manager.finalize_grid_offsets(vanilla_configuration)
retrieval_message = 'Retrieval top k = %d (final)' % vanilla_configuration['retrieval_top_n']
print(retrieval_message)


# Build the train and evaluation graphs in a fresh tf.Graph, then run the
# training loop with periodic evaluation until the monitored session stops or
# the user interrupts the kernel.
with tf.Graph().as_default() as graph:
    ########################################################################## Train graph
    with tf.name_scope('train'):
        print('\n\033[44mLoad inputs:\033[0m')
        # Iterated below with one element per device.
        inputs = graph_manager.get_inputs(mode='train', verbose=True, **vanilla_configuration)
        viz.display_graph_size('inputs(train)')

        print('\n\033[43mTrain Graph:\033[0m')
        for i, train_inputs in enumerate(inputs):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('dev%d' % i):
                    # Device 0 is the chief: it creates the variables and is
                    # the only one that prints and adds summaries.
                    is_chief = (i == 0)
                    train_outputs = train_pass(train_inputs, vanilla_configuration, is_chief=is_chief)
                    if is_chief:
                        print(' \033[34msummaries:\033[0m')
                        graph_manager.add_summaries(
                            train_inputs, train_outputs, mode='train', **vanilla_configuration)
            viz.display_graph_size('train net (gpu:%d)' % i)

        # Training Objective
        with tf.name_scope('losses'):
            losses = graph_manager.get_total_loss()
            # The first item of every `losses` entry is summed into the
            # scalar that gets displayed during training.
            full_loss = tf.add_n([x[0] for x in losses])
        viz.display_graph_size('full loss')

        # Train op
        with tf.name_scope('train_op'):
            global_step, train_op = graph_manager.get_train_op(losses, **vanilla_configuration)
        viz.display_graph_size('train op')

        # Additional info
        with tf.name_scope('config_summary'):
            viz.add_text_summaries(vanilla_configuration)
            print('\n\033[43mLosses:\033[0m')
            print('\n'.join(["    \033[35m%s:\033[0m %s tensors" % (x, len(tf.get_collection(x)))
                            for x in tf.get_default_graph().get_all_collection_keys() if x.endswith('_loss')]))


    ##########################################################################  Evaluation graph
    with tf.name_scope('eval'):
        print('\n\033[43mTest Graph:\033[0m')
        update_metrics_op = []    # Store operations to update the metrics
        clear_metrics_op = []     # Store operations to reset the metrics
        metrics_to_norms = {}

        inputs = graph_manager.get_inputs(mode='test', verbose=False, **vanilla_configuration)
        viz.display_graph_size('inputs(test)')

        for i, val_inputs in enumerate(inputs):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('dev%d' % i):
                    is_chief = (i == 0)
                    val_outputs = eval_pass(val_inputs, vanilla_configuration, metrics_to_norms,
                                            clear_metrics_op, update_metrics_op, device=i, is_chief=is_chief)
                    if is_chief:
                        graph_manager.add_summaries(
                            val_inputs, val_outputs, mode='test', **vanilla_configuration)
            viz.display_graph_size('test net (gpu:%d)' % i)

        with tf.name_scope('eval'):
            print('    \x1b[32m%d\x1b[0m eval update ops' % len(update_metrics_op))
            print('    \x1b[32m%d\x1b[0m eval clear ops' % len(clear_metrics_op))
            # Collapse the per-device op lists into single ops to run.
            update_metrics_op = tf.group(*update_metrics_op)
            clear_metrics_op = tf.group(*clear_metrics_op)
            eval_summary_op = graph_manager.get_eval_op(metrics_to_norms)

        # Additional info
        print('\n\033[43mEval metrics:\033[0m')
        print('\n'.join(["    \033[35m%s:\033[0m %s tensors" % (x, len(tf.get_collection(x)))
                        for x in tf.get_default_graph().get_all_collection_keys()
                        if x.endswith('_eval')]))

    ########################################################################## Run
    # Bound before the `try` so the KeyboardInterrupt handler can always report
    # a step, even if the interrupt arrives while the log directory or the
    # session is still being set up.
    global_step_ = 0
    try:
        print('\n\033[44mLaunch session:\033[0m')
        graph_manager.generate_log_dir(vanilla_configuration)
        summary_writer = SummaryWriterCache.get(vanilla_configuration["log_dir"])
        print('    Log directory', os.path.abspath(vanilla_configuration["log_dir"]))

        with graph_manager.get_monitored_training_session(**vanilla_configuration) as sess:
            loss_widget = widgets.HTML(value="")
            start_time = time.time()
            print('\n\033[44mStart training:\033[0m')
            display(loss_widget)
            while not sess.should_stop():

                # Train
                global_step_, full_loss_, _ = sess.run([global_step, full_loss, train_op])

                # Evaluate every `save_evaluation_steps` steps (never at step 1)
                if (vanilla_configuration["save_evaluation_steps"] is not None and (global_step_ > 1)
                    and global_step_  % vanilla_configuration["save_evaluation_steps"] == 0):
                    sess.run(clear_metrics_op)
                    # One pass over the test set: accumulate the running metrics.
                    num_eval_iters = vanilla_configuration["test_num_iters_per_epoch"]
                    for eval_iter in range(num_eval_iters):
                        viz.display_eval(loss_widget, global_step_, eval_iter + 1, num_eval_iters, start_time)
                        sess.run(update_metrics_op)

                    # Write summary once all updates ran. Skipped when there was
                    # nothing to evaluate, instead of failing on an undefined
                    # `eval_summary`.
                    if num_eval_iters > 0:
                        eval_summary = sess.run(eval_summary_op)
                        summary_writer.add_summary(eval_summary, global_step_)
                        summary_writer.flush()

                # Display: refresh the loss widget every 20 steps (1, 21, 41, ...)
                if (global_step_ - 1) % 20 == 0:
                    viz.display_loss(loss_widget, global_step_, full_loss_, start_time,
                                     vanilla_configuration["train_num_samples_per_iter"],
                                     vanilla_configuration["train_num_samples"])

    except KeyboardInterrupt:
        print('\nInterrupted at step %d' % global_step_)

grid size [32 32]
Retrieval top k = 1024 (final)

[44mLoad inputs:[0m
    with default `num_threads` = 8
    with default `prefetch_capacity` = 1
    with default `data_augmentation_threshold` = 0.5
    with default `with_groups` = False
    with default `with_classification` = False
    pad [32mtrain[0m inputs with [32m0[0m dummy samples
    [32mbounding_boxes[0m: shape=[None, 100, 4], dtype=<dtype: 'float32'>
    [32mnum_boxes[0m: shape=[None], dtype=<dtype: 'int32'>
    [32mimage[0m: shape=[None, 1024, 1024, 3], dtype=<dtype: 'float32'>
    [32mobj_i_mask_bbs[0m: shape=[None, 32, 32, 1, 100], dtype=<dtype: 'float32'>
    [32mim_id[0m: shape=[None], dtype=<dtype: 'int32'>
    [32mis_flipped[0m: shape=[None], dtype=<dtype: 'float32'>
[37minputs(train) graph: 0.00 MB[0m

[43mTrain Graph:[0m
 [34mtinyyolov2:[0m
  > Use custom [32mtiny yolo v2[0m
    with default `weight_decay` = 0.0
    with default `normalizer_decay` = 0.9
    with default `num_filters` = [16,