<a href="https://colab.research.google.com/github/akoo-45/nevis-CNN/blob/master/vgg16b_tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook runs a VGG16b convolutional neural network for classifying images of 5 particle types in a simulated LArTPC detector, available from the [public dataset](http://deeplearnphysics.org/DataChallenge/). We use TensorFlow to train the network and larcv_threadio to fetch data from larcv files.

To run the file: 


```
ssh hopper
cd /data/ashley.koo/larcv-tutorial
# Download the necessary libraries (see the appendix in the writeup:
# https://docs.google.com/document/d/1jElkhcZG15OG6Azza3dgda2aagX1vHUS5YlMw8LcOBc/edit)
python akoo_vgg16b_tf.py
```



# **Imports**

In [0]:
from larcv import larcv
from larcv.dataloader2 import larcv_threadio
import numpy as np
import os,sys,time
import tensorflow as tf
import tensorflow.contrib.slim as slim # useful library for defining complex models like VGG16b with repeated layers
import tensorflow.python.platform


# TensorFlow/GPU start-up configuration, written into this process's environment:
#   TF_CPP_MIN_LOG_LEVEL = '3'          -> keep TensorFlow's native (C++) logging quiet
#   CUDA_DEVICE_ORDER    = 'PCI_BUS_ID' -> number GPUs by PCI bus position
#   CUDA_VISIBLE_DEVICES = '2'          -> expose only the GPU with index 2
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '2',
})

[Slim](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim) is useful for VGG16 architectures because they contain repeated layers with the same parameters; Slim's handy `slim.repeat()` function builds such a group of layers in a single call.

# Configurations

In [0]:
# Paths of the larcv IO configuration files, relative to the tutorial directory
TUTORIAL_DIR     = '.'
TRAIN_IO_CONFIG  = os.path.join(TUTORIAL_DIR, 'tf/io_train.cfg') # configuration file stored in './tf/io_train.cfg'
TEST_IO_CONFIG   = os.path.join(TUTORIAL_DIR, 'tf/io_test.cfg' ) # configuration file stored in './tf/io_test.cfg'
TRAIN_BATCH_SIZE = 10    # batch size handed to the training reader
TEST_BATCH_SIZE  = 100   # batch size handed to the test reader
LOGDIR           = 'log' # root directory of the summary logs
ITERATIONS       = 5000  # number of training iterations
SAVE_SUMMARY     = 20    # write train/test summaries every this many iterations
SAVE_WEIGHTS     = 100   # save a checkpoint every this many iterations


def ensure_empty_dir(path):
  """Create `path` if it is missing and insist that it holds no entries.

  Args:
    path: directory that must exist and be empty afterwards.

  Raises:
    OSError: if `path` already contains files or sub-directories. The message
      names the offending directory and is also written to stderr.
  """
  if not os.path.isdir(path): os.makedirs(path)
  if os.listdir(path):
    message = 'Error: train or test log dir not empty: %s' % path
    sys.stderr.write(message + '\n')
    raise OSError(message)


# Separate log directories for the training and the test summaries.
# A run refuses to start on top of the logs of an earlier run.
train_logdir = os.path.join(LOGDIR,'train')
test_logdir  = os.path.join(LOGDIR,'test')
ensure_empty_dir(train_logdir)
ensure_empty_dir(test_logdir)

# Configure data reader
We prepare two data reader instances: one for training and another for testing the network. 

In [0]:
#
# Step 0: IO
#
def _launch_reader(reader_cfg, batch_size):
  """Create a threaded larcv reader, configure and start it, and request its first batch.

  Args:
    reader_cfg: configuration dictionary passed to `configure`.
    batch_size: batch size passed to `start_manager`.

  Returns:
    The started `larcv_threadio` instance.
  """
  reader = larcv_threadio()          # IO interface
  reader.configure(reader_cfg)
  reader.start_manager(batch_size)   # starts the read thread
  # Pause before asking for the first batch -- presumably to give the read
  # thread time to fill its buffer (TODO confirm).
  time.sleep(2)
  reader.next()
  return reader

# Reader for the training dataset
train_io_cfg = dict(filler_name='TrainIO', verbosity=10, filler_cfg=TRAIN_IO_CONFIG)
train_io = _launch_reader(train_io_cfg, TRAIN_BATCH_SIZE)

# Reader for the test dataset
test_io_cfg = dict(filler_name='TestIO', verbosity=10, filler_cfg=TEST_IO_CONFIG)
test_io = _launch_reader(test_io_cfg, TEST_BATCH_SIZE)

# Defining a network
We use 13 convolution layers arranged in five stages (2, 2, 3, 3 and 3 layers). Each stage is followed by a max-pooling operation, and a final fully-connected layer produces one score per class.



In [0]:
# 
# Step 1: Define Network
#
def build(inputs, num_class=4, trainable=True, debug=True):
  """Build the VGG16-style classifier graph and return its raw logits.

  Five convolution stages (2, 2, 3, 3 and 3 layers of 3x3 convolutions with
  64, 128, 256, 512 and 512 filters), each followed by 2x2 max-pooling with
  stride 2, feed a single fully-connected layer with `num_class` outputs.

  Args:
    inputs: 4-D image tensor fed to the first convolution.
    num_class: number of outputs of the final layer (converted with int()).
    trainable: whether the convolution / fully-connected variables are
      created as trainable.
    debug: if True, print the tensor shape after stages 4 and 5, after
      flattening and after the final layer. Nothing is printed otherwise.

  Returns:
    The output of the final fully-connected layer: un-normalised logits,
    with no activation and no softmax applied.
  """
  def report(stage, tensor):
    # Shape tracing is opt-in through the debug flag.
    if debug: print(stage, tensor.shape)

  filters = 64
  with slim.arg_scope([slim.conv2d, slim.fully_connected], trainable=trainable,
                      activation_fn=tf.nn.relu):
    # conv1: the very first convolution already downsamples with stride 2
    net = slim.conv2d(inputs, filters, [3, 3], stride=2, scope='conv1_1')
    net = slim.conv2d(net, filters, [3, 3], stride=1, scope='conv1_2')
    # pool1 - stride 2
    net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool1')
    filters *= 2 # num features X 2

    with slim.arg_scope([slim.conv2d], trainable=trainable, stride=1):
      # conv2: slim.repeat creates conv2_1, conv2_2 under the 'conv2' scope
      net = slim.repeat(net, 2, slim.conv2d, filters, [3, 3], scope='conv2')
      # pool2 - stride 2
      net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool2')
      filters *= 2

      # conv3: three repeated layers under the 'conv3' scope
      net = slim.repeat(net, 3, slim.conv2d, filters, [3, 3], scope='conv3')
      # pool3 - stride 2
      net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool3')
      filters *= 2

      # conv4: three repeated layers under the 'conv4' scope
      net = slim.repeat(net, 3, slim.conv2d, filters, [3, 3], scope='conv4')
      # pool4 - stride 2
      net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool4')
      report("After step 4", net)

      # conv5: same filter count as conv4 (no doubling after pool4)
      net = slim.repeat(net, 3, slim.conv2d, filters, [3, 3], scope='conv5')
      # pool5 - stride 2
      net = slim.max_pool2d(net, [2, 2], stride=2, scope='pool5')
      report("After step 5", net)

    with tf.variable_scope('final'):
      net = slim.flatten(net, scope='flatten')
      report('After flattening', net)

      # The enclosing arg_scope makes ReLU the default activation. The logits
      # must stay un-activated, otherwise every class score is clamped to >= 0
      # before the softmax cross-entropy is computed.
      net = slim.fully_connected(net, int(num_class), activation_fn=None, scope='fc6')
      report('After final_fc', net)

  # No softmax here: the loss op applies it to these logits.
  return net

# Build the Network

Build the network and define the loss, the accuracy metric and our solver. Any optimizer should work, but you may have to tune its parameters yourself. Here, we use RMSPropOptimizer with a base learning rate of 0.00005, chosen without particular justification. Note that we add a minimal set of TensorFlow variables to tf.summary in order to demonstrate TensorBoard later, a dedicated monitoring/visualization tool for network training with TensorFlow.

In [0]:
#
# Step 2: Build network + define loss & solver
#
# Retrieve the dimensions of the image and label batches for network construction
dim_data  = train_io.fetch_data('train_image').dim()
dim_label = train_io.fetch_data('train_label').dim()

# Placeholders: images arrive flattened (one row per image) and are reshaped
# back to 4-D using dimensions 1..3 reported by the reader.
data_tensor    = tf.placeholder(tf.float32, [None, dim_data[1] * dim_data[2] * dim_data[3]], name='image')
label_tensor   = tf.placeholder(tf.float32, [None, dim_label[1]], name='label')
data_tensor_2d = tf.reshape(data_tensor, [-1,dim_data[1],dim_data[2],dim_data[3]],name='image_reshape')

# Log at most 10 input images per summary (max_outputs is a cap, not a random sample)
tf.summary.image('input',data_tensor_2d,10)

# Build the network; the number of classes is taken from the label width
net = build(inputs=data_tensor_2d, trainable=True, num_class=dim_label[1], debug=False)

# Accuracy: fraction of samples whose arg-max prediction matches the arg-max of the label
with tf.name_scope('accuracy'):
  correct_prediction = tf.equal(tf.argmax(net,1), tf.argmax(label_tensor,1))
  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  tf.summary.scalar('accuracy', accuracy)

# Loss + backprop as the training step. This scope sits beside 'accuracy'
# rather than inside it, so the loss summary is tagged 'train/cross_entropy'.
LEARNING_RATE = 0.00005
with tf.name_scope('train'):
  print('label_tensor', label_tensor)
  print('logits', net)
  cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=label_tensor, logits=net))
  tf.summary.scalar('cross_entropy',cross_entropy)
  train_step = tf.train.RMSPropOptimizer(LEARNING_RATE).minimize(cross_entropy)


# Defining tensorflow IO

In the next cell we define tensorflow's IO

- `merged_summary` is a TensorFlow operation that creates the summaries to be written into a log file for TensorBoard.
- `writer_train` writes monitoring data for the training data sample into a log file.
- `writer_test` is the same as `writer_train`, except that it is for the testing data sample.
- `saver` is a handle to store the state of the network, i.e. the trained network variable values (weights, biases, etc.).


In [0]:
#
# Step 3: weight saver & summary writer
#
# Single op that evaluates every summary registered so far
merged_summary = tf.summary.merge_all()

# Open a session and give all variables their initial values
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# One summary writer per data sample; each one also records the graph
writer_train = tf.summary.FileWriter(train_logdir)
writer_test = tf.summary.FileWriter(test_logdir)
for writer in (writer_train, writer_test):
  writer.add_graph(sess.graph)

# Handle used to store the network weights
saver = tf.train.Saver()

# Train

In [0]:
#
# Step 4: Run training loop
#
# Checkpoint path prefix handed to saver.save(). Its parent directory is
# created up front because the save fails when that directory is missing.
WEIGHTS_PREFIX = 'weights/toynet'
weights_dir = os.path.dirname(WEIGHTS_PREFIX)
if not os.path.isdir(weights_dir): os.makedirs(weights_dir)

for i in range(ITERATIONS):

    # Current training batch from the reader
    train_data  = train_io.fetch_data('train_image').data()
    train_label = train_io.fetch_data('train_label').data()

    feed_dict = { data_tensor  : train_data,
                  label_tensor : train_label }

    # One optimisation step; loss and accuracy come from the same run
    loss, acc, _ = sess.run([cross_entropy, accuracy, train_step], feed_dict=feed_dict)

    if (i+1)%SAVE_SUMMARY == 0:
        # Save train log
        sys.stdout.write('Training in progress @ step %d loss %g accuracy %g          \n' % (i,loss,acc))
        sys.stdout.flush()
        s = sess.run(merged_summary, feed_dict=feed_dict)
        writer_train.add_summary(s,i)

        # Calculate & save test log (no train_step here: evaluation only)
        test_data  = test_io.fetch_data('test_image').data()
        test_label = test_io.fetch_data('test_label').data()
        feed_dict  = { data_tensor  : test_data,
                       label_tensor : test_label }
        loss, acc = sess.run([cross_entropy, accuracy], feed_dict=feed_dict)
        sys.stdout.write('Testing in progress @ step %d loss %g accuracy %g          \n' % (i,loss,acc))
        sys.stdout.flush()
        s = sess.run(merged_summary, feed_dict=feed_dict)
        writer_test.add_summary(s,i)

        # Advance the test reader only after its batch has actually been used
        test_io.next()

    # Request the next training batch for the following iteration
    train_io.next()

    if (i+1)%SAVE_WEIGHTS == 0:
        ssf_path = saver.save(sess,WEIGHTS_PREFIX,global_step=i)
        print('saved @',ssf_path)

# Push any summaries still buffered by the writers to disk
writer_train.flush()
writer_test.flush()

# inform log directory
print()
print('Run `tensorboard --logdir=%s` in terminal to see the results.' % LOGDIR)
train_io.reset()
test_io.reset()