## Train Validate and Save

** Major bugs**

>1) used **argmax** for **label_test** tensor while computing correct prediction. That was needed only for one-hot encoded labels of MNIST

>2) Weights are not saved outside the session unless explicitly saved

> 3) Encountered **inf in logits** error after 9300 iterations of training (**model 2**) 

>4) Sometimes the weights still approach zero and training breaks

In [1]:
import tensorflow as tf
import os
%matplotlib inline
import matplotlib.pyplot as plt
tf.set_random_seed(0)

  return f(*args, **kwds)


### Utilities

In [2]:
def run_tf(x):
    
    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess: 
        sess.run(tf.global_variables_initializer())
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        out = sess.run(x)

        coord.request_stop()
        coord.join(threads)
        
        return out
    

class FLAGS(object):
    pass

FLAGS.batch_size = 128
FLAGS.data_dir = "/home/sankaran/exercise/ML/TF-Exercise/Tutorials/CIFAR/cifar-10-batches-bin"
FLAGS.num_preprocess_threads = 16
FLAGS.num_classes = 10
FLAGS.dtype = tf.float32
FLAGS.train = True

def distorted_inputs(data_dir, batch_size, distort=True):
    
    if(FLAGS.train):
        filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in range(1, 6)]
        print("using ", filenames)
    else:
        filenames = [os.path.join(data_dir, 'test_batch.bin')]
        print("using ", filenames)
        
        
    # Create a queue that produces the filenames to read.
    filename_queue = tf.train.string_input_producer(filenames,seed=0)
    
    #Create FixedLenthRecord Reader with fixed bytes to read
    record_bytes = 32*32*3+1 #32*32*3 image with 1 byte for label
    reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
    
    key, value = reader.read(filename_queue)
    
    ##Decode
    decoded = tf.decode_raw(value, tf.uint8)
    label = tf.strided_slice(decoded,[0],[1])
    image = tf.strided_slice(decoded,[1],[record_bytes])
    
    
    label = tf.cast(label,tf.int32)
    label = tf.reshape(label,[1])
    image = tf.reshape(image,[3,32,32])
    image = tf.transpose(image,[1,2,0])
    #image = tf.cast(image,tf.float32) ## DESTROYES IMAGE VIS
    
    ##PRE PROCESS
    if(distort and FLAGS.train):
        image = tf.random_crop(image, [24, 24, 3])
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_brightness(image,max_delta=0.4)
        image = tf.image.random_contrast(image,lower=0.5,upper=1.8)
    
    image = tf.image.convert_image_dtype(image,dtype=FLAGS.dtype)
    # Ensure that the random shuffling has good mixing properties.
    min_fraction_of_examples_in_queue = 0.4
    NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
    min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
                             min_fraction_of_examples_in_queue)
    
    images, label_batch = tf.train.shuffle_batch(
        [image, label],
        batch_size=batch_size,
        num_threads=FLAGS.num_preprocess_threads,
        capacity=min_queue_examples + 3 * batch_size,
        min_after_dequeue=min_queue_examples,
        seed=0)
    
    return [images,label_batch]

In [3]:
def inference(images):
    
    def weight_variable(shape,std=0.1,dtype=FLAGS.dtype):
      initializer = tf.truncated_normal_initializer(stddev=std, dtype=dtype)
      return tf.get_variable("Weights",shape,initializer=initializer,dtype=dtype)

    def bias_variable(shape,const=0.0,dtype=FLAGS.dtype):
      initializer = tf.constant_initializer(const,dtype)
      return tf.get_variable("biases",shape,initializer=initializer,dtype=dtype)
    
    print("input : ", images)
    
    with tf.variable_scope("conv1"):
        W_conv1 = weight_variable([5, 5, 3, 64],std=5e-2)
        b_conv1 = bias_variable([64],const=0.0)

        conv = tf.nn.conv2d(images,W_conv1,strides=[1,1,1,1],padding="SAME")
        h_conv1 = tf.nn.relu(conv + b_conv1)
        
        print("conv 1 : ",h_conv1)
        
    with tf.variable_scope("maxpool1_norm"):
        h_pool1 = tf.nn.max_pool(h_conv1,ksize=[1,3,3,1],strides=[1,2,2,1],padding="SAME")
        h_norm1 = tf.nn.lrn(h_pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1')
        
        print("pool_norm 1 : ",h_norm1)
        
    with tf.variable_scope("conv2"):
        W_conv2 = weight_variable([5, 5, 64, 64],std=5e-2)
        b_conv2 = bias_variable([64],const=0.1)

        conv = tf.nn.conv2d(h_norm1,W_conv2,strides=[1,1,1,1],padding="SAME")
        h_conv2 = tf.nn.relu(conv + b_conv2)
        
        print("conv 2 : ",h_conv2)
        
    with tf.variable_scope("norm_maxpool2"):
        h_norm2 = tf.nn.lrn(h_conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm2')
        h_pool2 = tf.nn.max_pool(h_norm2,ksize=[1,3,3,1],strides=[1,2,2,1],padding="SAME")
        
        print("norm_pool 2 : ",h_pool2)
        
    with tf.variable_scope("Flatten"):
        h_pool2_flat = tf.reshape(h_pool2, [FLAGS.batch_size, -1])
        
        print("flatten : ",h_pool2_flat)
        
    with tf.variable_scope("fc1"):
        W_fc1 = weight_variable([h_pool2_flat.shape[1].value, 384],std=0.04)
        b_fc1 = bias_variable([384],const=0.1)
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
        
        print("fc1 : ",h_fc1)
        
    with tf.variable_scope("fc2"):
        W_fc2 = weight_variable([384, 192],std=0.04)
        b_fc2 = bias_variable([192],const=0.1)
        h_fc2 = tf.nn.relu(tf.matmul(h_fc1, W_fc2) + b_fc2)
        
        print("fc2 : ",h_fc2)
        
    with tf.variable_scope("logit"):
        W_fc3 = weight_variable([192, FLAGS.num_classes],std=1/192.0)
        b_fc3 = bias_variable([FLAGS.num_classes],const=0.0)
        logit = tf.nn.relu(tf.matmul(h_fc2, W_fc3) + b_fc3)
        
        print("logit : ",logit)
        
        return logit

In [4]:
def regularizer(wd):
    fc1_w = tf.trainable_variables(scope='fc1/Weights')[0]
    wd_fc1 = tf.multiply(tf.nn.l2_loss(fc1_w), wd, name='fc1/weight_loss')

    fc2_w = tf.trainable_variables(scope='fc2/Weights')[0]
    wd_fc2 = tf.multiply(tf.nn.l2_loss(fc2_w), wd, name='fc2/weight_loss')
    
    return wd_fc1+wd_fc2

def loss(logit,labels):
    with tf.variable_scope("cross-entropy"):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(labels,[FLAGS.batch_size]), logits=logit)
        avg_cross_entropy = tf.reduce_mean(cross_entropy)
        
        return avg_cross_entropy

### Training

In [5]:
def train(learning_rate,decay_step,decay_rate,global_step):
    
    FLAGS.train = True
    with tf.variable_scope("Input-queue-train"):
        images,labels = distorted_inputs(FLAGS.data_dir,FLAGS.batch_size,distort=False)
        #tf.summary.image('images', images)

    logit = inference(images)
    tf.summary.histogram("Logits",logit,collections=["Train"])

    ce_loss = loss(logit,labels)
    tf.summary.scalar("mean_cross_entropy",ce_loss,collections=["Train"])

    total_loss = ce_loss + regularizer(0.004)
    tf.summary.scalar("total_loss",total_loss,collections=["Train"])

    lr = tf.train.exponential_decay(
                                    learning_rate=learning_rate,
                                    global_step=global_step,
                                    decay_steps=decay_step,
                                    decay_rate=decay_rate,
                                    staircase=True)

    tf.summary.scalar("learning_rate",lr,collections=["Train"])
    opt = tf.train.GradientDescentOptimizer(lr)
    grads = opt.compute_gradients(total_loss)

    train_op = opt.apply_gradients(grads, global_step=global_step)
    
    return (train_op,ce_loss)

### Evaluate accuracy

In [6]:
def eval_accuracy():
    
    FLAGS.train = False
    with tf.variable_scope("Input-queue-test"):
        images_test,labels_test = distorted_inputs(FLAGS.data_dir,FLAGS.batch_size,distort=False)

    logit = inference(images_test)
    ce_loss = loss(logit,labels_test)
    tf.summary.scalar("test_mean_cross_entropy",ce_loss,collections=["Test"])

    correct_prediction = tf.equal(tf.argmax(logit, 1), tf.reshape(tf.cast(labels_test,dtype=tf.int64),[FLAGS.batch_size,]))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    ac_summary = tf.summary.scalar("accuracy",accuracy,collections=["Test"])
    tf.add_to_collection("acc",accuracy)
    
    return (accuracy,ce_loss,tf.argmax(logit, 1))

### Run Session

In [7]:
tf.reset_default_graph()

learning_rate = tf.placeholder(FLAGS.dtype)
decay_step = tf.placeholder(tf.int32)
decay_rate = tf.placeholder(FLAGS.dtype)
global_step = tf.train.get_or_create_global_step()


train_op,train_loss = train(learning_rate,decay_step,decay_rate,global_step)

with tf.variable_scope(tf.get_variable_scope(), reuse=True): 
  test_op,test_loss,prediction = eval_accuracy()

using  ['/home/sankaran/exercise/ML/TF-Exercise/Tutorials/CIFAR/cifar-10-batches-bin/data_batch_1.bin', '/home/sankaran/exercise/ML/TF-Exercise/Tutorials/CIFAR/cifar-10-batches-bin/data_batch_2.bin', '/home/sankaran/exercise/ML/TF-Exercise/Tutorials/CIFAR/cifar-10-batches-bin/data_batch_3.bin', '/home/sankaran/exercise/ML/TF-Exercise/Tutorials/CIFAR/cifar-10-batches-bin/data_batch_4.bin', '/home/sankaran/exercise/ML/TF-Exercise/Tutorials/CIFAR/cifar-10-batches-bin/data_batch_5.bin']
input :  Tensor("Input-queue-train/shuffle_batch:0", shape=(128, 32, 32, 3), dtype=float32)
conv 1 :  Tensor("conv1/Relu:0", shape=(128, 32, 32, 64), dtype=float32)
pool_norm 1 :  Tensor("maxpool1_norm/norm1:0", shape=(128, 16, 16, 64), dtype=float32)
conv 2 :  Tensor("conv2/Relu:0", shape=(128, 16, 16, 64), dtype=float32)
norm_pool 2 :  Tensor("norm_maxpool2/MaxPool:0", shape=(128, 8, 8, 64), dtype=float32)
flatten :  Tensor("Flatten/Reshape:0", shape=(128, 4096), dtype=float32)
fc1 :  Tensor("fc1/Relu:0",

In [14]:
feed_dict = {learning_rate:0.1,
            decay_rate : 0.1,
            decay_step:10000}

max_steps = 5000
eval_every = 2000
max_eval_steps = 1000

In [23]:
merged_train = tf.summary.merge_all(key="Train")
merged_test = tf.summary.merge_all(key="Test")

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
with tf.Session(config=config) as sess: 
    writer = tf.summary.FileWriter("Train_val_log/3/",sess.graph)
    
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    for i in range(max_steps):
        FLAGS.train = True
        _,tr_loss,summary = sess.run([train_op,train_loss,merged_train],feed_dict=feed_dict)
        writer.add_summary(summary,i)
        if(i%100 == 0):
            print(FLAGS.train)
            print(i, "Loss : ", "{:.2e}".format(tr_loss))
        
        if(i%eval_every == 0 and i!=0):
            FLAGS.train = False
            print("Evaluating..")
            print(FLAGS.train)
            for j in range(max_eval_steps):
                acc,te_loss,pred,summary_te = sess.run([test_op,test_loss,prediction,merged_test],feed_dict=feed_dict)
                writer.add_summary(summary_te,j)
                if(j%100 == 0):
                    print("Testing : Loss : ", "{:.2e}".format(te_loss), " Accuracy : ","{:.2e}".format(acc), "Prediction : ", pred)
                

    coord.request_stop()
    coord.join(threads)
    
    saver = tf.train.Saver()
    saver.save(sess,"Train_val_log/3/models/model.chpt")


True
0 Loss :  2.30e+00
True
100 Loss :  2.27e+00
True
200 Loss :  2.30e+00
True
300 Loss :  2.17e+00
True
400 Loss :  2.09e+00
True
500 Loss :  2.03e+00
True
600 Loss :  2.01e+00
True
700 Loss :  1.78e+00
True
800 Loss :  1.89e+00
True
900 Loss :  1.65e+00
True
1000 Loss :  1.70e+00
True
1100 Loss :  1.65e+00
True
1200 Loss :  1.59e+00
True
1300 Loss :  1.58e+00
True
1400 Loss :  1.66e+00
True
1500 Loss :  1.50e+00
True
1600 Loss :  1.71e+00
True
1700 Loss :  1.62e+00
True
1800 Loss :  1.27e+00
True
1900 Loss :  1.56e+00
True
2000 Loss :  1.28e+00
Evaluating..
False
Testing : Loss :  1.68e+00  Accuracy :  4.61e-01 Prediction :  [0 6 0 0 2 9 2 6 0 8 9 8 0 6 2 6 5 9 0 2 0 6 5 5 8 6 6 4 0 8 8 7 7 9 9 4 8
 0 7 6 5 8 5 7 0 7 7 5 8 8 7 2 7 8 9 7 5 2 0 6 4 0 6 9 0 0 8 5 8 8 7 0 7 0
 0 7 6 0 7 8 9 0 8 9 0 4 0 4 5 6 9 5 0 9 7 8 0 0 2 2 9 0 7 4 7 7 2 9 0 8 4
 5 6 8 4 4 8 7 8 8 5 0 7 7 0 4 5 9]
Testing : Loss :  1.37e+00  Accuracy :  5.55e-01 Prediction :  [0 2 8 6 0 8 7 5 6 8 5 4 6 6 4 7 7 4 6 

In [24]:
from tensorflow.python.tools import inspect_checkpoint as chkp
chkp.print_tensors_in_checkpoint_file("Train_val_log/3/models/model.chpt",tensor_name='', all_tensors=False,all_tensor_names=True)

tensor_name:  conv1/Weights
tensor_name:  conv1/biases
tensor_name:  conv2/Weights
tensor_name:  conv2/biases
tensor_name:  fc1/Weights
tensor_name:  fc1/biases
tensor_name:  fc2/Weights
tensor_name:  fc2/biases
tensor_name:  global_step
tensor_name:  logit/Weights
tensor_name:  logit/biases


In [26]:
imported_meta = tf.train.import_meta_graph("Train_val_log/3/models/model.chpt.meta")

sess = tf.InteractiveSession()
imported_meta.restore(sess,tf.train.latest_checkpoint("Train_val_log/3/models/"))

INFO:tensorflow:Restoring parameters from Train_val_log/3/models/model.chpt


In [27]:
tf.get_default_graph().get_all_collection_keys()

['queue_runners',
 'Test',
 'train_op',
 'summaries',
 'trainable_variables',
 'variables',
 'global_step',
 'acc',
 'Train']

In [30]:
x = sess.graph.get_collection("acc")[1]

In [None]:
sess.run(x)

In [16]:
FLAGS.train = False
images_test,labels_test = distorted_inputs(FLAGS.data_dir,FLAGS.batch_size,distort=False)
with tf.variable_scope(tf.get_variable_scope(), reuse=True): 
    logit = inference(images_test)
correct_prediction = tf.equal(tf.argmax(logit, 1), tf.reshape(tf.cast(labels_test,dtype=tf.int64),[FLAGS.batch_size,]))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

using  ['/home/sankaran/exercise/ML/TF-Exercise/Tutorials/CIFAR/cifar-10-batches-bin/test_batch.bin']
input :  Tensor("shuffle_batch_1:0", shape=(128, 32, 32, 3), dtype=float32)
conv 1 :  Tensor("conv1_3/Relu:0", shape=(128, 32, 32, 64), dtype=float32)
pool_norm 1 :  Tensor("maxpool1_norm_3/norm1:0", shape=(128, 16, 16, 64), dtype=float32)
conv 2 :  Tensor("conv2_3/Relu:0", shape=(128, 16, 16, 64), dtype=float32)
norm_pool 2 :  Tensor("norm_maxpool2_3/MaxPool:0", shape=(128, 8, 8, 64), dtype=float32)
flatten :  Tensor("Flatten_3/Reshape:0", shape=(128, 4096), dtype=float32)
fc1 :  Tensor("fc1_3/Relu:0", shape=(128, 384), dtype=float32)
fc2 :  Tensor("fc2_3/Relu:0", shape=(128, 192), dtype=float32)
logit :  Tensor("logit_3/Relu:0", shape=(128, 10), dtype=float32)


In [21]:
run_tf([correct_prediction,tf.reshape(tf.cast(labels_test,dtype=tf.int64),[FLAGS.batch_size,]),accuracy])

[array([ True, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False, False,
        False, False, False,  True, False,  True, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False, False,
      

In [20]:
logit

<tf.Tensor 'logit_3/Relu:0' shape=(128, 10) dtype=float32>

### Error made

In [51]:
run_tf([labels_test,tf.argmax(labels_test,1)])

[array([[5],
        [6],
        [3],
        [7],
        [4],
        [6],
        [6],
        [9],
        [5],
        [2],
        [9],
        [5],
        [1],
        [0],
        [6],
        [2],
        [4],
        [3],
        [2],
        [3],
        [5],
        [5],
        [6],
        [4],
        [1],
        [9],
        [6],
        [2],
        [0],
        [2],
        [1],
        [4],
        [9],
        [7],
        [4],
        [0],
        [5],
        [7],
        [9],
        [4],
        [0],
        [4],
        [6],
        [1],
        [2],
        [8],
        [0],
        [4],
        [1],
        [3],
        [0],
        [0],
        [2],
        [4],
        [1],
        [7],
        [3],
        [0],
        [7],
        [6],
        [6],
        [4],
        [1],
        [3],
        [9],
        [0],
        [8],
        [2],
        [5],
        [2],
        [9],
        [5],
        [0],
        [3],
        [0],
        [4],
        [2],