# ReLU

#  Softmax for MNIST

In [4]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Parameters
learning_rate = 0.01
training_epochs = 25
batch_size = 100
display_step = 1  # print the averaged cost every `display_step` epochs

# tf Graph Input
x = tf.placeholder(tf.float32, [None, 784]) # mnist data image of shape 28*28=784
y = tf.placeholder(tf.float32, [None, 10]) # 0-9 digits recognition => 10 classes

# Set model weights
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Construct model
logits = tf.matmul(x, W) + b  # unnormalised class scores
pred = tf.nn.softmax(logits) # Softmax

# Minimize error using cross entropy.
# Mathematically this is mean(-sum(y * log(softmax(logits)), axis=1)).  The
# fused op is applied to the logits instead of to `pred`, so a class
# probability that underflows to 0 can no longer turn into log(0) = -inf and
# a NaN cost (0 * -inf).
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
# Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            # Accumulate the epoch's mean batch loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test model: a prediction is correct when the most probable class
    # matches the position of the 1 in the one-hot label.
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy (mean of the 0/1 correctness flags)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # Evaluated inside the `with` block so the trained variable values are still live.
    print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))

Epoch: 0001 cost= 1.182142357
Epoch: 0002 cost= 0.664722067
Epoch: 0003 cost= 0.552636083
Epoch: 0004 cost= 0.498540523
Epoch: 0005 cost= 0.465420423
Epoch: 0006 cost= 0.442470942
Epoch: 0007 cost= 0.425431271
Epoch: 0008 cost= 0.412137324
Epoch: 0009 cost= 0.401391492
Epoch: 0010 cost= 0.392392294
Epoch: 0011 cost= 0.384764056
Epoch: 0012 cost= 0.378149798
Epoch: 0013 cost= 0.372362514
Epoch: 0014 cost= 0.367307299
Epoch: 0015 cost= 0.362718756
Epoch: 0016 cost= 0.358626538
Epoch: 0017 cost= 0.354886612
Epoch: 0018 cost= 0.351451644
Epoch: 0019 cost= 0.348343237
Epoch: 0020 cost= 0.345421698
Epoch: 0021 cost= 0.342717705
Epoch: 0022 cost= 0.340269641
Epoch: 0023 cost= 0.337910988
Epoch: 0024 cost= 0.335700768
Epoch: 0025 cost= 0.333688375
Optimization Finished!
Accuracy: 0.9137


# Neural Nets (NN) for MNIST

In [15]:
# Hyper-parameters for the 3-layer ReLU network
learning_rate = 0.01
training_epochs = 15
batch_size = 100
display_step = 1  # print the averaged cost every `display_step` epochs

# tf Graph Input
X = tf.placeholder(tf.float32, [None, 784]) # mnist data image of shape 28*28=784
Y = tf.placeholder(tf.float32, [None, 10]) # 0-9 digits recognition => 10 classes

# Weights: 784 -> 256 -> 256 -> 10, drawn from tf.random_normal
W1 = tf.Variable(tf.random_normal([784,256]))
W2 = tf.Variable(tf.random_normal([256,256]))
W3 = tf.Variable(tf.random_normal([256,10]))

# Biases, one per unit of the corresponding layer
B1 = tf.Variable(tf.random_normal([256]))
B2 = tf.Variable(tf.random_normal([256]))
B3 = tf.Variable(tf.random_normal([10]))

# Two ReLU hidden layers; the output layer is left linear because the loss
# below applies the softmax itself.
L1 = tf.nn.relu(tf.add(tf.matmul(X,W1),B1))
L2 = tf.nn.relu(tf.add(tf.matmul(L1,W2),B2))
hypothesis = tf.add(tf.matmul(L2,W3),B3)

# Keyword arguments make the logits/labels roles explicit and keep the call
# valid on TensorFlow releases that reject positional arguments for this op.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

In [16]:
# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # The number of batches per epoch does not change, so compute it once.
    total_batch = int(mnist.train.num_examples/batch_size)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Run the optimization op and fetch the loss in the same call, so
            # every batch costs one pass through the graph instead of two.
            _, c = sess.run([optimizer, cost],
                            feed_dict={X: batch_xs, Y: batch_ys})
            # Accumulate the epoch's mean batch loss
            avg_cost += c / total_batch
        # Display logs per epoch step.  Testing (epoch+1) matches the printed
        # epoch number, so the last epoch is reported whenever
        # training_epochs is a multiple of display_step.
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test model: compare the arg-max of the logits with the arg-max of the
    # one-hot label.
    correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print("Accuracy:", accuracy.eval({X: mnist.test.images, Y: mnist.test.labels}))

Epoch: 0001 cost= 38.215531665
Epoch: 0002 cost= 6.713962545
Epoch: 0003 cost= 2.911824695
Epoch: 0004 cost= 1.742142624
Epoch: 0005 cost= 1.320672049
Epoch: 0006 cost= 0.983924351
Epoch: 0007 cost= 0.812092398
Epoch: 0008 cost= 0.776286150
Epoch: 0009 cost= 0.625411877
Epoch: 0010 cost= 0.576501161
Epoch: 0011 cost= 0.494068335
Epoch: 0012 cost= 0.387775614
Epoch: 0013 cost= 0.341370435
Epoch: 0014 cost= 0.339820825
Epoch: 0015 cost= 0.287158297
Optimization Finished!
Accuracy: 0.9611


# Xavier initialization 
 - 초기화 방법 (weight initialization method)

In [17]:
def xavier_init(n_inputs, n_outputs, uniform=True):
  """Return a Xavier (Glorot) weight initializer for one layer.

  This method is designed to keep the scale of the gradients roughly the same
  in all layers.
  Xavier Glorot and Yoshua Bengio (2010):
           Understanding the difficulty of training deep feedforward neural
           networks. International conference on artificial intelligence and
           statistics.
  Args:
    n_inputs: The number of input nodes into each output.
    n_outputs: The number of output nodes for each input.
    uniform: If true use a uniform distribution, otherwise use a normal.
  Returns:
    An initializer: `tf.random_uniform_initializer` over
    [-sqrt(6 / (n_inputs + n_outputs)), +sqrt(6 / (n_inputs + n_outputs))]
    when `uniform` is true, otherwise `tf.truncated_normal_initializer` with
    stddev sqrt(3 / (n_inputs + n_outputs)).
  """
  # Imported here so the function works even when no earlier cell has
  # imported `math` (this cell is defined before the notebook's first
  # `import math`).
  import math

  if uniform:
    # 6 was used in the paper.
    init_range = math.sqrt(6.0 / (n_inputs + n_outputs))
    return tf.random_uniform_initializer(-init_range, init_range)
  else:
    # 3 gives us approximately the same limits as above since this repicks
    # values greater than 2 standard deviations from the mean.
    stddev = math.sqrt(3.0 / (n_inputs + n_outputs))
    return tf.truncated_normal_initializer(stddev=stddev)

In [None]:
# Start from an empty default graph.  tf.get_variable raises ValueError when a
# variable called "W1"/"W2"/"W3" already exists in the graph, which is what
# happens when this cell is executed a second time or after another cell has
# created variables with the same names.
tf.reset_default_graph()

learning_rate = 0.01
training_epochs = 15
batch_size = 100
display_step = 1  # print the averaged cost every `display_step` epochs

# tf Graph Input
X = tf.placeholder(tf.float32, [None, 784]) # mnist data image of shape 28*28=784
Y = tf.placeholder(tf.float32, [None, 10]) # 0-9 digits recognition => 10 classes

# Xavier Initialization: each weight matrix gets an initializer sized from
# its own (inputs, outputs) shape.
W1 = tf.get_variable("W1", shape=[784,256], initializer = xavier_init(784,256))
W2 = tf.get_variable("W2", shape=[256,256], initializer = xavier_init(256,256))
W3 = tf.get_variable("W3", shape=[256,10], initializer = xavier_init(256,10))

# Biases are still drawn from tf.random_normal
B1 = tf.Variable(tf.random_normal([256]))
B2 = tf.Variable(tf.random_normal([256]))
B3 = tf.Variable(tf.random_normal([10]))

# Two ReLU hidden layers; the output layer is left linear because the loss
# below applies the softmax itself.
L1 = tf.nn.relu(tf.add(tf.matmul(X,W1),B1))
L2 = tf.nn.relu(tf.add(tf.matmul(L1,W2),B2))
hypothesis = tf.add(tf.matmul(L2,W3),B3)

# Keyword arguments make the logits/labels roles explicit and keep the call
# valid on TensorFlow releases that reject positional arguments for this op.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.initialize_all_variables()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # The number of batches per epoch does not change, so compute it once.
    total_batch = int(mnist.train.num_examples/batch_size)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Run the optimization op and fetch the loss in the same call, so
            # every batch costs one pass through the graph instead of two.
            _, c = sess.run([optimizer, cost],
                            feed_dict={X: batch_xs, Y: batch_ys})
            # Accumulate the epoch's mean batch loss
            avg_cost += c / total_batch
        # Display logs per epoch step.  Testing (epoch+1) matches the printed
        # epoch number, so the last epoch is reported whenever
        # training_epochs is a multiple of display_step.
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test model: compare the arg-max of the logits with the arg-max of the
    # one-hot label.
    correct_prediction = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(Y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print("Accuracy:", accuracy.eval({X: mnist.test.images, Y: mnist.test.labels}))

## Neural Nets (NN) with ReLU, Xavier initialization, and dropout, trained with the Adam optimizer on MNIST data

In [19]:
import math
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

def xavier_init(n_inputs, n_outputs, uniform=True):
  """Build the Xavier (Glorot) initializer for a layer with the given fan sizes.

  The scheme aims to keep the gradient scale roughly equal across layers.
  Reference: Xavier Glorot and Yoshua Bengio (2010), "Understanding the
  difficulty of training deep feedforward neural networks", International
  conference on artificial intelligence and statistics.

  Args:
    n_inputs: The number of input nodes into each output.
    n_outputs: The number of output nodes for each input.
    uniform: If true use a uniform distribution, otherwise use a normal.
  Returns:
    An initializer (uniform or truncated normal, depending on `uniform`).
  """
  fan_total = n_inputs + n_outputs

  if not uniform:
    # The truncated normal repicks samples further than 2 standard deviations
    # from the mean, so a numerator of 3 gives approximately the same limits
    # as the uniform case.
    return tf.truncated_normal_initializer(stddev=math.sqrt(3.0 / fan_total))

  # Uniform case: 6 is the constant used in the paper.
  limit = math.sqrt(6.0 / fan_total)
  return tf.random_uniform_initializer(-limit, limit)

# Start from an empty default graph.  tf.get_variable raises ValueError when a
# variable called "W1"/"W2"/"W3" already exists in the graph, which is what
# happens when this cell is executed a second time or after another cell has
# created variables with the same names.
tf.reset_default_graph()

# Parameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100
display_step = 1  # print the averaged cost every `display_step` epochs


# tf Graph input
X = tf.placeholder("float", [None, 28*28])  # MNIST data input (img dimension: 28x28)
Y = tf.placeholder("float", [None, 10] )

# Store layers weight & bias
W1 = tf.get_variable("W1", shape=[784, 256], initializer = xavier_init(784, 256))
W2 = tf.get_variable("W2", shape=[256, 256], initializer = xavier_init(256, 256))
W3 = tf.get_variable("W3", shape=[256, 10], initializer = xavier_init(256, 10))


B1 = tf.Variable(tf.random_normal([256]))
B2 = tf.Variable(tf.random_normal([256]))
B3 = tf.Variable(tf.random_normal([10]))


# Construct model
# NOTE: despite its name, `dropout_rate` is fed as the probability of *keeping*
# a unit: it is passed as the second argument of tf.nn.dropout and is fed 0.7
# while training and 1.0 (nothing dropped) when evaluating.
dropout_rate = tf.placeholder("float")
_L1 = tf.nn.relu( tf.add(tf.matmul(X,W1),B1) )
L1 = tf.nn.dropout(_L1, dropout_rate)
_L2 = tf.nn.relu( tf.add(tf.matmul(L1,W2),B2) )
L2 = tf.nn.dropout(_L2, dropout_rate)
hypothesis = tf.add( tf.matmul(L2, W3), B3)


# Define loss and optimizer
# Keyword arguments make the logits/labels roles explicit and keep the call
# valid on TensorFlow releases that reject positional arguments for this op.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=hypothesis, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost) # Adam optimizer.

# Initializing the variables
init = tf.initialize_all_variables()

# Launching the graph
with tf.Session() as sess:
    sess.run(init)

    # The number of batches per epoch does not change, so compute it once.
    total_batch = int(mnist.train.num_examples/batch_size)

    # Training cycle.
    for epoch in range(training_epochs):
        avg_cost = 0.

        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)

            # Fit training using batch data (dropout active).
            sess.run(optimizer, feed_dict={X:batch_xs, Y:batch_ys, dropout_rate:0.7} )

            # Compute average loss.  This is deliberately a second run with
            # dropout switched off (1.0), so the reported cost is that of the
            # full network rather than of a randomly thinned one.
            avg_cost += sess.run(cost, feed_dict={X:batch_xs, Y:batch_ys, dropout_rate:1.0})/total_batch

        # Display logs per epoch step.  Testing (epoch+1) matches the printed
        # epoch number, so the last epoch is reported whenever
        # training_epochs is a multiple of display_step.
        if (epoch+1) % display_step == 0:
            print( "Epoch:", '%04d'%(epoch+1), "cost=","{:.9f}".format(avg_cost))

    print("Optimization finished.")

    # Test model: compare the arg-max of the logits with the arg-max of the
    # one-hot label.
    correct_prediction = tf.equal(tf.argmax(hypothesis,1), tf.argmax(Y,1))

    # Calculate accuracy (dropout disabled for evaluation).
    accuracy = tf.reduce_mean( tf.cast(correct_prediction,"float") )
    print("Accuracy:", accuracy.eval({X:mnist.test.images,Y:mnist.test.labels, dropout_rate:1.0 }))

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
Epoch: 0001 cost= 0.345570954
Epoch: 0002 cost= 0.124843770
Epoch: 0003 cost= 0.085858977
Epoch: 0004 cost= 0.064651518
Epoch: 0005 cost= 0.051763271
Epoch: 0006 cost= 0.043569133
Epoch: 0007 cost= 0.036085708
Epoch: 0008 cost= 0.030195680
Epoch: 0009 cost= 0.025588613
Epoch: 0010 cost= 0.021576271
Epoch: 0011 cost= 0.019802653
Epoch: 0012 cost= 0.017772669
Epoch: 0013 cost= 0.016121101
Epoch: 0014 cost= 0.013686364
Epoch: 0015 cost= 0.011981746
Optimization finished.
Accuracy: 0.9808
