Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

First reload the data we generated in `1_notmnist.ipynb`.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (10000, 28, 28) (10000,)
Validation set (1000, 28, 28) (1000,)
Test set (1000, 28, 28) (1000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (10000, 784) (10000, 10)
Validation set (1000, 784) (1000, 10)
Test set (1000, 784) (1000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

---
Logistic Model
---------

L2 Regularization


In [18]:
graph = tf.Graph()
with graph.as_default():

    ### Input data ###
    # Load the training, validation and test data into constants that are attached to the graph.
    tf_train_dataset = tf.constant(train_dataset)
    tf_train_labels = tf.constant(train_labels)
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    ### Variables ###
    # These are the parameters that we are going to be training. 
    # The weight matrix will be initialized using random values following a (truncated) normal distribution.
    # The biases get initialized to zero.
    
    normalDist = tf.truncated_normal(shape=[image_size * image_size, num_labels])#Outputs random values from a truncated normal distribution.
    weights = tf.Variable(normalDist)
    
    zeros = tf.zeros([num_labels])
    biases = tf.Variable(zeros)

    
    ### Training computation ###
    # We multiply the inputs with the weight matrix, and add biases. 
    # We compute the softmax and cross-entropy (it's one operation in TensorFlow, because it's very common, and it can be optimized).
    # We take the average of this cross-entropy across all training examples --> that's our loss.
    
    logits = tf.matmul(tf_train_dataset, weights) + biases #Multiplicar input con matriz de pesos y sumamos bias

    #LOSS
    '''
    L2 REGULARIZATION
    tf.nn.l2_loss: Computes half the L2 norm of a tensor without the `sqrt`:
                   output = sum(t ** 2) / 2
    '''
    BETA = 0.01
    softMaxCrossEntropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits)
    L2Reg = tf.nn.l2_loss(weights) #Añadimos regularización L2 a los Weights
    loss = tf.reduce_mean(softMaxCrossEntropy + BETA*L2Reg)
    print('Loss w/L2: ',loss)

    ### Optimizer ###
    # We are going to find the minimum of this loss using gradient descent.

    GDOptimizer = tf.train.GradientDescentOptimizer(0.5)
    optimizer = GDOptimizer.minimize(loss)
    
    ### Predictions for the training, validation, and test data ###
    # These are not part of training, but merely here so that we can report accuracy figures as we train.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

Loss w/L2:  Tensor("Mean:0", shape=(), dtype=float32)


In [19]:
num_steps = 801

#Calcula la accuracy
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

with tf.Session(graph=graph) as session:
    # This is a one-time operation which ensures the parameters get initialized as we described in the graph
    tf.global_variables_initializer().run()
    print('Initialized')
    
    # Run the computations.
    for step in range(num_steps):
        # We tell .run() that we want to run the optimizer, and get the loss value and the training predictions
        #Nuestro 'fetches' es una lista con [ElOptimizer, El Loss, Las predicciones] y lo mismo nos retornará
        dummyOpt, returnedLoss, returnedPred = session.run(fetches=[optimizer, loss, train_prediction])
        
        if (step % 100 == 0):
            print('Loss at step ',step,':',returnedLoss)
            
            acc = accuracy(returnedPred,train_labels)
            print('Training accuracy: ',acc)

            # Calling .eval() on valid_prediction is basically like calling run(), but just to get that one returned value.
            # Note that it recomputes all its graph dependencies.
            #VALID#
            validPred = valid_prediction.eval()
            validAcc = accuracy(validPred,valid_labels)
            print('Validation accuracy: ',validAcc)
            #TEST#
            testPred = test_prediction.eval()
            testAcc = accuracy(testPred,test_labels)
            print('Validation accuracy: ',testAcc)
            

Initialized
Loss at step  0 : 50.2545
Training accuracy:  5.78
Validation accuracy:  6.7
Validation accuracy:  8.2
Loss at step  100 : 11.8161
Training accuracy:  73.24
Validation accuracy:  74.2
Validation accuracy:  80.5
Loss at step  200 : 4.48081
Training accuracy:  78.53
Validation accuracy:  79.0
Validation accuracy:  83.5
Loss at step  300 : 1.98754
Training accuracy:  81.99
Validation accuracy:  81.4
Validation accuracy:  85.9
Loss at step  400 : 1.14146
Training accuracy:  83.38
Validation accuracy:  82.7
Validation accuracy:  87.0
Loss at step  500 : 0.850997
Training accuracy:  83.55
Validation accuracy:  83.3
Validation accuracy:  87.1
Loss at step  600 : 0.750071
Training accuracy:  83.74
Validation accuracy:  83.5
Validation accuracy:  87.0
Loss at step  700 : 0.714669
Training accuracy:  83.73
Validation accuracy:  83.4
Validation accuracy:  87.4
Loss at step  800 : 0.702155
Training accuracy:  83.86
Validation accuracy:  83.5
Validation accuracy:  87.5


---
Neural Network Model
---------

L2 Regularization


In [20]:
###PREPARAR ESTRUCTURA NN: SGD & HIDDEN LAYER & L2 REGULARIZATION###

batch_size = 128

graph = tf.Graph()
with graph.as_default():

    ###DATA### - Igual que antes
    # Input data. For the training data, we use a placeholder that will be fed at run time with a training minibatch.
    '''
    En este caso, en vez de usar un 'Constant', es decir, el set de datos completo, usamos un 'Placeholder', que pilla el 
    set de datos completo pero está preparado para luego entregárselo al Graph en 'trocitos' o 'minibatches' del batch_size indicado
    '''
    tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    ### Variables & Layers ### - Igual que antes
    '''Sinapsis Input a Hidden'''
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
    biases1 = tf.Variable(tf.zeros([1024]))
    '''Sinapsis Hidden a Output'''
    weights2 = tf.Variable(tf.truncated_normal([1024, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    '''Hidden Layer'''
    hidden1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    
    ### Training computation ###
    logits = tf.matmul(hidden1, weights2) + biases2 #Logits: La última capa y los útlimos weights y bias
    
    #LOSS
    '''
    L2 REGULARIZATION
    tf.nn.l2_loss: Computes half the L2 norm of a tensor without the `sqrt`:
                   output = sum(t ** 2) / 2
    '''
    BETA = 0.01
    softMaxCrossEntropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits)
    L2RegLayer1 = tf.nn.l2_loss(weights1) #Añadimos regularización L2 a los Weights1
    L2RegLayer2 = tf.nn.l2_loss(weights2) #Añadimos regularización L2 a los Weights2
    L2RegComplete = L2RegLayer1 + L2RegLayer2
    
    loss = tf.reduce_mean(softMaxCrossEntropy + BETA*L2RegComplete)
    print('Loss w/L2: ',loss)
    
    ### Optimizer ### - Igual que antes
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    ### Predictions for the training, validation, and test data ### - Igual que antes
    train_prediction = tf.nn.softmax(logits) #Igual
    
    #VALID#  
    inputToHiddenMultVal = tf.matmul(tf_valid_dataset, weights1) + biases1 
    hiddenToOutputReLUVal = tf.nn.relu(inputToHiddenMultVal + biases1) 
    completeMultVal = tf.matmul(hiddenToOutputReLUVal,weights2) + biases2 
    valid_prediction = tf.nn.softmax(completeMultVal)
    
    #TEST# 
    inputToHiddenMultTest = tf.matmul(tf_test_dataset, weights1) + biases1 
    hiddenToOutputReLUTest = tf.nn.relu(inputToHiddenMultTest + biases1) 
    completeMultTest = tf.matmul(hiddenToOutputReLUTest,weights2) + biases2 
    test_prediction = tf.nn.softmax(completeMultTest)
    
    
print('Estructura de la Red Neuronal creada.')

Loss w/L2:  Tensor("Mean:0", shape=(), dtype=float32)
Estructura de la Red Neuronal creada.


In [21]:
###ENTRENAR Y EVALUAR NN SGD & HIDDEN LAYER & L2 REGULARIZATION ###

#Al usar Stochastic podemos aumentar mucho el nº de epochs sin sufrir en rendimiento
num_steps = 3001

with tf.Session(graph=graph) as session:
    #Inicializar la estructura que hemos definido anteriormente - Igual que en el anterior
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        #Pick an offset within the training data, which has been randomized. Note:We could use better randomization across epochs.
        #Lo curioso de este método es que son aleatorios pero van incrementando hasta llegar al tamaño del set y vuelve a empezar
        batchStartIndex = (step * batch_size) % (train_labels.shape[0] - batch_size) #Por cada paso pillamos batches aleatorios
        batchEndIndex = batchStartIndex + batch_size
        
        # Generate a minibatch.
        batch_data = train_dataset[batchStartIndex:batchEndIndex, :] #[Comienzo : final] --> [batchStartIndex : batchStartIndex + tamañoBatch]
        batch_labels = train_labels[batchStartIndex:batchEndIndex, :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        '''
        KEY: of the dictionary is the placeholder node of the graph to be fed,
        VALUE: is the numpy array to feed to it.
        '''
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        
        #Igual que en el GD normal pero
        #Hay que usar el parámetro 'feed_dict' e indicarle el dict que hemos creado para decirle a la sesión cómo encontrar los datos
        dummyOpt, returnedLoss, returnedPred = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            
            print('Fragmento/MiniBatch loss at step ',step,': ',returnedLoss)
            
            acc = accuracy(returnedPred,batch_labels)
            print('Fragmento/MiniBatch accuracy: ',acc)
            
            validPred = valid_prediction.eval()
            validAcc = accuracy(validPred,valid_labels)
            print('Validation accuracy: ',validAcc)
            
            testPred = test_prediction.eval()
            testAcc = accuracy(testPred,test_labels)
            print('Test accuracy: ',testAcc)
            

Initialized
Fragmento/MiniBatch loss at step  0 :  3482.62
Fragmento/MiniBatch accuracy:  17.96875
Validation accuracy:  32.6
Test accuracy:  37.7
Fragmento/MiniBatch loss at step  500 :  21.3469
Fragmento/MiniBatch accuracy:  92.96875
Validation accuracy:  83.7
Test accuracy:  90.2
Fragmento/MiniBatch loss at step  1000 :  0.947134
Fragmento/MiniBatch accuracy:  83.59375
Validation accuracy:  81.6
Test accuracy:  86.8
Fragmento/MiniBatch loss at step  1500 :  0.710645
Fragmento/MiniBatch accuracy:  84.375
Validation accuracy:  80.8
Test accuracy:  85.9
Fragmento/MiniBatch loss at step  2000 :  0.618725
Fragmento/MiniBatch accuracy:  90.625
Validation accuracy:  80.8
Test accuracy:  86.1
Fragmento/MiniBatch loss at step  2500 :  0.708762
Fragmento/MiniBatch accuracy:  85.15625
Validation accuracy:  81.1
Test accuracy:  84.7
Fragmento/MiniBatch loss at step  3000 :  0.598076
Fragmento/MiniBatch accuracy:  87.5
Validation accuracy:  79.7
Test accuracy:  84.1


---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

---
Neural Network Model w/ SGD & L2 Regularization
---------
Just 380 instances to train :O


In [29]:
###PREPARAR ESTRUCTURA NN: SGD & HIDDEN LAYER & L2 REGULARIZATION###

batch_size = 128

graph = tf.Graph()
with graph.as_default():

    ###DATA### - Igual que antes
    # Input data. For the training data, we use a placeholder that will be fed at run time with a training minibatch.
    '''
    En este caso, en vez de usar un 'Constant', es decir, el set de datos completo, usamos un 'Placeholder', que pilla el 
    set de datos completo pero está preparado para luego entregárselo al Graph en 'trocitos' o 'minibatches' del batch_size indicado
    '''
    tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    ### Variables & Layers ### - Igual que antes
    '''Sinapsis Input a Hidden'''
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, 1024]))
    biases1 = tf.Variable(tf.zeros([1024]))
    '''Sinapsis Hidden a Output'''
    weights2 = tf.Variable(tf.truncated_normal([1024, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    '''Hidden Layer'''
    hidden1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    
    ### Training computation ###
    logits = tf.matmul(hidden1, weights2) + biases2 #Logits: La última capa y los útlimos weights y bias
    
    #LOSS
    '''
    L2 REGULARIZATION
    tf.nn.l2_loss: Computes half the L2 norm of a tensor without the `sqrt`:
                   output = sum(t ** 2) / 2
    '''
    BETA = 0.01
    softMaxCrossEntropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits)
    L2RegLayer1 = tf.nn.l2_loss(weights1) #Añadimos regularización L2 a los Weights1
    L2RegLayer2 = tf.nn.l2_loss(weights2) #Añadimos regularización L2 a los Weights2
    L2RegComplete = L2RegLayer1 + L2RegLayer2
    
    loss = tf.reduce_mean(softMaxCrossEntropy + BETA*L2RegComplete)
    print('Loss w/L2: ',loss)
    
    ### Optimizer ### - Igual que antes
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    ### Predictions for the training, validation, and test data ### - Igual que antes
    train_prediction = tf.nn.softmax(logits) #Igual
    
    #VALID#  
    inputToHiddenMultVal = tf.matmul(tf_valid_dataset, weights1) + biases1 
    hiddenToOutputReLUVal = tf.nn.relu(inputToHiddenMultVal + biases1) 
    completeMultVal = tf.matmul(hiddenToOutputReLUVal,weights2) + biases2 
    valid_prediction = tf.nn.softmax(completeMultVal)
    
    #TEST# 
    inputToHiddenMultTest = tf.matmul(tf_test_dataset, weights1) + biases1 
    hiddenToOutputReLUTest = tf.nn.relu(inputToHiddenMultTest + biases1) 
    completeMultTest = tf.matmul(hiddenToOutputReLUTest,weights2) + biases2 
    test_prediction = tf.nn.softmax(completeMultTest)
    
    
print('Estructura de la Red Neuronal creada.')

Loss w/L2:  Tensor("Mean:0", shape=(), dtype=float32)
Estructura de la Red Neuronal creada.


In [36]:
###ENTRENAR Y EVALUAR NN SGD & HIDDEN LAYER & L2 REGULARIZATION ###

#Al usar Stochastic podemos aumentar mucho el nº de epochs sin sufrir en rendimiento
num_steps = 3001

batch_size = 128
num_train_instances = 380 #<-- Extreme case of overfitting
#Subsets chiquititos
train_dataset_small = train_dataset[:num_train_instances,:]
train_labes_small = train_labels[:num_train_instances,:]


with tf.Session(graph=graph) as session:
    #Inicializar la estructura que hemos definido anteriormente - Igual que en el anterior
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        #Pick an offset within the training data, which has been randomized. Note:We could use better randomization across epochs.
        #Lo curioso de este método es que son aleatorios pero van incrementando hasta llegar al tamaño del set y vuelve a empezar
        batchStartIndex = (step * batch_size) % (train_labes_small.shape[0] - batch_size) #Por cada paso pillamos batches aleatorios
        batchEndIndex = batchStartIndex + batch_size
        
        # Generate a minibatch.
        batch_data = train_dataset_small[batchStartIndex:batchEndIndex, :] #[Comienzo : final] --> [batchStartIndex : batchStartIndex + tamañoBatch]
        batch_labels = train_labes_small[batchStartIndex:batchEndIndex, :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        '''
        KEY: of the dictionary is the placeholder node of the graph to be fed,
        VALUE: is the numpy array to feed to it.
        '''
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        
        #Igual que en el GD normal pero
        #Hay que usar el parámetro 'feed_dict' e indicarle el dict que hemos creado para decirle a la sesión cómo encontrar los datos
        dummyOpt, returnedLoss, returnedPred = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            
            print('Fragmento/MiniBatch loss at step ',step,': ',returnedLoss)
            
            acc = accuracy(returnedPred,batch_labels)
            print('Fragmento/MiniBatch accuracy: ',acc)
            
            validPred = valid_prediction.eval()
            validAcc = accuracy(validPred,valid_labels)
            print('Validation accuracy: ',validAcc)
            
            testPred = test_prediction.eval()
            testAcc = accuracy(testPred,test_labels)
            print('Test accuracy: ',testAcc)

Initialized
Fragmento/MiniBatch loss at step  0 :  3533.0
Fragmento/MiniBatch accuracy:  7.03125
Validation accuracy:  37.4
Test accuracy:  39.7
Fragmento/MiniBatch loss at step  500 :  21.0683
Fragmento/MiniBatch accuracy:  100.0
Validation accuracy:  75.6
Test accuracy:  83.3
Fragmento/MiniBatch loss at step  1000 :  0.443455
Fragmento/MiniBatch accuracy:  100.0
Validation accuracy:  78.4
Test accuracy:  83.9
Fragmento/MiniBatch loss at step  1500 :  0.287954
Fragmento/MiniBatch accuracy:  100.0
Validation accuracy:  78.2
Test accuracy:  84.0
Fragmento/MiniBatch loss at step  2000 :  0.26164
Fragmento/MiniBatch accuracy:  100.0
Validation accuracy:  77.9
Test accuracy:  83.9
Fragmento/MiniBatch loss at step  2500 :  0.260728
Fragmento/MiniBatch accuracy:  100.0
Validation accuracy:  78.1
Test accuracy:  83.9
Fragmento/MiniBatch loss at step  3000 :  0.254567
Fragmento/MiniBatch accuracy:  100.0
Validation accuracy:  78.5
Test accuracy:  84.1


---
La accuracy del Train sube hasta el máximo
La accuracy de validation y test baja
OVERFITTING!


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

Normal case

---

In [45]:
NODES = 1024
batch_size = 128

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, NODES]))
    biases1 = tf.Variable(tf.zeros([NODES]))
    weights2 = tf.Variable(tf.truncated_normal([NODES, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))
    
    # Training computation.
    logits1 = tf.matmul(tf_train_dataset, weights1) + biases1
    reLULayer = tf.nn.relu(logits1)
    
    #Dropout @ reLU
    '''
    tf.nn.dropout: Computes dropout.
                   With probability `keep_prob`, outputs the input element scaled up by
                   `1 / keep_prob`, otherwise outputs `0`.  The scaling is so that the expected
                   sum is unchanged.
    '''
    keep_prob = tf.placeholder("float")
    dropout = tf.nn.dropout(reLULayer, keep_prob)
    
    logits2 = tf.matmul(dropout, weights2) + biases2
    
    #LOSS
    # Normal loss function
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits2, labels=tf_train_labels))
    
    # Loss function with L2 Regularization with beta=0.01
    BETA = 0.01
    L2RegComplete = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2)
    loss = tf.reduce_mean(loss + BETA * L2RegComplete)

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training
    train_prediction = tf.nn.softmax(logits2)
    
    # Predictions for validation 
    logits1 = tf.matmul(tf_valid_dataset, weights1) + biases1
    reLULayer= tf.nn.relu(logits1)
    logits2 = tf.matmul(reLULayer, weights2) + biases2
    
    valid_prediction = tf.nn.softmax(logits2)
    
    # Predictions for test
    logits1 = tf.matmul(tf_test_dataset, weights1) + biases1
    reLULayer= tf.nn.relu(logits1)
    logits2 = tf.matmul(reLULayer, weights2) + biases2
    
    test_prediction =  tf.nn.softmax(logits2)

In [51]:
###ENTRENAR Y EVALUAR NN SGD & HIDDEN LAYER & L2 REGULARIZATION ###

#Al usar Stochastic podemos aumentar mucho el nº de epochs sin sufrir en rendimiento
num_steps = 3001

with tf.Session(graph=graph) as session:
    #Inicializar la estructura que hemos definido anteriormente - Igual que en el anterior
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        #Pick an offset within the training data, which has been randomized. Note:We could use better randomization across epochs.
        #Lo curioso de este método es que son aleatorios pero van incrementando hasta llegar al tamaño del set y vuelve a empezar
        batchStartIndex = (step * batch_size) % (train_labels.shape[0] - batch_size) #Por cada paso pillamos batches aleatorios
        batchEndIndex = batchStartIndex + batch_size
        
        # Generate a minibatch.
        batch_data = train_dataset[batchStartIndex:batchEndIndex, :] #[Comienzo : final] --> [batchStartIndex : batchStartIndex + tamañoBatch]
        batch_labels = train_labels[batchStartIndex:batchEndIndex, :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        '''
        KEY: of the dictionary is the placeholder node of the graph to be fed,
        VALUE: is the numpy array to feed to it.
        '''
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob:0.5}#Añadimos el keep_prob
        
        #Igual que en el GD normal pero
        #Hay que usar el parámetro 'feed_dict' e indicarle el dict que hemos creado para decirle a la sesión cómo encontrar los datos
        dummyOpt, returnedLoss, returnedPred = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            
            print('Fragmento/MiniBatch loss at step ',step,': ',returnedLoss)
            
            acc = accuracy(returnedPred,batch_labels)
            print('Fragmento/MiniBatch accuracy: ',acc)
            
            validPred = valid_prediction.eval()
            validAcc = accuracy(validPred,valid_labels)
            print('Validation accuracy: ',validAcc)
            
            testPred = test_prediction.eval()
            testAcc = accuracy(testPred,test_labels)
            print('Test accuracy: ',testAcc)
            

Initialized
Fragmento/MiniBatch loss at step  0 :  3656.39
Fragmento/MiniBatch accuracy:  10.15625
Validation accuracy:  29.4
Test accuracy:  28.9
Fragmento/MiniBatch loss at step  500 :  21.5795
Fragmento/MiniBatch accuracy:  87.5
Validation accuracy:  86.0
Test accuracy:  89.3
Fragmento/MiniBatch loss at step  1000 :  1.05127
Fragmento/MiniBatch accuracy:  82.8125
Validation accuracy:  85.6
Test accuracy:  89.4
Fragmento/MiniBatch loss at step  1500 :  0.767157
Fragmento/MiniBatch accuracy:  85.9375
Validation accuracy:  85.2
Test accuracy:  89.6
Fragmento/MiniBatch loss at step  2000 :  0.681122
Fragmento/MiniBatch accuracy:  90.625
Validation accuracy:  84.3
Test accuracy:  88.9
Fragmento/MiniBatch loss at step  2500 :  0.799742
Fragmento/MiniBatch accuracy:  84.375
Validation accuracy:  85.0
Test accuracy:  89.0
Fragmento/MiniBatch loss at step  3000 :  0.683444
Fragmento/MiniBatch accuracy:  86.71875
Validation accuracy:  84.8
Test accuracy:  88.8


Overfitting case

---

In [57]:
#Al usar Stochastic podemos aumentar mucho el nº de epochs sin sufrir en rendimiento
num_steps = 3001

num_train_instances = 380 #<-- Extreme case of overfitting
#Subsets chiquititos
train_dataset_small = train_dataset[:num_train_instances,:]
train_labes_small = train_labels[:num_train_instances,:]

with tf.Session(graph=graph) as session:
    #Inicializar la estructura que hemos definido anteriormente - Igual que en el anterior
    tf.global_variables_initializer().run()
    print("Initialized")
    
    for step in range(num_steps):
        #Pick an offset within the training data, which has been randomized. Note:We could use better randomization across epochs.
        #Lo curioso de este método es que son aleatorios pero van incrementando hasta llegar al tamaño del set y vuelve a empezar
        batchStartIndex = (step * batch_size) % (train_labes_small.shape[0] - batch_size) #Por cada paso pillamos batches aleatorios
        batchEndIndex = batchStartIndex + batch_size
        
        # Generate a minibatch.
        batch_data = train_dataset_small[batchStartIndex:batchEndIndex, :] #[Comienzo : final] --> [batchStartIndex : batchStartIndex + tamañoBatch]
        batch_labels = train_labes_small[batchStartIndex:batchEndIndex, :]
        
        # Prepare a dictionary telling the session where to feed the minibatch.
        '''
        KEY: of the dictionary is the placeholder node of the graph to be fed,
        VALUE: is the numpy array to feed to it.
        '''
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob:0.5}#Añadimos el keep_prob
        
        #Igual que en el GD normal pero
        #Hay que usar el parámetro 'feed_dict' e indicarle el dict que hemos creado para decirle a la sesión cómo encontrar los datos
        dummyOpt, returnedLoss, returnedPred = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        
        if (step % 500 == 0):
            
            print('Fragmento/MiniBatch loss at step ',step,': ',returnedLoss)
            
            acc = accuracy(returnedPred,batch_labels)
            print('Fragmento/MiniBatch accuracy: ',acc)
            
            validPred = valid_prediction.eval()
            validAcc = accuracy(validPred,valid_labels)
            print('Validation accuracy: ',validAcc)
            
            testPred = test_prediction.eval()
            testAcc = accuracy(testPred,test_labels)
            print('Test accuracy: ',testAcc)
            

Initialized
Fragmento/MiniBatch loss at step  0 :  4.88492
Fragmento/MiniBatch accuracy:  10.9375
Validation accuracy:  11.5
Test accuracy:  11.7
Fragmento/MiniBatch loss at step  500 :  1.20006
Fragmento/MiniBatch accuracy:  98.4375
Validation accuracy:  79.2
Test accuracy:  84.7
Fragmento/MiniBatch loss at step  1000 :  0.843357
Fragmento/MiniBatch accuracy:  100.0
Validation accuracy:  80.0
Test accuracy:  84.8
Fragmento/MiniBatch loss at step  1500 :  0.644377
Fragmento/MiniBatch accuracy:  98.4375
Validation accuracy:  79.4
Test accuracy:  84.0
Fragmento/MiniBatch loss at step  2000 :  0.418307
Fragmento/MiniBatch accuracy:  99.21875
Validation accuracy:  79.1
Test accuracy:  84.4
Fragmento/MiniBatch loss at step  2500 :  0.333371
Fragmento/MiniBatch accuracy:  100.0
Validation accuracy:  78.3
Test accuracy:  83.9
Fragmento/MiniBatch loss at step  3000 :  0.26838
Fragmento/MiniBatch accuracy:  98.4375
Validation accuracy:  78.6
Test accuracy:  84.7


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


---
SUPER Model From: Ritchieng.com
---------

http://www.ritchieng.com/machine-learning/deep-learning/tensorflow/regularization/



Model:

5 hidden layers NN
    ->RELUs
    ->Number of nodes decrease by 50% with each hidden layer that is deeper in the neural net
Overfitting measures
    ->L2 Regularization
        -->Learning rate (beta) with exponential decay

    ->Dropout

15,000 steps



In [58]:
import math as math
batch_size = 128
beta = 0.001

hidden_nodes_1 = 1024
hidden_nodes_2 = int(hidden_nodes_1 * 0.5)
hidden_nodes_3 = int(hidden_nodes_1 * np.power(0.5, 2))
hidden_nodes_4 = int(hidden_nodes_1 * np.power(0.5, 3))
hidden_nodes_5 = int(hidden_nodes_1 * np.power(0.5, 4))

graph = tf.Graph()
with graph.as_default():

    '''Input Data'''
    # For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    '''Variables'''
    # Hidden RELU layer 1
    weights_1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_nodes_1], stddev=math.sqrt(2.0/(image_size*image_size))))
    biases_1 = tf.Variable(tf.zeros([hidden_nodes_1]))

    # Hidden RELU layer 2
    weights_2 = tf.Variable(tf.truncated_normal([hidden_nodes_1, hidden_nodes_2], stddev=math.sqrt(2.0/hidden_nodes_1)))
    biases_2 = tf.Variable(tf.zeros([hidden_nodes_2]))
    
    # Hidden RELU layer 3
    weights_3 = tf.Variable(tf.truncated_normal([hidden_nodes_2, hidden_nodes_3], stddev=math.sqrt(2.0/hidden_nodes_2)))
    biases_3 = tf.Variable(tf.zeros([hidden_nodes_3]))
    
    # Hidden RELU layer 4
    weights_4 = tf.Variable(tf.truncated_normal([hidden_nodes_3, hidden_nodes_4], stddev=math.sqrt(2.0/hidden_nodes_3)))
    biases_4 = tf.Variable(tf.zeros([hidden_nodes_4]))
    
    # Hidden RELU layer 5
    weights_5 = tf.Variable(tf.truncated_normal([hidden_nodes_4, hidden_nodes_5], stddev=math.sqrt(2.0/hidden_nodes_4)))
    biases_5 = tf.Variable(tf.zeros([hidden_nodes_5]))
    
    # Output layer
    weights_6 = tf.Variable(tf.truncated_normal([hidden_nodes_5, num_labels], stddev=math.sqrt(2.0/hidden_nodes_5)))
    biases_6 = tf.Variable(tf.zeros([num_labels]))
    
    '''Training computation'''
    
    # Hidden RELU layer 1
    logits_1 = tf.matmul(tf_train_dataset, weights_1) + biases_1
    hidden_layer_1 = tf.nn.relu(logits_1)
    # Dropout on hidden layer: RELU layer
    keep_prob = tf.placeholder("float")
    hidden_layer_1_dropout = tf.nn.dropout(hidden_layer_1, keep_prob)
    
    
    # Hidden RELU layer 2
    logits_2 = tf.matmul(hidden_layer_1_dropout, weights_2) + biases_2
    hidden_layer_2 = tf.nn.relu(logits_2)
    # Dropout on hidden layer: RELU layer
    hidden_layer_2_dropout = tf.nn.dropout(hidden_layer_2, keep_prob)
    
    # Hidden RELU layer 3
    logits_3 = tf.matmul(hidden_layer_2_dropout, weights_3) + biases_3
    hidden_layer_3 = tf.nn.relu(logits_3)
    # Dropout on hidden layer: RELU layer
    hidden_layer_3_dropout = tf.nn.dropout(hidden_layer_3, keep_prob)
    
    # Hidden RELU layer 4
    logits_4 = tf.matmul(hidden_layer_3_dropout, weights_4) + biases_4
    hidden_layer_4 = tf.nn.relu(logits_4)
    # Dropout on hidden layer: RELU layer

    hidden_layer_4_dropout = tf.nn.dropout(hidden_layer_4, keep_prob)
    
    # Hidden RELU layer 5
    logits_5 = tf.matmul(hidden_layer_4_dropout, weights_5) + biases_5
    hidden_layer_5 = tf.nn.relu(logits_5)
    # Dropout on hidden layer: RELU layer
    hidden_layer_5_dropout = tf.nn.dropout(hidden_layer_5, keep_prob)
    
    # Output layer
    logits_6 = tf.matmul(hidden_layer_5_dropout, weights_6) + biases_6 
    
    # Normal loss function
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits_6, labels=tf_train_labels))
    # Loss function with L2 Regularization with decaying learning rate beta=0.5
    regularizers = tf.nn.l2_loss(weights_1) + tf.nn.l2_loss(weights_2) + \
                   tf.nn.l2_loss(weights_3) + tf.nn.l2_loss(weights_4) + \
                   tf.nn.l2_loss(weights_5) + tf.nn.l2_loss(weights_6)
    loss = tf.reduce_mean(loss + beta * regularizers)

    '''Optimizer'''
    # Decaying learning rate
    global_step = tf.Variable(0)  # count the number of steps taken.
    start_learning_rate = 0.5
    learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, 100000, 0.96, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
    
    # Predictions for the training
    train_prediction = tf.nn.softmax(logits_6)
    
    # Predictions for validation 
    valid_logits_1 = tf.matmul(tf_valid_dataset, weights_1) + biases_1
    valid_relu_1 = tf.nn.relu(valid_logits_1)
    
    valid_logits_2 = tf.matmul(valid_relu_1, weights_2) + biases_2
    valid_relu_2 = tf.nn.relu(valid_logits_2)
    
    valid_logits_3 = tf.matmul(valid_relu_2, weights_3) + biases_3
    valid_relu_3 = tf.nn.relu(valid_logits_3)
    
    valid_logits_4 = tf.matmul(valid_relu_3, weights_4) + biases_4
    valid_relu_4 = tf.nn.relu(valid_logits_4)
    
    valid_logits_5 = tf.matmul(valid_relu_4, weights_5) + biases_5
    valid_relu_5 = tf.nn.relu(valid_logits_5)
    
    valid_logits_6 = tf.matmul(valid_relu_5, weights_6) + biases_6
    
    valid_prediction = tf.nn.softmax(valid_logits_6)
    
    # Predictions for test
    test_logits_1 = tf.matmul(tf_test_dataset, weights_1) + biases_1
    test_relu_1 = tf.nn.relu(test_logits_1)
    
    test_logits_2 = tf.matmul(test_relu_1, weights_2) + biases_2
    test_relu_2 = tf.nn.relu(test_logits_2)
    
    test_logits_3 = tf.matmul(test_relu_2, weights_3) + biases_3
    test_relu_3 = tf.nn.relu(test_logits_3)
    
    test_logits_4 = tf.matmul(test_relu_3, weights_4) + biases_4
    test_relu_4 = tf.nn.relu(test_logits_4)
    
    test_logits_5 = tf.matmul(test_relu_4, weights_5) + biases_5
    test_relu_5 = tf.nn.relu(test_logits_5)
    
    test_logits_6 = tf.matmul(test_relu_5, weights_6) + biases_6
    
    test_prediction = tf.nn.softmax(test_logits_6)

In [60]:
num_steps = 15001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step {}: {}".format(step, l))
            print("Minibatch accuracy: {:.1f}".format(accuracy(predictions, batch_labels)))
            print("Validation accuracy: {:.1f}".format(accuracy(valid_prediction.eval(), valid_labels)))
    print("Test accuracy: {:.1f}".format(accuracy(test_prediction.eval(), test_labels)))

Initialized
Minibatch loss at step 0: 4.920271873474121
Minibatch accuracy: 10.9
Validation accuracy: 10.0
Minibatch loss at step 500: 1.890723466873169
Minibatch accuracy: 82.8
Validation accuracy: 84.4
Test accuracy: 88.1
