# Tarea de BackPropagation

En el presente laboratorio Ud. debe diseñar una red neuronal con dos capas intermedias para el problema de reconocimiento de dígitos manuscritos. Analice y estudie el código de la solución con una sola capa intermedia (NNBackpropagation3capas.ipynb), 
Para ello debe codear las funciones de:
1. Forward propagation (04 puntos)
2. Función de costo (02 puntos)
3. Back-propagation (08 puntos)
4. Predicción (02 puntos)
5. Qué parámetros hacen que Ud. encuentre el mejor resultado? (04 puntos)



In [119]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
%matplotlib inline

data = loadmat('ex3data1.mat')

In [120]:
X = data['X']
y = data['y']

X.shape, y.shape

((5000, 400), (5000, 1))

In [121]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
y_onehot = encoder.fit_transform(y)
y_onehot.shape

(5000, 10)

In [122]:
y[0], y_onehot[0,:]

(array([10], dtype=uint8),
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]))

In [123]:
# initial setup
input_size = 400
hidden_size1 = 20
hidden_size2 = 30
num_labels = 10
learning_rate = 1

# randomly initialize a parameter array of the size of the full network's parameters
params = (np.random.random(size=hidden_size1 * (input_size + 1) + hidden_size2 * (hidden_size1 + 1) + num_labels * (hidden_size2 + 1)) - 0.5) * 0.25

m = X.shape[0]
X = np.matrix(X)
y = np.matrix(y)

# unravel the parameter array into parameter matrices for each layer
theta1 = np.matrix(np.reshape(params[:hidden_size1 * (input_size + 1)], (hidden_size1, (input_size + 1))))
theta2 = np.matrix(np.reshape(params[hidden_size1 * (input_size + 1):hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1)], (hidden_size2, (hidden_size1 + 1))))
theta3 = np.matrix(np.reshape(params[hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1):], (num_labels, (hidden_size2 + 1))))

theta1.shape, theta2.shape, theta3.shape

((20, 401), (30, 21), (10, 31))

In [124]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [125]:
def forward_propagate(X, theta1, theta2, theta3):
#Implemente código
    m = X.shape[0]

    a1 = np.insert(X, 0, values=np.ones(m), axis=1)
  
    z2 = a1 * theta1.T
 
    a2 = np.insert(sigmoid(z2), 0, values=np.ones(m), axis=1)
    
    z3 = a2 * theta2.T
    
    a3 = np.insert(sigmoid(z3), 0, values=np.ones(m), axis=1)
   
    z4 = a3 * theta3.T
   
    h = sigmoid(z4)
    
    return a1, z2, a2, z3, a3, z4, h

In [126]:
a1, z2, a2, z3, a3, z4, h = forward_propagate(X, theta1, theta2, theta3)
a1.shape, z2.shape, a2.shape, z3.shape, a3.shape, z4.shape, h.shape

((5000, 401),
 (5000, 20),
 (5000, 21),
 (5000, 30),
 (5000, 31),
 (5000, 10),
 (5000, 10))

In [127]:
def cost(params, input_size, hidden_size1, hidden_size2, num_labels, X, y, learning_rate):
#implemente código    
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    
    theta1 = np.matrix(np.reshape(params[:hidden_size1 * (input_size + 1)], (hidden_size1, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size1 * (input_size + 1):hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1)], (hidden_size2, (hidden_size1 + 1))))
    theta3 = np.matrix(np.reshape(params[hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1):], (num_labels, (hidden_size2 + 1))))

    a1, z2, a2, z3, a3, z4, h = forward_propagate(X, theta1, theta2, theta3)
    
    J = 0
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    
    J = J / m
    
    J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)) + np.sum(np.power(theta3[:,1:], 2)))
    
    return J

In [128]:
cost(params, input_size, hidden_size1, hidden_size2, num_labels, X, y_onehot, learning_rate)

6.9879689911048182

In [129]:
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))

In [130]:
def backprop(params, input_size, hidden_size1, hidden_size2, num_labels, X, y, learning_rate):
#Implemente código
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    
    theta1 = np.matrix(np.reshape(params[:hidden_size1 * (input_size + 1)], (hidden_size1, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size1 * (input_size + 1):hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1)], (hidden_size2, (hidden_size1 + 1))))
    theta3 = np.matrix(np.reshape(params[hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1):], (num_labels, (hidden_size2 + 1))))

    a1, z2, a2, z3, a3, z4, h = forward_propagate(X, theta1, theta2, theta3)
    
    J = 0
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    delta3 = np.zeros(theta3.shape)
    
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    
    J = J / m
    
    J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)) + np.sum(np.power(theta3[:,1:], 2)))
    
    for t in range(m):
        a1t = a1[t,:]
        z2t = z2[t,:]  
        a2t = a2[t,:]  
        z3t = z3[t,:]      
        a3t = a3[t,:]
        z4t = z4[t,:]
        ht = h[t,:]  # (1, 10)
        yt = y[t,:]  # (1, 10)
        d4t = ht - yt  # (1, 10)
        
        z3t = np.insert(z3t, 0, values=np.ones(1))
        d3t = np.multiply((theta3.T * d4t.T).T, sigmoid_gradient(z3t))
        
        z2t = np.insert(z2t, 0, values=np.ones(1))
        d2t = np.multiply((theta2.T * d3t[:,1:].T).T, sigmoid_gradient(z2t))
        
        delta1 = delta1 + (d2t[:,1:]).T * a1t
        delta2 = delta2 + (d3t[:,1:]).T * a2t
        delta3 = delta3 + d4t.T * a3t
        
    delta1 = delta1 / m
    delta2 = delta2 / m
    delta3 = delta3 / m
    
    delta1[:,1:] = delta1[:,1:] + (theta1[:,1:] * learning_rate) / m
    delta2[:,1:] = delta2[:,1:] + (theta2[:,1:] * learning_rate) / m
    delta3[:,1:] = delta3[:,1:] + (theta3[:,1:] * learning_rate) / m
    
    grad = np.concatenate((np.ravel(delta1), np.ravel(delta2), np.ravel(delta3)))
    

    return J, grad

In [131]:
J, grad = backprop(params, input_size, hidden_size1, hidden_size2, num_labels, X, y_onehot, learning_rate)
J, grad.shape

(6.9879689911048182, (8960,))

In [132]:
from scipy.optimize import minimize

# minimize the objective function
fmin = minimize(fun=backprop, x0=params, args=(input_size, hidden_size1,hidden_size2, num_labels, X, y_onehot, learning_rate), 
                method='TNC', jac=True, options={'maxiter': 250})
fmin

     fun: 0.31471072960888991
     jac: array([  3.59518512e-04,  -4.70839047e-06,  -7.06293239e-07, ...,
         6.62579329e-05,   1.79226446e-04,   3.61348910e-04])
 message: 'Max. number of function evaluations reached'
    nfev: 250
     nit: 21
  status: 3
 success: False
       x: array([-0.15438585, -0.02354195, -0.00353147, ..., -1.30000968,
       -1.96038596, -0.03605886])

In [133]:
# Implemente predicción
X = np.matrix(X)

theta1 = np.matrix(np.reshape(fmin.x[:hidden_size1 * (input_size + 1)], (hidden_size1, (input_size + 1))))
theta2 = np.matrix(np.reshape(fmin.x[hidden_size1 * (input_size + 1):hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1)], (hidden_size2, (hidden_size1 + 1))))
theta3 = np.matrix(np.reshape(fmin.x[hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1):], (num_labels, (hidden_size2 + 1))))

a1, z2, a2, z3, a3, z4, h = forward_propagate(X, theta1, theta2, theta3)
y_pred = np.array(np.argmax(h, axis=1) + 1)
y_pred

array([[10],
       [10],
       [10],
       ..., 
       [ 9],
       [ 9],
       [ 9]])

In [134]:
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y)]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print ('accuracy = {0}%'.format(accuracy * 100))

accuracy = 99.48%


In [135]:
#5. Qué parámetros hacen que Ud. encuentre el mejor resultado?

#Para encontrar el mejor resultado, es mejor utilizar el valor más alto 
#y estable de learning rate. Además, que a menor número de hidden_size, 
#resulta ser más efectivo que números mayores, el cual tardaría mucho 
#en nuestro modelo.

# Laboratorio de BackPropagation

En el presente laboratorio Ud. debe usar la red neuronal desarrollada previamente en la tarea dejada en aula y desarrollar lo siguiente:
1. Tarea BackPropagation 4 capas (10 puntos)
2. Dividir la data en dos conjuntos (TrainSet y TestSet) (02 puntos)
3. Entrene el algoritmos con el TrainSet (04 puntos)
4. Realice la predicción con el TestSet (02 puntos)
5. Optimice su? (04 puntos)



Cargar la data original del MNIST y dividirla en Conjunto de Entrenamiento (primeros 60000) y 
conjunto de Test (10000 restantes)

In [144]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
mnist.data.shape

(70000, 784)

In [145]:
# Implemente su código de separación aquí
from sklearn.model_selection import train_test_split

X=mnist.data
y=mnist.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/7, random_state=42)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((60000, 784), (10000, 784), (60000,), (10000,))

In [146]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)

y_onehot_train = encoder.fit_transform(y_train)
y_onehot_train.shape



(1, 60000)

In [147]:
# Su código de Entrenamiento aquí
input_size = 784
hidden_size1 = 20
hidden_size2 = 30
num_labels = 10
learning_rate = 1

params = (np.random.random(size=hidden_size1 * (input_size + 1) + hidden_size2 * (hidden_size1 + 1) + num_labels * (hidden_size2 + 1))) 

m = X_train.shape[0]
X_train = np.matrix(X_train)
y_train = np.matrix(y_train)

theta1 = np.matrix(np.reshape(params[:hidden_size1 * (input_size + 1)], (hidden_size1, (input_size + 1))))
theta2 = np.matrix(np.reshape(params[hidden_size1 * (input_size + 1):hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1)], (hidden_size2, (hidden_size1 + 1))))
theta3 = np.matrix(np.reshape(params[hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1):], (num_labels, (hidden_size2 + 1))))

theta1.shape, theta2.shape, theta3.shape

((20, 785), (30, 21), (10, 31))

In [148]:
def forward_propagate(X_train, theta1, theta2, theta3):

    m = X_train.shape[0]
  
    a1 = np.insert(X_train, 0, values=np.ones(m), axis=1)
  
    z2 = a1 * theta1.T

    a2 = np.insert(sigmoid(z2), 0, values=np.ones(m), axis=1)
   
    z3 = a2 * theta2.T

    a3 = np.insert(sigmoid(z3), 0, values=np.ones(m), axis=1)

    z4 = a3 * theta3.T

    h = sigmoid(z4)
    return a1, z2, a2, z3, a3, z4, h

In [149]:
a1, z2, a2, z3, a3, z4, h = forward_propagate(X_train, theta1, theta2, theta3)
a1.shape, z2.shape, a2.shape, z3.shape, a3.shape, z4.shape, h.shape

((60000, 785),
 (60000, 20),
 (60000, 21),
 (60000, 30),
 (60000, 31),
 (60000, 10),
 (60000, 10))

In [152]:
def cost(params, input_size, hidden_size1, hidden_size2, num_labels, X, y, learning_rate):
#implemente código    
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    
    theta1 = np.matrix(np.reshape(params[:hidden_size1 * (input_size + 1)], (hidden_size1, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size1 * (input_size + 1):hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1)], (hidden_size2, (hidden_size1 + 1))))
    theta3 = np.matrix(np.reshape(params[hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1):], (num_labels, (hidden_size2 + 1))))

    a1, z2, a2, z3, a3, z4, h = forward_propagate(X, theta1, theta2, theta3)
    
    J = 0
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    
    J = J / m
    
    J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)) + np.sum(np.power(theta3[:,1:], 2)))
    
    return J

In [153]:
cost(params, input_size, hidden_size1, hidden_size2, num_labels, X_train, y_onehot_train, learning_rate)

ValueError: operands could not be broadcast together with shapes (1,60000) (1,10) 

In [154]:
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))

In [155]:
def backprop(params, input_size, hidden_size1, hidden_size2, num_labels, X_train, y_train, learning_rate):
#Implemente código
    m = X_train.shape[0]
    X = np.matrix(X_train)
    y = np.matrix(y_train)
    
    theta1 = np.matrix(np.reshape(params[:hidden_size1 * (input_size + 1)], (hidden_size1, (input_size + 1))))
    theta2 = np.matrix(np.reshape(params[hidden_size1 * (input_size + 1):hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1)], (hidden_size2, (hidden_size1 + 1))))
    theta3 = np.matrix(np.reshape(params[hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1):], (num_labels, (hidden_size2 + 1))))

    a1, z2, a2, z3, a3, z4, h = forward_propagate(X_train, theta1, theta2, theta3)
    
    J = 0
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    delta3 = np.zeros(theta3.shape)
    
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    
    J = J / m
    
    J += (float(learning_rate) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)) + np.sum(np.power(theta3[:,1:], 2)))
    
    for t in range(m):
        a1t = a1[t,:]
        z2t = z2[t,:]  
        a2t = a2[t,:]  
        z3t = z3[t,:]      
        a3t = a3[t,:]
        z4t = z4[t,:]
        ht = h[t,:]  # (1, 10)
        yt = y[t,:]  # (1, 10)
        d4t = ht - yt  # (1, 10)
        
        z3t = np.insert(z3t, 0, values=np.ones(1))
        d3t = np.multiply((theta3.T * d4t.T).T, sigmoid_gradient(z3t))
        
        z2t = np.insert(z2t, 0, values=np.ones(1))
        d2t = np.multiply((theta2.T * d3t[:,1:].T).T, sigmoid_gradient(z2t))
        
        delta1 = delta1 + (d2t[:,1:]).T * a1t
        delta2 = delta2 + (d3t[:,1:]).T * a2t
        delta3 = delta3 + d4t.T * a3t
        
    delta1 = delta1 / m
    delta2 = delta2 / m
    delta3 = delta3 / m
    
    delta1[:,1:] = delta1[:,1:] + (theta1[:,1:] * learning_rate) / m
    delta2[:,1:] = delta2[:,1:] + (theta2[:,1:] * learning_rate) / m
    delta3[:,1:] = delta3[:,1:] + (theta3[:,1:] * learning_rate) / m
    
    grad = np.concatenate((np.ravel(delta1), np.ravel(delta2), np.ravel(delta3)))
    

    return J, grad

In [156]:
J, grad = backprop(params, input_size, hidden_size1, hidden_size2, num_labels, X_train, y_onehot_train, learning_rate)
J, grad.shape

ValueError: operands could not be broadcast together with shapes (1,60000) (1,10) 

In [None]:
from scipy.optimize import minimize

# minimize the objective function
fmin = minimize(fun=backprop, x0=params, args=(input_size, hidden_size1,hidden_size2, num_labels, X_train, y_onehot_train, learning_rate), method='TNC', jac=True, options={'maxiter': 250})
fmin

In [None]:
# Su código de predicción aqui
X = np.matrix(X_test)

theta1 = np.matrix(np.reshape(fmin.x[:hidden_size1 * (input_size + 1)], (hidden_size1, (input_size + 1))))
theta2 = np.matrix(np.reshape(fmin.x[hidden_size1 * (input_size + 1):hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1)], (hidden_size2, (hidden_size1 + 1))))
theta3 = np.matrix(np.reshape(fmin.x[hidden_size1*(input_size+1)+hidden_size2*(hidden_size1+1):], (num_labels, (hidden_size2 + 1))))

a1, z2, a2, z3, a3, z4, h = forward_propagate(X_test, theta1, theta2, theta3)
y_pred = np.array(np.argmax(h, axis=1) + 1)
y_pred