In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

# Definición de la RED
Dados los pesos y la estructura de una red neuronal con una softmax a la salida y un MSE como loss, calcular todo lo que se pide a continuación. (No es común usar MSE con la softmax, pero a fines didácticos simplifica los cálculos. Queda como ejercicio adicional resolver el mismo ejercicio pero con una categorical crossentropy a la salida.)

Las funciones de activación de las capas A1 y A2 son sigmoideas (Queda como ejercicio también probar con otras funciones de activación)

![red.png](red.png)

# Pesos de la red

In [3]:
# Load the network parameters. The array is iterated below as alternating
# weight matrices and bias vectors, one pair per dense layer.
# NOTE(review): allow_pickle=True makes np.load unpickle the file contents, and
# unpickling can execute arbitrary code. Only load this file from a trusted source.
weights = np.load('weights_softmax_3_layers.npy', allow_pickle=True)

# One human-readable label per entry of `weights`; the shapes quoted in the
# labels match the matrices that get printed (layer 2 is 3x2, not 3x3).
capas = ['Capa Densa 1 ws: 2x3', 'Capa Densa 1 biases:', 'Capa Densa 2 - ws: 3x2', 'Capa Densa 2 - biases', 'Capa Densa 3 - ws: 2x3', 'Capa Densa 3 - biases']

# Fail loudly if the file does not hold exactly the six expected arrays,
# instead of silently printing a truncated or mislabelled listing.
assert len(capas) == len(weights), "unexpected number of parameter arrays"

for nombre, layer in zip(capas, weights):
    print(nombre)
    print(layer)
    print()

Capa Densa 1 ws: 2x3
[[0.10820953 0.3432914  0.1744045 ]
 [0.05457611 0.54989725 0.34384015]]

Capa Densa 1 biases:
[-0.67943245 -0.00294854  0.15257952]

Capa Densa 2 - ws: 3x3
[[-0.7706185  -0.17550795]
 [-0.10197585  0.45046437]
 [ 0.00585397  0.3024927 ]]

Capa Densa 2 - biases
[-0.10661452 -0.34508756]

Capa Densa 3 - ws: 2x3
[[-0.49749678 -0.40208894 -0.85052264]
 [ 1.0619878   0.07141189  0.17314   ]]

Capa Densa 3 - biases
[-0.29359275 -0.7259881   0.578059  ]



# Vector de entrada de ejemplo

In [4]:
# Example input: a single sample with two features, stored as one row.
X = np.array([3.4, 2.1])[np.newaxis, :]
print(X.shape)

(1, 2)


# Implementación D_1

In [5]:
# Dense layer D1: affine map of the input, X · W1 + b1, where weights[0] is
# the layer's weight matrix and weights[1] its bias vector.
D1_out = np.add(np.dot(X, weights[0]), weights[1])
print(D1_out.shape, D1_out, sep="\n")

(1, 3)
[[-0.19691022  2.31902646  1.46761914]]


# Implementación A_1

In [6]:
# Activation A1: element-wise logistic sigmoid, 1 / (1 + e^(-z)), of D1's output.
A1_out = np.divide(1, 1 + np.exp(-D1_out))
print(A1_out)

[[0.45093089 0.91044059 0.81269524]]


# Implementación D_2

In [7]:
# Dense layer D2: A1_out · W2 + b2, with W2 = weights[2] and b2 = weights[3].
D2_out = np.add(np.dot(A1_out, weights[2]), weights[3])
print(D2_out)

[[-0.54219567  0.23172592]]


# Implementación A_2

In [8]:
# Activation A2: element-wise logistic sigmoid, 1 / (1 + e^(-z)), of D2's output.
A2_out = np.divide(1, 1 + np.exp(-D2_out))
print(A2_out)

[[0.36767696 0.55767363]]


# Implementación D_3

In [9]:
# Dense layer D3: A2_out · W3 + b3, with W3 = weights[4] and b3 = weights[5].
# Its output holds the logits that feed the softmax.
D3_out = np.add(np.dot(A2_out, weights[4]), weights[5])
print(D3_out)

[[ 0.11573172 -0.8340024   0.36189705]]


# Implementación SoftMax

In [10]:
# Softmax over the D3 logits, vectorised so it is not tied to exactly three classes.
# Subtracting the row maximum before exponentiating does not change the result
# mathematically (the common factor cancels in the ratio) but keeps np.exp from
# overflowing when the logits are large.
logits_shifted = D3_out - np.max(D3_out, axis=1, keepdims=True)
exp_logits = np.exp(logits_shifted)

# Normalise once per row; P_est keeps the same 2-D row layout as D3_out.
P_est = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

print(P_est)

[[0.37510012 0.14510518 0.4797947 ]]


# Vector de salida

In [11]:
# Target output: one-hot row vector marking the first class as the true one.
P_true = np.array([1, 0, 0])[np.newaxis, :]
print(P_true)

[[1 0 0]]


# Implementación MSE

In [12]:
# Mean squared error over the three outputs: the squared differences between
# estimate and target are accumulated class by class (0, 1, 2) and then averaged.
MSE = sum((P_est[0][k] - P_true[0][k]) ** 2 for k in range(3)) / 3

print(MSE)

0.2139194440951749


In [13]:
# Alternative method: the same MSE in vectorised form. NumPy reductions are used
# instead of the builtin sum(), which only works on a 2-D array by iterating over
# its rows. The last printed digit may differ from the term-by-term version
# because the floating-point operations are ordered differently.
np.sum((P_est - P_true) ** 2) / P_est.size

0.21391944409517488

# Implementar Gradiente MSE

In [14]:
# Gradient of the MSE with respect to each softmax output:
#   d/dp_i [ (1/N) * sum_j (p_j - y_j)^2 ] = 2 * (p_i - y_i) / N
# It is computed directly from the residual. Multiplying the residual by P_est
# and summing would only give this value because the softmax outputs add up to 1.
# The result is flattened to a 1-D vector of length N, which is the form the
# later Jacobian-vector products expect.
MSE_grad = 2 * (P_est - P_true).reshape(-1) / P_est.size
print(MSE_grad)

[-0.41659992  0.09673679  0.31986314]


# Implementar SoftMax Jacobiano

In [15]:
# First matrix of the softmax Jacobian: the softmax output placed on the
# main diagonal of a square matrix.
softmax_out = P_est
print(softmax_out, "", np.diag(softmax_out.ravel()), sep="\n")

[[0.37510012 0.14510518 0.4797947 ]]

[[0.37510012 0.         0.        ]
 [0.         0.14510518 0.        ]
 [0.         0.         0.4797947 ]]


In [16]:
# Second matrix of the softmax Jacobian: the product of the softmax output
# (as a column) with itself (as a row), i.e. entry (i, j) is p_i * p_j.
np.dot(softmax_out.T, softmax_out)

array([[0.1407001 , 0.05442897, 0.17997105],
       [0.05442897, 0.02105551, 0.0696207 ],
       [0.17997105, 0.0696207 , 0.23020296]])

In [17]:
# Softmax Jacobian: diag(p) - p^T · p, i.e. entry (i, j) is p_i * (delta_ij - p_j).
softmax_jac = np.subtract(
    np.diag(softmax_out.ravel()),        # diag(p)
    np.dot(softmax_out.T, softmax_out),  # p^T · p
)

print(softmax_jac)

[[ 0.23440002 -0.05442897 -0.17997105]
 [-0.05442897  0.12404967 -0.0696207 ]
 [-0.17997105 -0.0696207   0.24959175]]


# Calcular el error propagado hasta la salida de D3

In [18]:
# Chain rule through the softmax: the loss gradient with respect to the softmax
# output is multiplied by the softmax Jacobian, giving the error at D3's output.
# (The Jacobian is diag(p) - p^T · p, which is symmetric, so J · g equals J^T · g.)
error_D3 = np.dot(softmax_jac, MSE_grad)
print(error_D3)
error_D3.shape

[-0.16048242  0.01240618  0.14807624]


(3,)

# Cálculo del error propagado a la salida de A2

In [19]:
# Back-propagate through dense layer D3. The forward pass computes
# np.dot(A2_out, weights[4]), so multiplying weights[4] by the D3 error maps
# that error back onto each of A2's outputs.
error_A2 = np.dot(weights[4], error_D3)
print(error_A2)
error_A2.shape

[-0.05109109 -0.14390649]


(2,)

# Jacobiano de sigmoidea

In [20]:
# Jacobian of the sigmoid activation A2, evaluated at D2's output.
# The sigmoid acts element-wise, so its Jacobian is diagonal with entries s * (1 - s).
sigmoid = np.divide(1, 1 + np.exp(-D2_out))
sigmoid_jac = np.diag(np.multiply(sigmoid, 1 - sigmoid).ravel())
print(sigmoid_jac)

[[0.23249061 0.        ]
 [0.         0.24667375]]


# Cálculo del error propagado a la salida de D2

In [21]:
# Chain rule through activation A2: the sigmoid Jacobian times the error at
# A2's output gives the error at D2's output.
error_D2 = np.dot(sigmoid_jac, error_A2)
print(error_D2)

[-0.0118782  -0.03549795]


# Cálculo del error propagado a la salida de D1

In [22]:
# Back-propagate through dense layer D2. The forward pass computes
# np.dot(A1_out, weights[2]), so multiplying weights[2] by the D2 error maps
# that error back onto each of A1's outputs.
error_A1 = np.dot(weights[2], error_D2)

In [23]:
# Jacobian of the sigmoid activation A1, evaluated at D1's output: a diagonal
# matrix with entries s * (1 - s).
# NOTE: this rebinds `sigmoid` and `sigmoid_jac`; from here on they refer to
# layer 1, so cells must be run in order.
sigmoid = np.divide(1, 1 + np.exp(-D1_out))
sigmoid_jac = np.diag(np.multiply(sigmoid, 1 - sigmoid).ravel())

In [24]:
# Chain rule through activation A1: the sigmoid Jacobian times the error at
# A1's output gives the error at D1's output.
error_D1 = np.dot(sigmoid_jac, error_A1)
print(error_D1)

[ 0.00380889 -0.00120508 -0.00164512]


# Cálculo del gradiente de los pesos de D1, D2, D3

In [25]:
# Gradients for dense layer D1.
# Weights: outer product of the layer's input (X) with the error at its output,
# so entry (i, j) is X_i * error_D1_j, laid out like weights[0].
g_1_ws = np.outer(X, error_D1)
# Bias: the bias is added directly to the layer output, so its gradient is the error itself.
g_1_b = error_D1
print(g_1_ws, g_1_b, sep="\n")

[[ 0.01295024 -0.00409727 -0.00559341]
 [ 0.00799867 -0.00253067 -0.00345476]]
[ 0.00380889 -0.00120508 -0.00164512]


In [26]:
# Gradients for dense layer D2.
# Weights: outer product of the layer's input (A1_out) with the error at its
# output, so entry (i, j) is A1_out_i * error_D2_j, laid out like weights[2].
g_2_ws = np.outer(A1_out, error_D2)
# Bias: the bias is added directly to the layer output, so its gradient is the error itself.
g_2_b = error_D2
print(g_2_ws, g_2_b, sep="\n")

[[-0.00535625 -0.01600712]
 [-0.0108144  -0.03231878]
 [-0.00965336 -0.02884902]]
[-0.0118782  -0.03549795]


In [27]:
# Gradients for dense layer D3.
# Weights: outer product of the layer's input (A2_out) with the error at its
# output, so entry (i, j) is A2_out_i * error_D3_j, laid out like weights[4].
g_3_ws = np.outer(A2_out, error_D3)
# Bias: the bias is added directly to the layer output, so its gradient is the error itself.
g_3_b = error_D3
print(g_3_ws, g_3_b, sep="\n")

[[-0.05900569  0.00456147  0.05444422]
 [-0.08949681  0.0069186   0.08257822]]
[-0.16048242  0.01240618  0.14807624]
