# Possível Solução para Lab I (LUCAS)

In [46]:
import numpy as np
import matplotlib.pyplot as plt

## Definição do *dataset*

In [2]:
X = np.array([[0.5, 0.1], [0.2, 0.6]])
print(X.shape)
Y = np.array([[0.7], [0.8]])
print(Y.shape)

(2, 2)
(2, 1)


## Definição dos Parâmetros Treináveis

In [3]:
theta1 = np.array([[0.5, 0.2], 
                   [0.6, -0.1], 
                   [-0.4, -0.3]])
print(theta1.shape)

theta2 = np.array([[0.7, -0.1, 0.2]])
print(theta2.shape)

(3, 2)
(1, 3)


## Forward Pass

Nosso objetivo aqui é calcular o valor da saída da rede

$z^{(1)} = X\cdot\theta^{(1)^\top}$

In [4]:
z1 = np.matmul(X, theta1.T)
print(z1.shape)
print(z1)

(2, 3)
[[ 0.27  0.29 -0.23]
 [ 0.22  0.06 -0.26]]


$sigmoid(z)=\frac{1}{1+e^{-z}}$

In [5]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

$a^{(1)} = sigmoid(z^{(1)})$

In [6]:
a1 = sigmoid(z1)

$z^{(2)} = a^{(1)}\cdot\theta^{(2)^\top}$

In [7]:
z2 = np.matmul(a1, theta2.T)
print(z2.shape)
print(z2)

(2, 1)
[[0.42831585]
 [0.42391866]]


$a^{(2)} = sigmoid(z^{(2)})$

In [8]:
a2 = sigmoid(z2)
print(a2.shape)
print(a2)

(2, 1)
[[0.60547144]
 [0.60442057]]


## Loss Function

Nosso objetivo aqui é computar quão diferente a saída da rede está em relação ao valor da variável alvo $y$.

$loss = \frac{1}{2}(y-\hat{y})^T(y-\hat{y})$

In [9]:
loss = 0.5*np.matmul((Y-a2).T,(Y-a2))
print(loss)

[[0.02359348]]


## Backward Pass

Nosso objetivo aqui é computar os gradientes da função de custo em relação aos parâmetros treináveis, $\frac{\partial J}{\partial \theta_1}$ e $\frac{\partial J}{\partial \theta_2}$.

$\delta^{(2)} = \frac{\partial J}{\partial a_2}\frac{\partial a2}{\partial z_2}$

In [10]:
delta2 = (a2 - Y) * a2 * (1 - a2)
delta2

array([[-0.02258058],
       [-0.04676233]])

$\delta^{(1)} = \frac{\partial J}{\partial a_2}\frac{\partial a2}{\partial z_2}\frac{\partial z_2}{\partial a_1}\frac{\partial a1}{\partial z_1} = \delta^{(2)}\theta^{(2)}\odot a'^{(1)}$

In [11]:
delta1 = np.matmul(delta2, theta2) * a1 * (1 - a1)
delta1

array([[-0.00388045,  0.00055281, -0.00111423],
       [-0.00808518,  0.00116801, -0.00229904]])

$\frac{\partial J}{\partial \theta_2} = \delta^{(2)^\top} a^{(1)}$

In [12]:
dJdtheta2 = np.matmul(delta2.T, a1)
print(dJdtheta2.shape)
print(dJdtheta2)

(1, 3)
[[-0.03874806 -0.03699839 -0.03035622]]


$\frac{\partial J}{\partial \theta_1} = \delta^{(1)^\top} X$

In [13]:
dJdtheta1 = np.matmul(delta1.T, X)
print(dJdtheta1.shape)
print(dJdtheta1)

(3, 2)
[[-0.00355726 -0.00523915]
 [ 0.00051001  0.00075609]
 [-0.00101692 -0.00149085]]


## Optimizer

Vamos usar os gradientes calculados no *backward pass* para atualizar os parâmetros treináveis.

$\theta = \theta -\eta\frac{\partial J}{\partial \theta}$

In [14]:
theta2 = theta2 - 0.1 * dJdtheta2
theta1 = theta1 - 0.1 * dJdtheta1
print(theta2)
print(theta1)

[[ 0.70387481 -0.09630016  0.20303562]]
[[ 0.50035573  0.20052392]
 [ 0.599949   -0.10007561]
 [-0.39989831 -0.29985092]]


# MLP (minha versão)

## Forward Pass

In [6]:
import numpy as np
import math

In [7]:
theta_um = np.array([[0.5, 0.2],
                     [0.6, -0.1], 
                     [-0.4, -0.3]])
print(theta_um.shape)

theta_dois = np.array([[0.7, -0.1, 0.2]])
print(theta_dois.shape)

(3, 2)
(1, 3)


In [8]:
X = np.array([[0.5, 0.1], [0.2, 0.6]])
print(X.shape)
Y = np.array([[0.6], 
              [0.8]])
print(Y.shape)

(2, 2)
(2, 1)


In [20]:
def sigmoid(z):
    return 1/ (1 + np.exp(-z))
sigmoid(0) # tem que ser 0.5

0.5

In [21]:
z1 = np.matmul(X, theta_um.T)
z1

array([[ 0.27,  0.29, -0.23],
       [ 0.22,  0.06, -0.26]])

In [22]:
a1 = sigmoid(z1)
a1

array([[0.5670929 , 0.57199613, 0.44275215],
       [0.55477924, 0.5149955 , 0.43536371]])

In [29]:
z2 =np.matmul(a1, theta_dois.T)
z2

array([[0.42831585],
       [0.42391866]])

In [30]:
a2 = sigmoid(z2)
a2

array([[0.60547144],
       [0.60442057]])

## Função de custo

In [34]:
r =  Y - a2
loss =np.matmul(r.T, r)/2
loss

array([[0.01914063]])

## Backward Pass

In [35]:
# dJda2 == derivada de J em relação a A2
dJda2 = a2 - Y
da2dz2 = a2 * (1 - a2)

delta_dois = dJda2 * da2dz2
delta_dois

array([[ 0.00130699],
       [-0.04676233]])

In [None]:
da1dz1 = a1 * (1- a1)
delta_um = np.matmul(delta_dois, theta_dois) * da1dz1

delta_um

array([[ 2.24605550e-04, -3.19973721e-05,  6.44930033e-05],
       [-8.08518134e-03,  1.16800666e-03, -2.29904316e-03]])

# Gradiente

In [40]:
djdteta2 = np.matmul(delta_dois.T, a1)
djdteta2

djdteta1 = np.matmul(delta_um.T, X)
djdteta1

array([[-0.00150473, -0.00482865],
       [ 0.0002176 ,  0.0006976 ],
       [-0.00042756, -0.00137298]])

## Otimizador

In [43]:
eta = 0.1 # learning rate
teta1 = theta_um - eta * djdteta1
teta2 = theta_dois - eta * djdteta2

print(teta1, teta1.shape)
print(teta2, teta2. shape)

[[ 0.50015047  0.20048286]
 [ 0.59997824 -0.10006976]
 [-0.39995724 -0.2998627 ]] (3, 2)
[[ 0.70252016 -0.09766652  0.20197799]] (1, 3)


# Generalizar

In [None]:
def delta_L(aL, Y):
  dj_daL = aL - 
  daLdzL = aL * (1 - aL)
  δ_L = dj_daL * daLdzL
  return δ_L

δ_2 = delta_L(a2, Y)
print(δ_2)

def delta_l(delta_l_plus_one, theta_l_plus_one, al):
  daldzl = al * (1 - al)
  δ_l = np.matmul(delta_l_plus_one, theta_l_plus_one) * daldzl
  return δ_l

#  da1dz1 = a1 * (1 - a1)
#  δ_1 = np.matmul(δ_2, θ_2) * da1dz1
δ_1 = delta_l(δ_2, θ_2, a1)
print("δ_1: ", δ_1)

def gradient(delta_i, a_previous):
  return np.matmul(delta_i.T, a_previous) 

#  djdθ2 = np.matmul(δ_2.T, a1)
djdθ2 = gradient(δ_2, a1)
print("djdθ2: ", djdθ2)

#  djdθ1 = np.matmul(δ_1.T, X)
djdθ1 = gradient(δ_1, X)
print("djdθ1: ", djdθ1)

def optimize(old_theta, gradient, eta=0.1):
  return old_theta - eta * gradient

# θ = θ - η(∂J / ∂θ)
#  η = 0.1
#  θ1_new = θ_1 - η * djdθ1
θ1_new = optimize(θ_1, djdθ1)
print("θ1_new: ", θ1_new, θ1_new.shape)

#  θ2_new = θ_2 - η * djdθ2
θ2_new = optimize(θ_2, djdθ2)
print("θ2_new: ", θ2_new, θ2_new.shape)

epochs = 1000
η = 0.1
losses = []

for epoch in range(epochs):
    z1 = np.matmul(X, θ_1.T)
    a1 = sigmoid(z1)
    z2 = np.matmul(a1, θ_2.T)
    a2 = sigmoid(z2)
    
    r = y - a2
    loss = np.matmul(r.T, r) / 2
    losses.append(loss[0, 0])

    δ_2 = delta_L(a2, y)
    δ_1 = delta_l(δ_2, θ_2, a1)
    
    djdθ2 = gradient(δ_2, a1)
    djdθ1 = gradient(δ_1, X)
    
    θ_1 = optimize(θ_1, djdθ1, η)
    θ_2 = optimize(θ_2, djdθ2, η)
    if (epoch + 1) % 100 == 0:
        print(f"Época {epoch + 1}: Loss = {loss[0, 0]:.6f}")

plt.figure(figsize=(10, 6))
plt.plot(range(1, epochs + 1), losses, 'b-', linewidth=2)
plt.xlabel('Época', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('Loss x Época de Treinamento', fontsize=14)
plt.grid(True, alpha=0.3)
plt.xlim(1, epochs)
plt.show()

print(f"\nLoss inicial: {losses[0]:.6f}")
print(f"Loss final: {losses[-1]:.6f}")
print(f"Redução da loss: {(losses[0] - losses[-1]) / losses[0] * 100:.2f}%")

[[ 0.00130699]
 [-0.04676233]]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 1)

# Formulas
---
## Forward Pass 
---
$z^{(1)} = X\cdot\theta^{(1)^\top}$
---
$a^{(1)} = sigmoid(z^{(1)})$
---
$z^{(2)} = a^{(1)}\cdot\theta^{(2)^\top}$
---
$a^{(2)} = sigmoid(z^{(2)})$
---
## Função de Custo
$loss = \frac{1}{2}(y-\hat{y})^T(y-\hat{y})$
---
## Backward Pass:
$\delta^{(2)} = \frac{\partial J}{\partial a_2}\frac{\partial a2}{\partial z_2}$
---
$\delta^{(1)} = \frac{\partial J}{\partial a_2}\frac{\partial a2}{\partial z_2}\frac{\partial z_2}{\partial a_1}\frac{\partial a1}{\partial z_1} = \delta^{(2)}\theta^{(2)}\odot a'^{(1)}$
---
$\frac{\partial J}{\partial \theta_2} = \delta^{(2)^\top} a^{(1)}$
---
$\frac{\partial J}{\partial \theta_1} = \delta^{(1)^\top} X$
---
## Otimizador
---
$\theta = \theta -\eta\frac{\partial J}{\partial \theta}$
---