In [25]:
import numpy as np
import torch

In [26]:
# Input matrix: 5 rows x 3 columns, stored as float32.
# NOTE: the name `input` shadows Python's built-in input(); it is kept
# because a later cell reads it.
input = np.array(
    [
        [73, 67, 43],
        [91, 88, 64],
        [87, 134, 58],
        [102, 43, 37],
        [69, 96, 70],
    ],
    dtype=np.float32,
)

In [27]:
# Target matrix: 5 rows x 2 columns, stored as float32
# (one row per row of the input matrix).
targets = np.array(
    [
        [56, 70],
        [81, 101],
        [119, 113],
        [22, 37],
        [103, 119],
    ],
    dtype=np.float32,
)

In [28]:
# Convert the NumPy arrays to tensors. torch.as_tensor wraps an ndarray
# without copying (like torch.from_numpy) but also accepts an existing
# tensor, so re-running this cell after `targets` has been rebound to a
# tensor no longer raises TypeError.
inputs = torch.as_tensor(input)
targets = torch.as_tensor(targets)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 113.],
        [ 22.,  37.],
        [103., 119.]])


In [29]:
# Weights and biases, drawn from a seeded local generator so that
# "Restart & Run All" reproduces the same starting point without touching
# the global RNG state.
init_rng = torch.Generator().manual_seed(42)
w = torch.randn(2, 3, generator=init_rng, requires_grad=True)  # shape (2, 3)
b = torch.randn(2, generator=init_rng, requires_grad=True)     # shape (2,)
print(w)
print(b)

tensor([[ 0.3088, -2.9144,  1.3369],
        [-0.2791,  0.5479, -0.1951]], requires_grad=True)
tensor([-0.3735,  0.8594], requires_grad=True)


In [33]:
# Define the model
def model(x):
  """Linear model: multiply x by the transposed weights w and add the bias b.

  Reads the notebook-level tensors w and b.
  """
  weighted = torch.matmul(x, w.t())
  return weighted + b

`@` denotes matrix multiplication in PyTorch, and `.t()` returns the transpose of a tensor.

In [30]:
# Transpose of the weight matrix: w is (2, 3), so w.t() is (3, 2).
w.t()

tensor([[ 0.3088, -0.2791],
        [-2.9144,  0.5479],
        [ 1.3369, -0.1951]], grad_fn=<TBackward0>)

In [31]:
# The same expression that model() evaluates, written out inline.
inputs @ w.t() + b

tensor([[-115.6079,    8.8065],
        [-143.1769,   11.1922],
        [-286.4966,   38.6822],
        [ -44.7271,  -11.2663],
        [-165.2652,   20.5450]], grad_fn=<AddBackward0>)

In [34]:
# Run the model on the inputs and show its (still untrained) predictions.
preds = model(inputs)
print(preds)

tensor([[-115.6079,    8.8065],
        [-143.1769,   11.1922],
        [-286.4966,   38.6822],
        [ -44.7271,  -11.2663],
        [-165.2652,   20.5450]], grad_fn=<AddBackward0>)


In [35]:
# Show the targets for comparison with the predictions.
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 113.],
        [ 22.,  37.],
        [103., 119.]])


# Loss Function
Before we improve our model, we need a way to evaluate how well it is performing. We can compare the model's predictions with the actual targets using the following method:
1. Calculate the difference between the two matrices (`preds` and `targets`).
2. Square all elements of the difference matrix to remove negative values.
3. Calculate the average of the elements in the resulting matrix.


In [42]:
def mse(t1, t2):
  """Return the mean squared error between two tensors.

  Squares the element-wise difference t1 - t2, sums every element and
  divides by the number of elements, yielding a 0-dim tensor.
  """
  delta = t1 - t2
  squared = delta.pow(2)
  return squared.sum() / delta.numel()

In [36]:
# Element-wise error between predictions and targets.
diff = preds - targets

In [40]:
# Sum of squared errors: the numerator that mse() divides by the element count.
torch.sum(diff**2)

tensor(349906.9688, grad_fn=<SumBackward0>)

In [41]:
# Element-wise squared errors.
diff**2

tensor([[ 29449.2793,   3744.6404],
        [ 50255.2656,   8065.4395],
        [164427.4844,   5523.1416],
        [  4452.5117,   2329.6321],
        [ 71966.2188,   9693.3818]], grad_fn=<PowBackward0>)

In [43]:
# Mean squared error of the current predictions.
loss = mse(preds, targets)
print(loss)

tensor(34990.6953, grad_fn=<DivBackward0>)


# Compute Gradients


In [44]:
# Backpropagate the loss; this fills .grad on w and b (both were created
# with requires_grad=True).
loss.backward()

In [45]:
# Weights next to the gradient of the loss w.r.t. the weights
# (w.grad has the same shape as w).
print(w)
print(w.grad)

tensor([[ 0.3088, -2.9144,  1.3369],
        [-0.2791,  0.5479, -0.1951]], requires_grad=True)
tensor([[-18704.4297, -22836.9141, -13298.5469],
        [ -6164.3677,  -6697.7529,  -4273.4307]])


The loss is a quadratic function of our weights and biases, and our objective is to find the set of weights where the loss is lowest. If we plot a graph of the loss w.r.t. any individual weight or bias element, it will look like the figure shown below. An important insight from calculus is that the gradient indicates the rate of change of the loss, i.e. the loss function's slope w.r.t. the weights and biases.

If a gradient element is positive:
1. Increasing the weight element's value slightly will increase the loss.
2. Decreasing the weight element's value slightly will decrease the loss.


If a gradient element is negative:
1. Increasing the weight element's value slightly will decrease the loss.
2. Decreasing the weight element's value slightly will increase the loss.

Before we proceed, we reset the gradients to zero by invoking the `.zero_()` method. We need to do this because PyTorch accumulates gradients: the next time we invoke `.backward()` on the loss, the new gradient values are added to the existing values, which may lead to unexpected results.

In [46]:
# Zero the gradients in place (the trailing underscore marks an in-place
# operation) and print them to confirm the reset.
w.grad.zero_()
b.grad.zero_()
print(w.grad)
print(b.grad)

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([0., 0.])


## Adjust the weights and biases using gradient descent

We'll reduce the loss and improve our model using the gradient descent optimization algorithm, which has the following steps:

1. Generate the predictions
2. Calculate the loss
3. Compute the gradients w.r.t. the weights and biases
4. Adjust the weights by subtracting a small quantity proportional to the gradient
5. Reset the gradients to zero

Let's implement the above step by step.

In [47]:
# Step 1: generate predictions with the current weights and biases.
preds = model(inputs)
print(preds)

tensor([[-115.6079,    8.8065],
        [-143.1769,   11.1922],
        [-286.4966,   38.6822],
        [ -44.7271,  -11.2663],
        [-165.2652,   20.5450]], grad_fn=<AddBackward0>)


In [49]:
# Step 2: calculate the loss.
loss = mse(preds, targets)
print(loss)

tensor(34990.6953, grad_fn=<DivBackward0>)


In [50]:
# Step 3: compute the gradients of the loss w.r.t. w and b.
loss.backward()

In [51]:
# Inspect the freshly computed gradients for the weights and the biases.
print(w.grad)
print(b.grad)

tensor([[-18704.4297, -22836.9141, -13298.5469],
        [ -6164.3677,  -6697.7529,  -4273.4307]])
tensor([-227.2547,  -74.4081])


In [52]:
# Steps 4 and 5: take one gradient-descent step on each parameter, then
# clear its gradient so the next backward() starts from zero.
with torch.no_grad():
  w.sub_(w.grad * 1e-5)
  w.grad.zero_()
  b.sub_(b.grad * 1e-5)
  b.grad.zero_()

# A few things to note above:
1. We use `torch.no_grad` to indicate to PyTorch that it shouldn't track, calculate, or modify gradients while we update the weights and biases.
2. We multiply the gradients by a very small number ($10^{-5}$ in this case) to ensure that we don't modify the weights by a very large amount. We want to take a small step in the downhill direction of the gradient, not a giant leap. This number is called the learning rate.

In [56]:
# Predictions after one gradient-descent step.
preds = model(inputs)
print(preds)

tensor([[ -80.9323,   19.6323],
        [ -97.5460,   25.4315],
        [-231.9068,   55.4995],
        [ -10.9060,   -0.5167],
        [-121.1244,   34.2204]], grad_fn=<AddBackward0>)


In [57]:
# Recompute the loss with the updated weights and biases.
loss = mse(preds, targets)

In [58]:
# Display the loss after the single update step.
loss

tensor(24522.8164, grad_fn=<DivBackward0>)

Train for multiple epochs

In [59]:
# Train for multiple epochs of full-batch gradient descent.
NUM_EPOCHS = 100
LEARNING_RATE = 1e-5  # same step size as the single manual update earlier

for epoch in range(NUM_EPOCHS):
  # Forward pass and loss on the whole dataset.
  preds = model(inputs)
  loss = mse(preds, targets)
  # Backward pass: fills w.grad and b.grad.
  loss.backward()
  # Update the parameters without autograd tracking, then clear the
  # gradients because backward() accumulates into .grad.
  with torch.no_grad():
    w -= w.grad * LEARNING_RATE
    b -= b.grad * LEARNING_RATE
    w.grad.zero_()
    b.grad.zero_()

In [60]:
# Calculate the loss after training.
preds = model(inputs)
loss = mse(preds,targets)
print(loss)

tensor(973.6049, grad_fn=<DivBackward0>)


In [61]:
# Display the trained model's predictions.
preds

tensor([[ 67.9784,  68.8003],
        [102.7303,  91.3133],
        [ 54.6313, 131.9175],
        [ 85.6193,  46.2112],
        [ 99.8900,  99.4562]], grad_fn=<AddBackward0>)

In [63]:
# Display the targets to compare against the predictions.
targets

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 113.],
        [ 22.,  37.],
        [103., 119.]])