In [9]:
import numpy as np
import torch
import torch.nn as nn

gpu = torch.cuda.current_device()
torch.cuda.get_device_name()

'NVIDIA GeForce GTX 1650 with Max-Q Design'

Do a gradient descent completely manually with numpy

In [18]:
# linear regression has formula f(x) = w * x
# so pytorch has to actually predict w

# let us say f(x) = 2 * x
# (pytorch doesn't know this. it has to predict that w=2)
X = np.array([1,2,3,4,5,6,7,8], dtype=np.float32)
Y = np.array([2,4,6,8,10,12,14,16], dtype=np.float32)

# initially set w to 0
w = 0.0

# model prediction
def forward(x):
    return w * x

# loss: MSE = 1/N * (wx - y)**2
def loss(y, y_predicted):
    return ((y_predicted - y)**2).mean()

# gradient
# dloss/dw = 1/N * 2 * (wx-y) * x
# dloss/dw = 1/N * 2 * x . (wx-y) <- dot product
def gradient(x, y, y_predicted):
    return np.dot(2 * x, y_predicted - y).mean()

# training
learning_rate = 0.001
n_iters = 25

print(f"Prediction before training for f(5) == {forward(5)}")
for epoch in range(n_iters):

    # prediction -> forward pass
    y_pred = forward(X)

    # loss
    l = loss(Y, y_pred)

    # gradients
    dw = gradient(X, Y, y_pred)

    # now update weights based on gradients
    w -= learning_rate * dw

    # print info
    if epoch % 2 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')

print(f"Prediction after training for f(5) == {forward(5)}")

Prediction before training for f(5) == 0.0
epoch 1: w = 0.816, loss = 102.00000000
epoch 3: w = 1.585, loss = 12.52815056
epoch 5: w = 1.855, loss = 1.53877044
epoch 7: w = 1.949, loss = 0.18899959
epoch 9: w = 1.982, loss = 0.02321389
epoch 11: w = 1.994, loss = 0.00285125
epoch 13: w = 1.998, loss = 0.00035020
epoch 15: w = 1.999, loss = 0.00004301
epoch 17: w = 2.000, loss = 0.00000528
epoch 19: w = 2.000, loss = 0.00000065
epoch 21: w = 2.000, loss = 0.00000008
epoch 23: w = 2.000, loss = 0.00000001
epoch 25: w = 2.000, loss = 0.00000000
Prediction after training for f(5) == 9.999979684352876


Now lets replace gradient calculation with builtin methods

In [19]:
X = torch.tensor([1,2,3,4,5,6,7,8], dtype=torch.float32)
Y = torch.tensor([2,4,6,8,10,12,14,16], dtype=torch.float32)

# instead of writing gradient function myself, use existing method
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# model prediction
def forward(x):
    return w * x

# loss: MSE = 1/N * (wx - y)**2
def loss(y, y_predicted):
    return ((y_predicted - y)**2).mean()

# training
learning_rate = 0.001
n_iters = 100

print(f"Prediction before training for f(5) == {forward(5)}")
for epoch in range(n_iters):

    # prediction -> forward pass
    y_pred = forward(X)

    # loss
    l = loss(Y, y_pred)

    # gradients == backward pass == dl/dw
    l.backward()
    # these backward values will be accumulated in w.grad (dw = w.grad)

    # now update weights based on gradients
    with torch.no_grad():
        w -= learning_rate * w.grad

    # now reset gradients
    w.grad.zero_()

    # print info
    if epoch % 10 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')

print(f"Prediction after training for f(5) == {forward(5)}")

Prediction before training for f(5) == 0.0


AttributeError: 'numpy.float32' object has no attribute 'backward'

The grad of torch is not as accurate as numerical gradient I created manually
So need more steps to get correct answer

Now lets replace other manual functions with pytorch builtins

Normal pytorch design paradigm:

1. Design model (input, output, forward pass)
2. Construct loss and optimizer
3. Training loop
    1. forward pass: compute prediction
    2. backward pass: gradients
    3. update weights

In [12]:
X = torch.tensor([[1], [2], [3], [4]], dtype=torch.float32)
Y = torch.tensor([[2], [4], [6], [8]], dtype=torch.float32)

X_test = torch.tensor([5], dtype=torch.float32)
n_samples, n_features = X.shape
print(n_samples, n_features)

input_size = n_features
output_size = n_features

# model prediction
# use pytorch models -> no need for weights anymore
model = nn.Linear(in_features=input_size, out_features=output_size)

# replace loss with built in torch methods
loss = nn.MSELoss()

# training
learning_rate = 0.001
n_iters = 100

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

print(f"Prediction before training for f(5) == {model(X_test).item()}")

for epoch in range(n_iters):

    # prediction -> forward pass
    y_pred = model(X)

    # loss
    l = loss(Y, y_pred)

    # gradients == backward pass == dl/dw
    l.backward()
    # these backward values will be accumulated in w.grad (dw = w.grad)

    # now update weights based on gradients
    # automatic
    optimizer.step()

    # now reset gradients
    optimizer.zero_grad()

    # print info
    if epoch % 10 == 0:
        [w, b] = model.parameters()
        print(f'epoch {epoch+1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

print(f"Prediction after training for f(5) == {model(X_test).item()}")

4 1
Prediction before training for f(5) == 4.6803364753723145
epoch 1: w = 1.003, loss = 9.04720402
epoch 11: w = 1.153, loss = 6.46027613
epoch 21: w = 1.280, loss = 4.61313438
epoch 31: w = 1.388, loss = 3.29422188
epoch 41: w = 1.478, loss = 2.35247684
epoch 51: w = 1.555, loss = 1.68004370
epoch 61: w = 1.620, loss = 1.19990563
epoch 71: w = 1.675, loss = 0.85707104
epoch 81: w = 1.721, loss = 0.61227602
epoch 91: w = 1.760, loss = 0.43748266
Prediction after training for f(5) == 8.967496871948242


In [13]:
# suppose we want a custom model
class MyLinearRegression(nn.Module):

    def __init__(self, in_features, out_features):
        super(MyLinearRegression, self).__init__()

        # define layers
        self.lin = nn.Linear(in_features, out_features)

    def forward(self, X):
        return self.lin(X)

model = MyLinearRegression(4, 1)
