In [1]:
## Based on code at:
#  - https://docs.pytorch.org/tutorials/beginner/pytorch_with_examples.html
#  - Cleaned up, with hard-coded values replaced by named variables for clarity

Basic Numpy

In [2]:
import numpy as np
import math

Num_data_samples = 2000
Random_seed = 42

# Build the training set: evenly spaced inputs over [-pi, pi] and the sine of
# each input as the target. Nothing about the data itself is random.
x = np.linspace(-math.pi, math.pi, Num_data_samples)
y = np.sin(x)

# Draw the four starting coefficients of y = a + b x + c x^2 + d x^3 from a
# standard normal distribution. A dedicated, seeded Generator gives the same
# starting point on every Restart & Run All and leaves NumPy's global random
# state untouched.
rng = np.random.default_rng(Random_seed)
a = rng.standard_normal()
b = rng.standard_normal()
c = rng.standard_normal()
d = rng.standard_normal()


In [3]:
# Step size for gradient descent (deliberately small) and number of passes.
learning_rate = 1e-6
Num_rounds = 2400

# The powers of x are the same on every round, so build them once.
x_squared = x ** 2
x_cubed = x ** 3

for t in range(Num_rounds):
    # Forward pass: evaluate the cubic a + b x + c x^2 + d x^3 at every sample.
    y_pred = a + b * x + c * x_squared + d * x_cubed

    # Sum-of-squares loss, reported on every 100th round.
    residual = y_pred - y
    loss = np.square(residual).sum()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: gradient of the loss with respect to y_pred, then the
    # chain rule through each polynomial term to reach a, b, c and d.
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x_squared).sum()
    grad_d = (grad_y_pred * x_cubed).sum()

    # Gradient-descent step on each coefficient.
    a = a - learning_rate * grad_a
    b = b - learning_rate * grad_b
    c = c - learning_rate * grad_c
    d = d - learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 1617.7257491795092
199 1074.4080122492314
299 714.6452743319054
399 476.3995214523051
499 318.6099773697166
599 214.09505920277644
699 144.85961435186064
799 98.98925392484904
899 68.59490721602583
999 48.45236802689706
1099 35.1017816162582
1199 26.251534353191595
1299 20.383618562434613
1399 16.492356512505026
1499 13.911402725324804
1599 12.199187258433536
1699 11.063050443184833
1799 10.30899586327683
1899 9.808407110819058
1999 9.475998622466731
2099 9.255206926554628
2199 9.10851026551354
2299 9.0110130530381
2399 8.946193411246012
Result: y = -0.0038022392025033828 + 0.8462655372239958 x + 0.0006559495319652625 x^2 + -0.09184036800323231 x^3


Using PyTorch

In [4]:
# Uncomment to install or upgrade PyTorch in the kernel's environment
# %pip install -U torch



In [5]:
import torch
import math

# Every tensor in this notebook is created on the CPU. To run on an accelerator
# (CUDA, MPS, MTIA, XPU, ...) instead, swap in the commented-out line below; it
# relies on the `torch.accelerator` API described at
# https://pytorch.org/docs/stable/torch.html#accelerators

dtype = torch.float
device = "cpu"
#device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")
torch.set_default_device(device)

# Seed PyTorch's random number generator so the randomly drawn starting
# weights below are the same on every Restart & Run All.
Random_seed = 42
torch.manual_seed(Random_seed)

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
Num_data_samples = 2000
x = torch.linspace(-math.pi, math.pi, Num_data_samples, dtype=dtype)
y = torch.sin(x)

# Create random 0-dimensional Tensors for the weights. A third order
# polynomial needs 4 of them: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.randn((), dtype=dtype, requires_grad=True)
b = torch.randn((), dtype=dtype, requires_grad=True)
c = torch.randn((), dtype=dtype, requires_grad=True)
d = torch.randn((), dtype=dtype, requires_grad=True)

Using cpu device


In [6]:
learning_rate = 1e-6
Num_rounds = 2400

# x was created without requires_grad, so its powers are plain constants that
# can be computed once instead of on every round.
x_squared = x ** 2
x_cubed = x ** 3
weights = (a, b, c, d)

for t in range(Num_rounds):
    # Forward pass: evaluate the cubic with Tensor operations so autograd
    # records how the prediction depends on a, b, c and d.
    y_pred = a + b * x + c * x_squared + d * x_cubed

    # Sum-of-squares loss. The result is a 0-dimensional Tensor; .item()
    # pulls out the Python number it holds for printing.
    loss = ((y_pred - y) ** 2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass: autograd fills a.grad, b.grad, c.grad and d.grad with the
    # gradient of the loss with respect to each weight.
    loss.backward()

    # Gradient-descent step. The in-place updates run under torch.no_grad()
    # so they are not themselves recorded by autograd. Each gradient is
    # cleared right after use so the next backward() starts from scratch.
    with torch.no_grad():
        for w in weights:
            w -= learning_rate * w.grad
            w.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 788.2459106445312
199 529.4957885742188
299 356.85321044921875
399 241.600830078125
499 164.61688232421875
599 113.16415405273438
699 78.75384521484375
799 55.726043701171875
899 40.30509948730469
999 29.970699310302734
1099 23.039966583251953
1199 18.388355255126953
1299 15.26382827758789
1399 13.163288116455078
1499 11.749972343444824
1599 10.798200607299805
1699 10.156644821166992
1799 9.723793029785156
1899 9.431455612182617
1999 9.233831405639648
2099 9.10009765625
2199 9.009488105773926
2299 8.948043823242188
2399 8.906332969665527
Result: y = -0.006694426294416189 + 0.8499247431755066 x + 0.0011548977345228195 x^2 + -0.09236085414886475 x^3


Illustrating PyTorch Optimization

In [7]:
# -*- coding: utf-8 -*-
import torch
import math

Num_data_samples = 2000

# torch.nn.Linear draws its initial weight and bias at random. Seeding first
# makes the model start from the same parameters on every Restart & Run All,
# and also whenever this cell is re-run to reset the model.
Random_seed = 42
torch.manual_seed(Random_seed)

# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, Num_data_samples)
y = torch.sin(x)

# Prepare the input tensor (x, x^2, x^3): unsqueeze turns x into a column of
# shape (Num_data_samples, 1), and raising it to the exponents in p broadcasts
# to shape (Num_data_samples, 3), one column per power.
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Use the nn package to define our model and loss function.
# Linear(3, 1) computes bias + w1*x + w2*x^2 + w3*x^3 for each row of xx;
# Flatten(0, 1) collapses the resulting (N, 1) output to shape (N,) so it
# lines up with y. reduction='sum' makes the loss a sum of squared errors.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')

In [10]:
# Let the optim package update the weights for us. RMSprop is used here; the
# package offers many other optimization algorithms. The first constructor
# argument tells the optimizer which Tensors it is responsible for updating.

# NOTE: learning_rate = 1e-6 (the value used in the earlier cells) seemed to
# converge more slowly here, hence the larger step size.
learning_rate = 1e-3
Num_rounds = 2400

optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(Num_rounds):
    # Gradients accumulate across backward() calls by default rather than
    # being overwritten, so clear the ones this optimizer manages before
    # computing new ones. See the torch.autograd.backward docs for details.
    optimizer.zero_grad()

    # Forward pass: run the prepared inputs through the model.
    y_pred = model(xx)

    # Loss against the targets, reported on every 100th round.
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass: gradient of the loss with respect to the model parameters.
    loss.backward()

    # Let the optimizer apply one update to its parameters.
    optimizer.step()


# The Linear layer is the first module in the Sequential container; its bias
# is the constant term and its weight columns are the x, x^2 and x^3 terms.
linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 32466.4921875
199 15057.2236328125
299 5981.9755859375
399 1732.780517578125
499 290.064453125
599 27.054338455200195
699 10.477029800415039
799 9.66016960144043
899 9.270905494689941
999 9.075874328613281
1099 8.944184303283691
1199 8.85744857788086
1299 8.823195457458496
1399 8.817428588867188
1499 8.817171096801758
1599 8.855120658874512
1699 8.817290306091309
1799 8.965662002563477
1899 8.997116088867188
1999 8.890856742858887
2099 8.873276710510254
2199 8.90768051147461
2299 8.94318962097168
2399 8.923628807067871
Result: y = -0.0005040726391598582 + 0.8572428226470947 x + -0.0005040737451054156 x^2 + -0.09282831102609634 x^3
