In [1]:
#PyTorch: nn
#Computational graphs and autograd are a very powerful paradigm for defining complex operators and automatically taking derivatives; however for large neural networks raw autograd can be a bit too low-level.

#When building neural networks we frequently think of arranging the computation into layers, some of which have learnable parameters which will be optimized during learning.

In [2]:
#A Module receives input Tensors and computes output Tensors,
#but may also hold internal state such as Tensors containing learnable parameters.
#The nn package also defines a set of useful loss functions that are commonly used when training neural networks.

In [3]:
import torch 
import math

In [4]:
# Inputs: 2000 evenly spaced sample points covering [-pi, pi].
# Targets: the sine of each sample point.
x = torch.linspace(start=-math.pi, end=math.pi, steps=2000)
y = x.sin()

In [5]:
# For this example, the output y is a linear function of (x, x^2, x^3), so
# we can consider it as a linear layer neural network. Let's prepare the
# tensor (x, x^2, x^3).

In [6]:
# Exponents 1, 2, 3. Raising the column view of x to them broadcasts into
# one column per power, i.e. the features (x, x^2, x^3).
p = torch.arange(1, 4)
xx = torch.pow(x.unsqueeze(-1), p)

In [7]:
#x.unsqueeze(-1) has shape (2000, 1) and p has shape (3,),
#so broadcasting semantics apply and the result is a tensor
#of shape (2000, 3).

In [8]:
#Use the nn package to define our model as a sequence of layers. nn.Sequential
#is a Module which contains other Modules, and applies them in sequence to
#produce its output. The Linear Module computes output from input using a
#linear function, and holds internal Tensors for its weight and bias.
#The Flatten layer flattens the output of the linear layer to a 1D tensor, to match the shape of `y`.

In [9]:
# Seed the global RNG before building the model: Linear draws its initial
# weight and bias from it, so without a fixed seed every kernel restart
# would train from a different starting point and print different losses.
torch.manual_seed(0)

# One Linear layer maps the three input features to a single output value;
# Flatten(0, 1) then merges dimensions 0 and 1 so the prediction is 1-D.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1),
)

In [10]:
# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.

In [12]:
# Squared errors are summed over all elements (not averaged).
loss_fn = torch.nn.MSELoss(reduction="sum")

In [13]:
# Step size used by the hand-written gradient-descent update in the training loop.
learning_rate = 1.0e-6

In [15]:
for t in range(2000):
    # Forward pass: predict from the polynomial features, then score the
    # prediction against the targets.
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)

    # Report the loss on every 100th iteration (t = 99, 199, ...).
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear the gradients left over from the previous iteration, then
    # backpropagate so that every learnable parameter gets a fresh .grad.
    model.zero_grad()
    loss.backward()

    # Gradient-descent step. It runs under no_grad so that the in-place
    # parameter update is not itself recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

# A Sequential can be indexed like a list; index 0 is the Linear layer.
linear_layer = model[0]

# The Linear layer exposes its coefficients as `weight` and its offset as `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')
        
       
         
    
        

99 1266.07421875
199 844.2590942382812
299 564.1411743164062
399 378.0689392089844
499 254.4310760498047
599 172.25282287597656
699 117.61331176757812
799 81.27111053466797
899 57.090110778808594
999 40.994300842285156
1099 30.27586555480957
1199 23.135208129882812
1299 18.375917434692383
1399 15.202280044555664
1499 13.084905624389648
1599 11.671426773071289
1699 10.7273530960083
1799 10.09642219543457
1899 9.674477577209473
1999 9.392143249511719
Result: y = 0.011751042678952217 + 0.8360802531242371 x + -0.002027250826358795 x^2 + -0.0903915986418724 x^3


In [16]:
#PyTorch: optim
#Up to this point we have updated the weights of our models by manually mutating the Tensors holding learnable parameters with torch.no_grad().
#This is not a huge burden for simple optimization algorithms like stochastic gradient descent, 
#but in practice we often train neural networks using more sophisticated optimizers like AdaGrad, RMSProp, Adam, etc.

In [17]:
#The optim package in PyTorch abstracts the idea of an optimization algorithm 
#and provides implementations of commonly used optimization algorithms.

In [18]:
import torch 
import time
# Precision and placement of the data tensors. They are defined first and
# passed explicitly to linspace so that editing these two values actually
# changes the tensors that get created.
# NOTE: other tensors and the model are not moved to `device` by this cell;
# move them as well before switching away from the CPU.
dtype = torch.float
device = torch.device('cpu')

# Create Tensors to hold input and outputs: 2000 points on [-pi, pi] and
# their sine. y inherits dtype and device from x.
x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype, device=device)
y = torch.sin(x)

In [19]:
#prepare holders for the inputs
#Prepare the input tensor (x, x^2, x^3)

In [20]:
# Powers 1..3; the column view of x raised to them broadcasts to the
# feature matrix (x, x^2, x^3).
p = torch.tensor((1, 2, 3))
xx = x.unsqueeze(-1) ** p

In [23]:
# Seed the global RNG before building the model: Linear draws its initial
# weight and bias from it, so without a fixed seed every kernel restart
# would train from a different starting point and print different losses.
torch.manual_seed(0)

# Sequential model: a Linear layer from three input features to one output,
# followed by Flatten(0, 1), which merges dimensions 0 and 1 into a 1-D result.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1),
)

# Squared errors are summed over all elements (not averaged).
loss_fun = torch.nn.MSELoss(reduction='sum')

In [24]:
# Step size handed to the optimizer.
learning_rate = 1.0e-3

In [27]:
# RMSprop is given the model's parameters here and updates them on each step() call.
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=learning_rate)

In [28]:
for t in range(2000):
    # Forward pass: predict from the polynomial features, then score the
    # prediction against the targets.
    y_pred = model(xx)
    loss = loss_fun(y_pred, y)

    # Report the loss on every 100th iteration (t = 99, 199, ...).
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass: reset the gradients through the optimizer, then
    # backpropagate the current loss.
    optimizer.zero_grad()
    loss.backward()

    # step() applies the optimizer's update to the parameters it was given.
    optimizer.step()

# Index 0 of the Sequential is the Linear layer holding the fitted coefficients.
linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 31576.798828125
199 15276.3896484375
299 6843.498046875
399 2829.45361328125
499 1379.159912109375
599 1016.84033203125
699 870.7041015625
799 728.7461547851562
899 587.0197143554688
999 456.8905944824219
1099 344.12969970703125
1199 249.4978790283203
1299 172.4946746826172
1399 112.39306640625
1499 68.08667755126953
1599 38.06401443481445
1699 20.32501792907715
1799 11.953274726867676
1899 9.360214233398438
1999 8.91268253326416
Result: y = 0.0005779920611530542 + 0.8522453904151917 x + 0.0005821323720738292 x^2 + -0.09227350354194641 x^3
