In [16]:
import numpy as np
import torch
from torch.utils import data
from d2l import torch as d2l

In [17]:
# Ground-truth parameters that the model below is expected to recover.
true_w = torch.tensor([2, -3.4])
true_b = 4.2
# Generate 1000 examples with the d2l helper
# (presumably labels = features @ true_w + true_b + noise; see d2l.synthetic_data).
features, labels = d2l.synthetic_data(true_w, true_b, 1000)

In [18]:
def load_array(data_arrays, batch_size, is_train=True): #@save
    """Build a minibatch iterator over a group of aligned tensors.

    Args:
        data_arrays: iterable of tensors sharing the same first dimension;
            they are unpacked into a ``TensorDataset``.
        batch_size: number of examples per minibatch.
        is_train: when True the examples are reshuffled on every pass.

    Returns:
        A ``DataLoader`` yielding one batch per tensor in ``data_arrays``.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(
        tensor_dataset,
        batch_size=batch_size,
        shuffle=is_train,
    )
    return loader
# Minibatch size shared by every training loop in this notebook.
batch_size = 10
# Iterator over (features, labels) minibatches; is_train defaults to True,
# so the examples are shuffled on each pass.
data_iter = load_array((features, labels), batch_size)

In [19]:
# Peek at one minibatch from a fresh pass: a [features, labels] pair of tensors.
next(iter(data_iter))

[tensor([[ 0.5410,  0.0362],
         [ 0.8261, -1.6003],
         [ 1.2515, -1.1159],
         [ 0.6271,  0.7902],
         [ 1.2835,  0.7375],
         [-0.2747, -0.4686],
         [ 1.3218,  0.5434],
         [ 0.0323, -0.7965],
         [ 1.0484,  0.9792],
         [-0.4642,  1.4009]]),
 tensor([[ 5.1578],
         [11.2952],
         [10.5064],
         [ 2.7712],
         [ 4.2741],
         [ 5.2398],
         [ 5.0068],
         [ 6.9869],
         [ 2.9674],
         [-1.4870]])]

In [20]:
from torch import nn
# Linear regression as a one-layer network: 2 input features -> 1 output.
net = nn.Sequential(nn.Linear(2, 1))

In [21]:
# Inspect the container, the class of a linear layer, and the layer stored in it.
print(net)
print(type(nn.Linear(2, 1)))
print(net[0])  # index 0 is the only entry: the Sequential wraps a single nn.Linear layer

Sequential(
  (0): Linear(in_features=2, out_features=1, bias=True)
)
<class 'torch.nn.modules.linear.Linear'>
Linear(in_features=2, out_features=1, bias=True)


In [22]:
# `.weight` / `.bias` are Parameters (printed with requires_grad=True);
# `.data` exposes the plain tensor that holds their values.
print(f"Weight values: {net[0].weight}")
print(f"Weight data: {net[0].weight.data}")
print(f"Bias values: {net[0].bias}")
print(f"Bias data: {net[0].bias.data}")

Weight values: Parameter containing:
tensor([[-0.0621, -0.0414]], requires_grad=True)
Weight data: tensor([[-0.0621, -0.0414]])
Bias values: Parameter containing:
tensor([-0.1753], requires_grad=True)
Bias data: tensor([-0.1753])


In [23]:
# Overwrite the default initialization in place (a trailing underscore marks
# an in-place tensor method): weights ~ Normal(mean=0, std=0.01), bias = 0.
net[0].weight.data.normal_(0, 0.01)
net[0].bias.data.fill_(0)

tensor([0.])

In [24]:
# Confirm that the in-place initialization changed the stored values.
print(f"Weight values: {net[0].weight}")
print(f"Weight data: {net[0].weight.data}")
print(f"Bias values: {net[0].bias}")
print(f"Bias data: {net[0].bias.data}")

Weight values: Parameter containing:
tensor([[-0.0021, -0.0099]], requires_grad=True)
Weight data: tensor([[-0.0021, -0.0099]])
Bias values: Parameter containing:
tensor([0.], requires_grad=True)
Bias data: tensor([0.])


In [25]:
# net.parameters() is an iterator over every learnable tensor (here: weight, then bias).
print(tuple(net.parameters()))

(Parameter containing:
tensor([[-0.0021, -0.0099]], requires_grad=True), Parameter containing:
tensor([0.], requires_grad=True))


In [26]:
# Mean squared error; the default reduction averages over the minibatch.
loss = nn.MSELoss()
# Minibatch SGD over all parameters of the network.
trainer = torch.optim.SGD(net.parameters(), lr=0.03)

In [27]:
# Train with minibatch SGD and report the full-dataset loss after each epoch.
num_epochs = 3
for epoch in range(num_epochs):
    for X, y in data_iter:
        l = loss(net(X), y)  # net(X): 10x1 predictions, y: 10x1 targets
        trainer.zero_grad()  # gradients accumulate, so clear them before backward()
        l.backward()
        trainer.step()
    # Evaluation only: skip autograd bookkeeping for the 1000-example forward pass.
    with torch.no_grad():
        l = loss(net(features), labels)
    print(f"epoch {epoch + 1}, loss {l:f}")

epoch 1, loss 0.000224
epoch 2, loss 0.000101
epoch 3, loss 0.000100


In [28]:
# Compare the learned parameters with the ground truth used to generate the data.
w = net[0].weight.data
# The layer stores its weight as a 1x2 matrix; reshape it to match true_w.
print('error in estimating w:', true_w - w.reshape(true_w.shape))
b = net[0].bias.data
print('error in estimating b:', true_b - b)

error in estimating w: tensor([-4.9591e-05, -4.0388e-04])
error in estimating b: tensor([-0.0002])


# Takeaway

**We can initialize parameters in place by overwriting their values with tensor methods whose names end in an underscore (`_`), such as `normal_` and `fill_`.**

# Exercises

1. If we replace nn.MSELoss(reduction='sum') with nn.MSELoss(), how can we change the
learning rate for the code to behave identically? Why?

https://hy38.github.io/D2L-3-linear-regression-concise


In [29]:
def train_from_scratch(criterion, lr, num_epochs=3, seed=0):
    """Reset `net`, train it with minibatch SGD, and print the loss per epoch.

    The global RNG is seeded before the parameters are re-initialized and
    before `data_iter` is iterated, so two calls with the same `seed` start
    from the same weights and visit the minibatches in the same order. This
    is what makes runs with different (criterion, lr) pairs comparable.

    Uses the notebook globals `net`, `data_iter`, `features` and `labels`.

    Args:
        criterion: loss module called as criterion(prediction, target).
        lr: learning rate passed to torch.optim.SGD.
        num_epochs: number of passes over `data_iter`.
        seed: value passed to torch.manual_seed before training.

    Returns:
        The torch.optim.SGD optimizer that was used.
    """
    torch.manual_seed(seed)
    # Same initialization scheme as earlier in the notebook.
    net[0].weight.data.normal_(0, 0.01)
    net[0].bias.data.fill_(0)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        for X, y in data_iter:
            batch_loss = criterion(net(X), y)  # net(X): 10x1 predictions, y: 10x1 targets
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
        # Evaluation only: no autograd graph is needed here.
        with torch.no_grad():
            epoch_loss = criterion(net(features), labels)
        print(f"epoch {epoch + 1}, loss {epoch_loss:f}")
    return optimizer


num_epochs = 3
sum_lr = 0.03

# Summed loss: the gradient of a minibatch is batch_size times the gradient
# of the averaged loss. The printed value is the sum over all examples.
loss = nn.MSELoss(reduction='sum')
trainer = train_from_scratch(loss, lr=sum_lr, num_epochs=num_epochs)

# Averaged loss: multiplying the learning rate by batch_size (0.03 * 10 = 0.3)
# produces the same parameter updates as the summed loss above.
# The printed value is the mean over all examples.
loss = nn.MSELoss(reduction='mean')
trainer = train_from_scratch(loss, lr=sum_lr * batch_size, num_epochs=num_epochs)

epoch 1, loss 0.116181
epoch 2, loss 0.100443
epoch 3, loss 0.126965
epoch 1, loss 0.000127
epoch 2, loss 0.000144
epoch 3, loss 0.000110


2. Review the PyTorch documentation to see what loss functions and initialization methods are
provided. Replace the loss by Huber's loss.


In [34]:
# nn.SmoothL1Loss with its default beta=1.0 coincides with the Huber loss for
# delta=1.0: quadratic (like MSE) for small residuals, linear for large ones.
loss = nn.SmoothL1Loss(reduction='sum')
trainer = torch.optim.SGD(net.parameters(), lr=0.03)

num_epochs = 3
for epoch in range(num_epochs):
    for X, y in data_iter:
        l = loss(net(X), y)  # net(X): 10x1 predictions, y: 10x1 targets
        trainer.zero_grad()  # gradients accumulate, so clear them before backward()
        l.backward()
        trainer.step()
    # Evaluation only: skip autograd bookkeeping for the 1000-example forward pass.
    with torch.no_grad():
        l = loss(net(features), labels)
    print(f"epoch {epoch + 1}, loss {l:f}")

epoch 1, loss 0.051458
epoch 2, loss 0.050454
epoch 3, loss 0.051122


3. How do you access the gradient of net[0].weight?

In [53]:
# Smooth L1 with the default beta=1.0 is the Huber loss with delta=1.0.
loss = nn.SmoothL1Loss(reduction='sum')
trainer = torch.optim.SGD(net.parameters(), lr=0.03)

# Gradients are stored in the `.grad` attribute of a tensor and are filled in
# by backward(). The toy example below shows that repeated backward() calls
# ADD to `.grad` instead of replacing it.
w = torch.tensor((1., 2.), requires_grad=True)  # floats: only floating-point tensors can require grad
b = torch.ones(2, 1)
y = torch.matmul(w, b)
y.sum().backward(retain_graph=True)  # retain_graph=True keeps the graph for another backward()
print(w.grad)
print(w)
y.sum().backward(retain_graph=True)
print(w.grad)  # doubled: the new gradient was added to the previous one
print(w)
y.sum().backward()
print(w.grad)  # tripled: `.grad` keeps accumulating until it is reset
print(w)

# To reset, call `w.grad.zero_()` on the tensor, or `trainer.zero_grad()` for
# parameters managed by an optimizer.

# Answer to the exercise: the layer's gradient is read the same way.
# It holds the gradient of the most recent backward() through the network
# (None if backward() has not been called yet).
print(net[0].weight.grad)

tensor([1., 1.])
tensor([1., 2.], requires_grad=True)
tensor([2., 2.])
tensor([1., 2.], requires_grad=True)
tensor([3., 3.])
tensor([1., 2.], requires_grad=True)
