In [1]:
import torch

In [2]:
# Problem dimensions for the toy regression task.
BATCH_SIZE = 16
DIM_IN = 1000
HIDDEN_SIZE = 100
DIM_OUT = 10

# Seed torch's global random generator so the random tensors below (and any
# later draws from the same generator) are identical on every
# Restart & Run All instead of changing from run to run.
SEED = 0
torch.manual_seed(SEED)

# Random inputs and regression targets. Neither tensor tracks gradients.
x = torch.randn(BATCH_SIZE, DIM_IN, requires_grad=False)
y = torch.randn(BATCH_SIZE, DIM_OUT, requires_grad=False)

class TinyModel(torch.nn.Module):
  """Two-layer perceptron: Linear -> ReLU -> Linear.

  Args:
    dim_in: Number of input features. When None, the notebook-level
      DIM_IN constant is used.
    hidden_size: Width of the hidden layer. When None, HIDDEN_SIZE is used.
    dim_out: Number of output features. When None, DIM_OUT is used.
  """

  def __init__(self, dim_in=None, hidden_size=None, dim_out=None):
    super().__init__()
    # Fall back to the notebook constants only for sizes that were not
    # given, so a bare TinyModel() builds the same layers as before.
    dim_in = DIM_IN if dim_in is None else dim_in
    hidden_size = HIDDEN_SIZE if hidden_size is None else hidden_size
    dim_out = DIM_OUT if dim_out is None else dim_out

    self.layer1 = torch.nn.Linear(dim_in, hidden_size)
    self.relu = torch.nn.ReLU()
    self.layer2 = torch.nn.Linear(hidden_size, dim_out)

  def forward(self, x):
    """Apply layer1, ReLU and layer2 in sequence and return the result."""
    x = self.layer1(x)
    x = self.relu(x)
    x = self.layer2(x)
    return x

# Build the network with its default layer sizes (taken from DIM_IN,
# HIDDEN_SIZE and DIM_OUT).
model = TinyModel()

In [11]:
# Show the first ten entries of row 0 of layer1's weight matrix, then the
# weight's gradient buffer (expected to be None before any backward pass).
layer1_weight = model.layer1.weight
print(layer1_weight[0][:10])
print(layer1_weight.grad)

tensor([ 0.0112, -0.0042,  0.0257, -0.0009,  0.0228,  0.0296,  0.0184, -0.0062,
        -0.0295,  0.0125], grad_fn=<SliceBackward0>)
None


In [15]:
# Plain SGD over every model parameter, followed by a single forward pass
# on the whole batch.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)
prediction = model(x)
print(prediction.size())

torch.Size([16, 10])


In [17]:
# Sum of squared errors between the targets and the predictions.
squared_error = (y - prediction) ** 2
loss = squared_error.sum()
loss

tensor(226.9576, grad_fn=<SumBackward0>)

In [None]:
# Back-propagate the loss, then print the same weight slice together with
# the matching slice of its freshly populated gradient.
loss.backward()
layer1_weight = model.layer1.weight
print(layer1_weight[0][:10])
print(layer1_weight.grad[0][:10])

tensor([ 0.0112, -0.0042,  0.0257, -0.0009,  0.0228,  0.0296,  0.0184, -0.0062,
        -0.0295,  0.0125], grad_fn=<SliceBackward0>)
tensor([ 0.4507, -0.3378, -1.0488,  2.2511, -0.3311, -0.0488, -0.4433, -1.6705,
         0.1399,  0.8129])


In [20]:
# Apply one SGD update, then print the weight slice and the stored gradient
# slice again to compare them with the values before the step.
optimizer.step()
layer1_weight = model.layer1.weight
print(layer1_weight[0][:10])
print(layer1_weight.grad[0][:10])

tensor([ 0.0107, -0.0039,  0.0267, -0.0031,  0.0231,  0.0297,  0.0188, -0.0046,
        -0.0296,  0.0117], grad_fn=<SliceBackward0>)
tensor([ 0.4507, -0.3378, -1.0488,  2.2511, -0.3311, -0.0488, -0.4433, -1.6705,
         0.1399,  0.8129])


In [21]:
# Run five forward/backward passes with no zero_grad() in between and print
# the gradient slice after each one, so the effect of repeated backward()
# calls on .grad is visible.
for backward_pass in range(5):
  prediction = model(x)
  loss = torch.sum((y - prediction) ** 2)
  loss.backward()
  print(model.layer1.weight.grad[0][:10])

tensor([ 1.0226, -0.2810, -1.3845,  2.7138, -0.5061, -0.1703, -0.3826, -1.7349,
         0.5724,  0.5333])
tensor([ 1.5944, -0.2243, -1.7202,  3.1764, -0.6812, -0.2918, -0.3220, -1.7992,
         1.0048,  0.2537])
tensor([ 2.1663, -0.1675, -2.0559,  3.6391, -0.8563, -0.4133, -0.2613, -1.8635,
         1.4372, -0.0260])
tensor([ 2.7381, -0.1108, -2.3916,  4.1017, -1.0313, -0.5348, -0.2007, -1.9279,
         1.8696, -0.3056])
tensor([ 3.3099, -0.0541, -2.7272,  4.5644, -1.2064, -0.6563, -0.1400, -1.9922,
         2.3020, -0.5852])


In [22]:
# Reset the gradients; set_to_none=False keeps the .grad tensors in place
# (zero-filled) so they can still be indexed and printed.
optimizer.zero_grad(set_to_none=False)
layer1_grad = model.layer1.weight.grad
print(layer1_grad[0][:10])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
