In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [22]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        
    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:]    # all dimensions except the batch dimension???
        num_features = 1
        print('size of x: ', x.size())
        for s in size:
            num_features *= s
        return num_features

In [18]:
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [21]:
params = list(net.parameters())
print(len(params))
print(params[1])    # conv1's .weight

10
Parameter containing:
tensor([-0.1871, -0.1161,  0.1710,  0.0899,  0.0018,  0.1119],
       requires_grad=True)


In [5]:
print(torch.__version__)

1.1.0


In [23]:
img = torch.randn(1, 1, 32, 32)
out = net(img)
print(out)

size of x:  torch.Size([1, 16, 5, 5])
tensor([[ 0.1038, -0.0736, -0.0909,  0.1255, -0.0222,  0.0582,  0.0834, -0.0961,
          0.0763,  0.0220]], grad_fn=<AddmmBackward>)


In [7]:
net.zero_grad()
out.backward(torch.randn(1, 10))

In [8]:
output = net(img)
target = torch.randn(10)    # a dummy target, for example
target = target.view(1, -1)    # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

tensor(1.1589, grad_fn=<MseLossBackward>)


In [9]:
print(loss.grad_fn)    # MSELoss
print(loss.grad_fn.next_functions[0][0])    # Linear
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])    # ReLU

<MseLossBackward object at 0x7f1157cac0f0>
<AddmmBackward object at 0x7f1157cac208>
<AccumulateGrad object at 0x7f1157cac0f0>


In [10]:
net.zero_grad()    # zeros the gradient buffers of all paprameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0104,  0.0139, -0.0099, -0.0025,  0.0044,  0.0093])


In [11]:
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)
    # Conv2d -> 4D Tensor: nSamples x nChannels x Height x Width
    #                  or: output   x input     x H      X W ???
    print(f.size())

torch.Size([6, 1, 5, 5])
torch.Size([6])
torch.Size([16, 6, 5, 5])
torch.Size([16])
torch.Size([120, 400])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


In [12]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()    # zero the gradient buffers
output = net(img)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # does the update

In [13]:
x = torch.randn(1)
print(x)
print(x.item())

tensor([0.0212])
0.021170184016227722


In [14]:
# let us run this cell only if CUDA is available
# We will use ``torch.device`` objects to move tensors in and out of GPU
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can also change dtype together!

    Found GPU0 GeForce GTX 660 which is of cuda capability 3.0.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability that we support is 3.5.
    


RuntimeError: cuda runtime error (48) : no kernel image is available for execution on the device at /pytorch/aten/src/THC/generic/THCTensorMath.cu:16

In [None]:
torch.cuda.is_available()