In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch

### Tensors

#### Basics

In [3]:
# torch.empty allocates storage without initializing it, so the values shown
# below are whatever happened to be in that memory -- not guaranteed zeros.
torch.empty(5, 4)

tensor([[-0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000]])

In [7]:
torch.rand(5, 4)

tensor([[0.6622, 0.1148, 0.8905, 0.6815],
        [0.1778, 0.3767, 0.6672, 0.8088],
        [0.8123, 0.7168, 0.8233, 0.0852],
        [0.9552, 0.5699, 0.5575, 0.4409],
        [0.7661, 0.3184, 0.2364, 0.6215]])

In [8]:
torch.zeros(5, 4)

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [9]:
x = torch.tensor([[1, 2, 3], [4, 5, 6]])
x

tensor([[1, 2, 3],
        [4, 5, 6]])

In [10]:
x.size()

torch.Size([2, 3])

In [11]:
y = torch.rand(5, 4)
y

tensor([[0.1635, 0.8862, 0.3673, 0.2578],
        [0.4794, 0.2664, 0.6679, 0.9998],
        [0.5770, 0.4683, 0.5257, 0.0643],
        [0.5916, 0.3146, 0.1784, 0.0361],
        [0.3267, 0.2900, 0.1372, 0.4784]])

In [12]:
# Show only the first row as a nested Python list (slice first, then convert).
y[:1].tolist()

[[0.1635279655456543,
  0.8862454891204834,
  0.36734676361083984,
  0.2578318119049072]]

In [13]:
# For a CPU tensor, .numpy() returns an ndarray view that shares memory with
# the tensor (no copy), so in-place changes to one are visible in the other.
y.numpy()

array([[ 0.16352797,  0.88624549,  0.36734676,  0.25783181],
       [ 0.47940516,  0.2664358 ,  0.6679309 ,  0.99979913],
       [ 0.57697415,  0.46834332,  0.52573955,  0.06427157],
       [ 0.59158486,  0.31462371,  0.17836815,  0.03613251],
       [ 0.32673222,  0.28996468,  0.13719589,  0.47843015]], dtype=float32)

#### GPU

In [4]:
torch.cuda.is_available()

True

In [None]:
# Use the GPU when one is present; otherwise stay on the CPU so this cell
# still runs on machines without CUDA instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# .to(device) returns the tensor on the target device; the arithmetic below
# then runs on that device.
x = x.to(device)
xx = x * x

In [None]:
# NumPy only works with host memory, so bring the result back to the CPU first.
xx.cpu().numpy()

### Autograd

#### Grad setup

In [91]:
# Leaf tensor that autograd will track; gradients w.r.t. it land in x.grad.
x = torch.tensor([3.0, 1.0], dtype=torch.float32, requires_grad=True)

In [97]:
y = x + 2
y

tensor([5., 3.], grad_fn=<AddBackward>)

In [98]:
# y is 1-D with two elements while torch.rand(1, 2) is 2-D, so the sum
# broadcasts and z ends up with shape (1, 2) (see the output below).
z = y * y + torch.rand(1, 2)
z

tensor([[25.9221,  9.5707]], grad_fn=<ThAddBackward>)

In [80]:
# z was computed from x, which requires grad, so z already tracks gradients
# (its repr above carries a grad_fn); this call only makes that explicit.
z.requires_grad_(True)
z.requires_grad

True

In [99]:
out = z.mean()
out

tensor(17.7464, grad_fn=<MeanBackward1>)

In [100]:
out.backward()

In [101]:
x.grad

tensor([10.,  6.])

In [76]:
# Gradients accumulate across backward() calls, so reset them before the next
# pass. Zero the grad tensor directly; going through the legacy `.data`
# attribute is unnecessary and hides the in-place change from autograd.
x.grad.zero_()

tensor([0., 0.])

#### Join existing gradient

In [84]:
# Rebuild the small graph from scratch: x -> y -> z -> out.
x = torch.tensor([3.0, 1.0], dtype=torch.float32, requires_grad=True)
y = torch.add(x, 2)
z = torch.mul(y, y) + torch.rand(1, 2)
out = torch.mean(z)

# Seed the backward pass with an upstream gradient instead of the implicit 1;
# every entry of x.grad is scaled by this factor.
ready_gradient = torch.tensor(10.0, dtype=torch.float32)
out.backward(gradient=ready_gradient)

x.grad

tensor([50., 30.])

#### Turning off gradient

In [85]:
# Anything computed from x inherits gradient tracking...
print(x.requires_grad)
print(x.pow(2).requires_grad)

# ...unless the computation happens while autograd recording is switched off.
with torch.no_grad():
    print(x.pow(2).requires_grad)

True
True
False


### Models

In [90]:
dtype = torch.float
device = torch.device("cpu")

# N rows of D_in input features, one hidden layer of width H, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets (created in this order so the RNG draws are unchanged).
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights are leaf tensors tracked by autograd.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> clamp at zero (ReLU) -> linear.
    hidden = x.mm(w1).clamp(min=0)
    y_pred = hidden.mm(w2)

    # Sum of squared errors over every element.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Populate w1.grad and w2.grad.
    loss.backward()

    # Plain gradient-descent step. The update itself must not be recorded by
    # autograd, hence no_grad(). Operating on weight.data / weight.grad.data
    # or using torch.optim.SGD would be alternative ways to do the same thing.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Gradients accumulate, so clear them before the next iteration.
            weight.grad.zero_()

0 25875530.0
1 23477310.0
2 25139320.0
3 27555168.0
4 27564088.0
5 23420998.0
6 16451155.0
7 9805631.0
8 5341537.5
9 2906914.0
10 1697768.5
11 1101633.5
12 791485.4375
13 614261.4375
14 501464.375
15 422042.125
16 361631.96875
17 313463.40625
18 273781.5625
19 240483.578125
20 212181.75
21 187912.453125
22 166969.3125
23 148792.421875
24 132967.484375
25 119103.7265625
26 106923.9140625
27 96194.8671875
28 86710.875
29 78309.203125
30 70847.8984375
31 64203.12890625
32 58277.8046875
33 52987.87890625
34 48246.3359375
35 43989.8359375
36 40159.37890625
37 36705.375
38 33587.5390625
39 30768.173828125
40 28215.33984375
41 25899.32421875
42 23798.056640625
43 21887.525390625
44 20148.19140625
45 18563.166015625
46 17116.125
47 15794.2216796875
48 14584.9150390625
49 13477.3095703125
50 12462.15625
51 11531.015625
52 10676.0341796875
53 9890.4248046875
54 9168.173828125
55 8503.908203125
56 7891.564453125
57 7327.599609375
58 6807.47607421875
59 6327.38720703125
60 5883.56494140625
61 5473

396 0.0005013043992221355
397 0.00048699017497710884
398 0.000472259329399094
399 0.00045960803981870413
400 0.0004471045976970345
401 0.00043520593317225575
402 0.00042335345642641187
403 0.0004112966125831008
404 0.00040016122511588037
405 0.00038875493919476867
406 0.00037827156484127045
407 0.00036886133602820337
408 0.0003590162086766213
409 0.000349351204931736
410 0.0003400190907996148
411 0.00033140479354187846
412 0.00032377056777477264
413 0.0003151167184114456
414 0.0003078094741795212
415 0.00029987021116539836
416 0.00029239553259685636
417 0.0002848532749339938
418 0.000278331310255453
419 0.0002719059120863676
420 0.0002641771570779383
421 0.0002582455053925514
422 0.0002521526475902647
423 0.0002469007740728557
424 0.00024147699878085405
425 0.00023602857254445553
426 0.00023036942002363503
427 0.0002254108985653147
428 0.0002194691332988441
429 0.00021511735394597054
430 0.00021064399334136397
431 0.00020559744734782726
432 0.0002014078781940043
433 0.00019669308676384

In [None]:
# N rows of D_in input features, one hidden layer of width H, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Same two-layer network as the manual version, built from nn modules.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# reduction='sum' gives the summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Report progress only every `log_every` steps instead of flooding the
# notebook with one line per iteration.
log_every = 100
for t in range(500):
    y_pred = model(x)

    # Compute the loss and occasionally print it.
    loss = loss_fn(y_pred, y)
    if t % log_every == log_every - 1:
        print(t, loss.item())

    # Gradients accumulate across backward() calls, so clear them first.
    # The optimizer owns exactly model.parameters(), so this is equivalent to
    # model.zero_grad() and matches the other training loop in this notebook.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [5]:
import random
class DynamicNet(torch.nn.Module):
    """Fully connected network whose depth changes on every forward pass.

    The input and output layers are always applied once; the single middle
    layer is applied between 0 and 3 times per call, reusing the same weights
    each time.
    """

    def __init__(self, D_in, H, D_out):
        """Create the three linear layers: D_in -> H, H -> H and H -> D_out."""
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Run x through the net, picking the number of middle layers at random."""
        hidden = torch.clamp(self.input_linear(x), min=0)
        # Draw the number of shared middle layers for this call only.
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)


# N rows of D_in input features, hidden width H, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10
# Random inputs and targets for the toy regression problem.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Each call
    # may apply a different number of middle layers, which is why the loss
    # printed below jumps around instead of decreasing smoothly.
    y_pred = model(x)

    # Compute and print the summed squared error for this step.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 658.7985229492188
1 654.87548828125
2 651.9625854492188
3 652.5747680664062
4 650.5530395507812
5 699.179931640625
6 645.9085693359375
7 596.5341796875
8 603.4854736328125
9 640.6136474609375
10 589.0440673828125
11 638.3933715820312
12 627.1065063476562
13 624.5576782226562
14 553.2034301757812
15 634.4506225585938
16 632.9016723632812
17 630.8174438476562
18 602.0353393554688
19 594.7101440429688
20 284.49945068359375
21 470.2591857910156
22 448.33001708984375
23 547.2080688476562
24 198.239501953125
25 509.330810546875
26 568.8136596679688
27 316.98529052734375
28 428.1928405761719
29 494.2279052734375
30 359.2505798339844
31 326.2535095214844
32 295.0746765136719
33 265.4187316894531
34 307.094482421875
35 236.27230834960938
36 207.75161743164062
37 250.3546142578125
38 263.0369567871094
39 246.3397979736328
40 92.18124389648438
41 204.2117919921875
42 152.2781524658203
43 144.4793701171875
44 125.41231536865234
45 101.7720947265625
46 129.4358367919922
47 239.03396606445312
48 9

389 0.17942774295806885
390 0.785534143447876
391 1.4661474227905273
392 0.5995476841926575
393 0.20321378111839294
394 0.18399570882320404
395 0.16826759278774261
396 0.5119891166687012
397 0.4378575384616852
398 0.33476343750953674
399 0.2180519998073578
400 0.3213740289211273
401 0.27723294496536255
402 0.21260696649551392
403 0.27224284410476685
404 0.2927769720554352
405 0.45235589146614075
406 0.2761409282684326
407 0.18918414413928986
408 0.3462660014629364
409 0.18316973745822906
410 0.35173335671424866
411 0.1747814565896988
412 0.13188958168029785
413 0.13506630063056946
414 0.19955064356327057
415 0.18865376710891724
416 1.8408479690551758
417 1.4305853843688965
418 1.320589542388916
419 1.2433329820632935
420 0.42504656314849854
421 0.6870096325874329
422 0.9327274560928345
423 0.27909931540489197
424 0.6972293257713318
425 0.5941821336746216
426 0.271371066570282
427 0.4581674039363861
428 0.5121147632598877
429 0.6815093755722046
430 0.3699913024902344
431 0.5696294903755

In [12]:
# named_parameters() yields (name, Parameter) pairs, so every element of this
# list is a 2-tuple: index [0] is the dotted name, index [1] is the tensor.
params = list(model.named_parameters())
params

[('input_linear.weight', Parameter containing:
  tensor([[-0.0125, -0.0227, -0.0143,  ...,  0.0092, -0.0204,  0.0069],
          [-0.0307,  0.0067, -0.0283,  ..., -0.0128,  0.0063, -0.0012],
          [ 0.0343,  0.0150, -0.0090,  ..., -0.0149, -0.0342,  0.0276],
          ...,
          [ 0.0265,  0.0033,  0.0405,  ...,  0.0130,  0.0005,  0.0003],
          [-0.0046, -0.0193, -0.0104,  ...,  0.0070, -0.0235, -0.0180],
          [ 0.0223,  0.0050,  0.0194,  ...,  0.0280,  0.0031,  0.0013]],
         requires_grad=True)), ('input_linear.bias', Parameter containing:
  tensor([-0.0157, -0.0241,  0.0133,  0.0260, -0.0204, -0.0091, -0.0268, -0.0352,
          -0.0032,  0.0019,  0.0213,  0.0101,  0.0306, -0.0148, -0.0394, -0.0184,
          -0.0396,  0.0113, -0.0358, -0.0210, -0.0276,  0.0173,  0.0182, -0.0173,
          -0.0056, -0.0255,  0.0114,  0.0161,  0.0019, -0.0322, -0.0153,  0.0058,
           0.0019,  0.0182, -0.0124,  0.0025,  0.0041, -0.0167, -0.0139, -0.0068,
           0.0114, -

In [10]:
# Each entry of params is a (name, Parameter) tuple, which has no `.name`
# attribute; the parameter's name is the first element of the tuple.
params[0][0]