In [14]:
# -*- coding: utf-8 -*-
"""
PyTorch: Control Flow + Weight Sharing
--------------------------------------

To showcase the power of PyTorch dynamic graphs, we will implement a very strange
model: a fully-connected ReLU network that on each forward pass randomly chooses
a number between 1 and 4 and has that many hidden layers, reusing the same
weights multiple times to compute the innermost hidden layers.
"""
import random
import torch


class DynamicNet(torch.nn.Module):
    """
    Fully-connected ReLU network whose depth is re-drawn on every forward pass.

    An input Linear layer maps D_in features to H hidden units, one shared
    H-to-H Linear layer is applied between 0 and 3 times, and an output Linear
    layer maps the hidden representation to D_out features.
    """

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear layers that forward() draws on: input,
        (shared) middle and output.
        """
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run x through the input layer, then through the shared middle layer a
        random number of times (0, 1, 2 or 3, drawn with random.randint), and
        finally through the output layer. Every hidden activation is clamped
        at zero from below, i.e. passed through a ReLU.

        The computation graph is rebuilt on each call, so ordinary Python
        control flow is enough to vary the depth, and the same middle_linear
        Module (hence the same weights) can appear several times in one graph.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)

        # Number of extra passes through the shared middle layer for this call.
        repeats = random.randint(0, 3)
        while repeats > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            repeats -= 1

        return self.output_linear(hidden)


# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch and random regression targets.
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

# Instantiate the dynamic-depth network defined above.
model = DynamicNet(D_in, H, D_out)

# Mean-squared-error loss minimised with SGD. Plain SGD struggles with this
# odd model, hence the momentum term.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Discard the gradients accumulated during the previous iteration.
    optimizer.zero_grad()

    # Forward pass and loss; report the loss for this iteration.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Backward pass followed by the parameter update.
    loss.backward()
    optimizer.step()

0 1.0542627573013306
1 1.1173288822174072
2 1.1172029972076416
3 1.0542488098144531
4 1.0542404651641846
5 1.1165510416030884
6 1.059468150138855
7 1.1159738302230835
8 1.0541918277740479
9 1.0549159049987793
10 1.0549092292785645
11 1.054147481918335
12 1.0541293621063232
13 1.1142168045043945
14 1.0548746585845947
15 1.0548667907714844
16 1.0591609477996826
17 1.1130893230438232
18 1.1127442121505737
19 1.0540049076080322
20 1.0589834451675415
21 1.0539674758911133
22 1.058863878250122
23 1.1109473705291748
24 1.053905725479126
25 1.0538841485977173
26 1.1098965406417847
27 1.1094889640808105
28 1.1089974641799927
29 1.1084315776824951
30 1.0537811517715454
31 1.0537614822387695
32 1.1067116260528564
33 1.054693341255188
34 1.1055989265441895
35 1.104999303817749
36 1.1043379306793213
37 1.053651213645935
38 1.05464506149292
39 1.0536162853240967
40 1.0535966157913208
41 1.0535728931427002
42 1.0535463094711304
43 1.1005899906158447
44 1.1001250743865967
45 1.0995829105377197
46 1.05

408 1.033905267715454
409 1.051121711730957
410 1.033759355545044
411 0.9974982142448425
412 1.0335981845855713
413 0.9970890879631042
414 1.0469236373901367
415 1.0510724782943726
416 1.0510646104812622
417 0.9961392283439636
418 0.9958550333976746
419 1.0510386228561401
420 1.0330556631088257
421 1.0468271970748901
422 0.99462890625
423 1.0510058403015137
424 1.0467852354049683
425 1.050988793373108
426 0.9934751391410828
427 0.9931535720825195
428 1.032656192779541
429 1.032601237297058
430 1.0325300693511963
431 1.0324461460113525
432 1.0323492288589478
433 1.0466644763946533
434 1.0321414470672607
435 1.0466315746307373
436 0.9907413721084595
437 1.0508993864059448
438 1.0465786457061768
439 1.0465599298477173
440 1.0316081047058105
441 1.0465176105499268
442 0.989547848701477
443 1.0313751697540283
444 1.0464532375335693
445 1.0508391857147217
446 1.0464091300964355
447 1.050823450088501
448 1.046363353729248
449 1.0463402271270752
450 1.0309112071990967
451 1.0462881326675415
45

In [11]:
# -*- coding: utf-8 -*-
"""
PyTorch: nn
-----------
A fully-connected ReLU network with one hidden layer, trained to predict y from x
by minimizing squared Euclidean distance.
This implementation uses the nn package from PyTorch to build the network.
PyTorch autograd makes it easy to define computational graphs and take gradients,
but raw autograd can be a bit too low-level for defining complex neural networks;
this is where the nn package can help. The nn package defines a set of Modules,
each of which you can think of as a neural network layer that produces output
from input and may have some trainable weights.
"""
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch and random regression targets.
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

# Describe the network as an ordered chain of layers. nn.Sequential is itself a
# Module: it owns the Modules passed to it and feeds its input through them one
# after another. Each Linear Module applies a linear function to its input and
# keeps its weight and bias as internal Tensors.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H), torch.nn.ReLU(), torch.nn.Linear(H, D_out)
)

# Ready-made loss functions also live in the nn package; here we pick
# Mean Squared Error (MSE).
loss_fn = torch.nn.MSELoss()

learning_rate = 1e-4
for t in range(1000):
    # Clear the gradients of every parameter before the next backward pass.
    model.zero_grad()

    # Forward pass. Modules implement __call__, so the model is invoked like a
    # function: a Tensor of inputs goes in, a Tensor of predictions comes out.
    y_pred = model(x)

    # The loss function takes the predicted and the true values of y and
    # returns a Tensor holding the loss, which we print.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Backward pass. Module parameters are Tensors with requires_grad=True, so
    # this fills in the gradient of the loss for each learnable parameter.
    loss.backward()

    # Plain gradient-descent step, applied in place to each parameter Tensor.
    # no_grad keeps the update itself out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 1.0897797346115112
1 1.0896512269973755
2 1.0895212888717651
3 1.0893927812576294
4 1.0892634391784668
5 1.0891343355178833
6 1.0890061855316162
7 1.088876485824585
8 1.0887473821640015
9 1.0886183977127075
10 1.0884896516799927
11 1.0883605480194092
12 1.0882306098937988
13 1.0881025791168213
14 1.0879738330841064
15 1.0878448486328125
16 1.087716817855835
17 1.0875866413116455
18 1.0874578952789307
19 1.0873301029205322
20 1.0872013568878174
21 1.087072491645813
22 1.086943507194519
23 1.086814522743225
24 1.0866872072219849
25 1.086557388305664
26 1.0864289999008179
27 1.0863006114959717
28 1.0861722230911255
29 1.086043119430542
30 1.0859140157699585
31 1.0857863426208496
32 1.085658311843872
33 1.0855296850204468
34 1.0854014158248901
35 1.0852726697921753
36 1.0851435661315918
37 1.0850155353546143
38 1.0848878622055054
39 1.0847598314285278
40 1.0846306085586548
41 1.0845028162002563
42 1.0843745470046997
43 1.0842459201812744
44 1.0841176509857178
45 1.083990216255188
46 1.08

495 1.0293488502502441
496 1.029233455657959
497 1.0291180610656738
498 1.0290015935897827
499 1.028886318206787
500 1.0287705659866333
501 1.0286556482315063
502 1.0285392999649048
503 1.0284249782562256
504 1.028308629989624
505 1.028193473815918
506 1.0280777215957642
507 1.027963399887085
508 1.0278480052947998
509 1.0277317762374878
510 1.0276155471801758
511 1.0275018215179443
512 1.0273867845535278
513 1.0272715091705322
514 1.0271564722061157
515 1.0270413160324097
516 1.02692711353302
517 1.0268118381500244
518 1.026698112487793
519 1.026583194732666
520 1.0264688730239868
521 1.0263532400131226
522 1.0262377262115479
523 1.0261242389678955
524 1.0260095596313477
525 1.0258947610855103
526 1.0257799625396729
527 1.025665283203125
528 1.0255510807037354
529 1.0254361629486084
530 1.025321364402771
531 1.025207281112671
532 1.0250920057296753
533 1.0249778032302856
534 1.024864673614502
535 1.024749755859375
536 1.024634838104248
537 1.0245206356048584
538 1.0244065523147583
539

998 0.974368691444397
999 0.9742655754089355


In [13]:
# -*- coding: utf-8 -*-
"""
PyTorch: Custom nn Modules
--------------------------
A fully-connected ReLU network with one hidden layer, trained to predict y from x
by minimizing squared Euclidean distance.
This implementation defines the model as a custom Module subclass. Whenever you
want a model more complex than a simple sequence of existing Modules you will
need to define your model this way.
"""
import torch


class TwoLayerNet(torch.nn.Module):
    """
    Fully-connected network with a single ReLU hidden layer: a D_in-to-H Linear
    layer followed by an H-to-D_out Linear layer.
    """

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear modules and store them as attributes so that
        their parameters are registered with this Module.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of inputs to a Tensor of predictions. The Modules created
        in the constructor are combined with a plain Tensor operation: the
        hidden pre-activation is clamped at zero from below (a ReLU).
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch and random regression targets.
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

# Instantiate the two-layer network defined above.
model = TwoLayerNet(D_in, H, D_out)

# Mean-squared-error loss minimised with SGD. model.parameters() hands the
# optimizer the learnable parameters of the two nn.Linear modules held by the
# model.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Discard the gradients accumulated during the previous iteration.
    optimizer.zero_grad()

    # Forward pass and loss; report the loss for this iteration.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Backward pass followed by the parameter update.
    loss.backward()
    optimizer.step()

0 1.1857242584228516
1 1.1855802536010742
2 1.1854356527328491
3 1.1852918863296509
4 1.1851478815078735
5 1.1850032806396484
6 1.1848595142364502
7 1.1847151517868042
8 1.1845701932907104
9 1.1844279766082764
10 1.1842836141586304
11 1.1841398477554321
12 1.1839959621429443
13 1.1838518381118774
14 1.1837084293365479
15 1.1835647821426392
16 1.1834213733673096
17 1.183276653289795
18 1.1831339597702026
19 1.1829900741577148
20 1.1828453540802002
21 1.1827023029327393
22 1.1825588941574097
23 1.1824147701263428
24 1.1822717189788818
25 1.1821279525756836
26 1.1819837093353271
27 1.1818407773971558
28 1.1816976070404053
29 1.1815539598464966
30 1.1814110279083252
31 1.1812670230865479
32 1.1811234951019287
33 1.1809803247451782
34 1.1808370351791382
35 1.1806944608688354
36 1.180551290512085
37 1.180408239364624
38 1.180265188217163
39 1.1801230907440186
40 1.1799801588058472
41 1.1798373460769653
42 1.1796939373016357
43 1.1795514822006226
44 1.1794092655181885
45 1.1792676448822021
46

392 1.131608247756958
393 1.1314761638641357
394 1.1313436031341553
395 1.1312109231948853
396 1.1310783624649048
397 1.1309454441070557
398 1.1308141946792603
399 1.1306818723678589
400 1.1305502653121948
401 1.130418062210083
402 1.1302862167358398
403 1.1301532983779907
404 1.1300208568572998
405 1.1298894882202148
406 1.1297566890716553
407 1.1296249628067017
408 1.129492998123169
409 1.1293607950210571
410 1.1292284727096558
411 1.12909734249115
412 1.128965139389038
413 1.1288336515426636
414 1.1287015676498413
415 1.1285693645477295
416 1.1284376382827759
417 1.1283057928085327
418 1.128174066543579
419 1.1280428171157837
420 1.1279100179672241
421 1.1277782917022705
422 1.127646803855896
423 1.1275147199630737
424 1.1273831129074097
425 1.1272515058517456
426 1.1271204948425293
427 1.1269880533218384
428 1.1268575191497803
429 1.1267249584197998
430 1.126593828201294
431 1.1264623403549194
432 1.126331090927124
433 1.1261999607086182
434 1.1260684728622437
435 1.125937581062317

In [16]:
# Show the dimensions of the input batch `x`. NOTE(review): `x` is not defined
# in this cell; it is whatever an earlier-executed cell left in the kernel.
x.shape

torch.Size([64, 1000])