In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 24419108.7871
1 19497043.5842
2 18270594.6538
3 18196855.215
4 17752567.7322
5 16043078.4853
6 13089881.2161
7 9637286.75584
8 6541819.17713
9 4223417.7121
10 2688458.62631
11 1737696.91694
12 1164184.02235
13 817697.451636
14 603388.950312
15 465706.038674
16 373000.685244
17 307435.684512
18 258884.723184
19 221447.805732
20 191532.200915
21 167022.666646
22 146613.055052
23 129355.11752
24 114604.643385
25 101883.914833
26 90847.9281811
27 81217.1436222
28 72782.5274722
29 65375.8153231
30 58850.1027037
31 53084.9090465
32 47971.0405415
33 43417.9343829
34 39357.6029503
35 35732.7443327
36 32486.8999716
37 29575.3749844
38 26958.5026664
39 24602.5262734
40 22478.088388
41 20560.4497126
42 18825.3087184
43 17254.5011135
44 15829.9118384
45 14537.6002055
46 13362.8431367
47 12292.6431424
48 11317.5013757
49 10428.3057209
50 9615.945954
51 8873.54673564
52 8194.10718449
53 7571.46292859
54 7000.83269691
55 6476.92573811
56 5995.59636843
57 5553.18307065
58 5146.31804738
59 4771.85944

460 1.69068929356e-06
461 1.61063330727e-06
462 1.53438976325e-06
463 1.46178248939e-06
464 1.39263298563e-06
465 1.32679493988e-06
466 1.26407989841e-06
467 1.20434912533e-06
468 1.14746088901e-06
469 1.09328109608e-06
470 1.04168121221e-06
471 9.92531808342e-07
472 9.45722762245e-07
473 9.01133511986e-07
474 8.58664313468e-07
475 8.182107487e-07
476 7.79675023617e-07
477 7.42968832254e-07
478 7.08000575116e-07
479 6.74688408264e-07
480 6.4295578982e-07
481 6.12725459587e-07
482 5.83930126905e-07
483 5.56496761554e-07
484 5.30366310388e-07
485 5.05469199683e-07
486 4.8174581887e-07
487 4.59142963866e-07
488 4.37607503736e-07
489 4.17090749767e-07
490 3.97540895952e-07
491 3.78915113881e-07
492 3.61166510821e-07
493 3.44259681797e-07
494 3.2814968421e-07
495 3.1279649885e-07
496 2.98165127253e-07
497 2.84223226279e-07
498 2.7093742274e-07
499 2.58278436984e-07


In [2]:
# -*- coding: utf-8 -*-

import torch


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 37575160.69284865
1 41176332.808640584
2 46879525.87228203
3 44468617.149181716
4 30733771.331126094
5 15234787.033177197
6 6387448.185360312
7 2984697.826572947
8 1792787.3040200286
9 1306076.860323403
10 1044929.5575709369
11 869389.4046270618
12 736530.3218035296
13 630323.7661659063
14 543183.5198928714
15 470728.845975504
16 409964.6930851148
17 358646.4682919767
18 315098.9782443922
19 277847.8200891984
20 245826.08379245282
21 218155.55080228858
22 194150.0712610688
23 173227.3094241948
24 154916.08206074702
25 138830.93586424546
26 124676.87457031652
27 112186.41356887325
28 101123.72751529072
29 91311.43870130321
30 82573.83149181238
31 74783.08293196838
32 67820.53283353458
33 61582.19768251656
34 55984.48836814263
35 50959.98684128754
36 46439.82010069565
37 42366.37563658973
38 38689.97409420411
39 35364.89336883431
40 32353.65631621424
41 29625.327915935668
42 27156.62186428209
43 24915.462459727758
44 22877.067707689937
45 21020.742039572207
46 19329.20210007159
47 1778

362 0.0009512282082063117
363 0.0009172216191261212
364 0.0008872537820670684
365 0.0008581109760344013
366 0.0008292768616801338
367 0.0008016702878403947
368 0.0007741539477247039
369 0.0007487170950744304
370 0.0007242702347858287
371 0.0006997093728461357
372 0.0006778528819761931
373 0.0006574634893667108
374 0.0006362229841320455
375 0.0006160458900507093
376 0.0005968024156627477
377 0.0005787905063507487
378 0.0005610445309159884
379 0.0005436603766743042
380 0.0005270443868407582
381 0.0005108281039649804
382 0.0004964700791013216
383 0.00048165903627257756
384 0.00046740782363055566
385 0.000454093910330966
386 0.0004414874677054481
387 0.0004286375009909388
388 0.00041521515918302443
389 0.0004044152573840637
390 0.0003927724686352829
391 0.00038142512045877264
392 0.0003717181692210747
393 0.00036147180398021783
394 0.000351303434405148
395 0.00034201394373906124
396 0.00033327873589990287
397 0.00032397515278177746
398 0.00031604827652993717
399 0.0003076619152705129
400 0

In [3]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Variables.
    # Now loss is a Variable of shape (1,) and loss.data is a Tensor of shape
    # (1,); loss.data[0] is a scalar value holding the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Variables with requires_grad=True.
    # After this call w1.grad and w2.grad will be Variables holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent; w1.data and w2.data are Tensors,
    # w1.grad and w2.grad are Variables and w1.grad.data and w2.grad.data are
    # Tensors.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 31903012.0
1 29630278.0
2 30323632.0
3 29057076.0
4 24013580.0
5 16397935.0
6 9651351.0
7 5256356.0
8 2923620.0
9 1768240.125
10 1192139.5
11 882315.25
12 696795.9375
13 572600.4375
14 481731.40625
15 411236.125
16 354588.21875
17 307876.46875
18 268810.0
19 235806.9375
20 207690.90625
21 183580.671875
22 162808.234375
23 144821.140625
24 129191.984375
25 115549.5625
26 103600.890625
27 93108.8203125
28 83857.703125
29 75691.8828125
30 68438.15625
31 61998.1015625
32 56260.02734375
33 51134.0078125
34 46547.68359375
35 42432.98828125
36 38735.1484375
37 35406.5234375
38 32404.890625
39 29693.724609375
40 27240.896484375
41 25018.912109375
42 23002.4765625
43 21171.86328125
44 19506.44921875
45 17988.859375
46 16604.37890625
47 15339.900390625
48 14183.3310546875
49 13125.384765625
50 12155.2275390625
51 11268.548828125
52 10456.4755859375
53 9709.51953125
54 9021.8642578125
55 8388.2626953125
56 7803.857421875
57 7264.5341796875
58 6766.30712890625
59 6305.7236328125
60 5880.38476562

406 0.0002488697646185756
407 0.00024161530018318444
408 0.00023469024745281786
409 0.00022857695876155049
410 0.00022239843383431435
411 0.000216619111597538
412 0.0002105733728967607
413 0.00020500377286225557
414 0.00019961883663199842
415 0.00019398833683226258
416 0.0001888374681584537
417 0.00018403874128125608
418 0.0001798836310626939
419 0.0001754351396812126
420 0.00017101156117860228
421 0.00016642465197946876
422 0.00016184923879336566
423 0.00015831271593924612
424 0.00015487062046304345
425 0.00015096206334419549
426 0.00014727350207976997
427 0.00014411458687391132
428 0.0001403062924509868
429 0.0001373151026200503
430 0.0001341283059446141
431 0.00013101052900310606
432 0.00012813946523237973
433 0.00012486062769312412
434 0.00012229967978782952
435 0.00011969201295869425
436 0.0001171688491012901
437 0.000114455382572487
438 0.0001120874730986543
439 0.00010996368655469269
440 0.00010701724386308342
441 0.00010472511348780245
442 0.00010244622535537928
443 0.000100614

In [4]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    def forward(self, input):
        """
        In the forward pass we receive a Tensor containing the input and return a
        Tensor containing the output. You can cache arbitrary Tensors for use in the
        backward pass using the save_for_backward method.
        """
        self.save_for_backward(input)
        return input.clamp(min=0)

    def backward(self, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = self.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Construct an instance of our MyReLU class to use in our network
    relu = MyReLU()

    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 31540544.0
1 30696770.0
2 33331906.0
3 34058848.0
4 29165704.0
5 19959290.0
6 11161026.0
7 5672261.0
8 2970676.75
9 1760928.625
10 1201874.75
11 912455.9375
12 737995.875
13 617410.9375
14 526164.4375
15 453256.375
16 393243.875
17 342992.28125
18 300420.0625
19 264104.25
20 232934.1875
21 206082.78125
22 182870.625
23 162692.453125
24 145075.671875
25 129648.859375
26 116102.7734375
27 104177.5234375
28 93650.4140625
29 84322.6796875
30 76052.375
31 68689.2890625
32 62131.2890625
33 56281.3046875
34 51048.65625
35 46357.5859375
36 42149.05859375
37 38366.31640625
38 34959.640625
39 31889.5
40 29116.51171875
41 26611.779296875
42 24345.087890625
43 22293.361328125
44 20431.921875
45 18742.0234375
46 17206.109375
47 15808.142578125
48 14535.5400390625
49 13376.599609375
50 12318.923828125
51 11354.68359375
52 10471.8486328125
53 9664.458984375
54 8924.58984375
55 8246.9404296875
56 7625.84130859375
57 7055.53369140625
58 6531.8466796875
59 6049.85693359375
60 5606.38623046875
61 5198.

410 0.00016163912368938327
411 0.00015799392713233829
412 0.0001548701839055866
413 0.00015070704102981836
414 0.00014738747267983854
415 0.0001447433460270986
416 0.00014143908629193902
417 0.00013862128253094852
418 0.0001353378320345655
419 0.00013251476048026234
420 0.00012976315338164568
421 0.00012720286031253636
422 0.00012437145051080734
423 0.00012229791900608689
424 0.00011943380377488211
425 0.00011756971071008593
426 0.00011502600682433695
427 0.00011273130803601816
428 0.00011065127182519063
429 0.00010841819312190637
430 0.00010628215386532247
431 0.0001043293159455061
432 0.00010198822565143928
433 0.00010090007708640769
434 9.880557627184317e-05
435 9.67182859312743e-05
436 9.512882388662547e-05
437 9.336975926999003e-05
438 9.141334157902747e-05
439 9.010422945721075e-05
440 8.850725862430409e-05
441 8.686359797138721e-05
442 8.506963058607653e-05
443 8.365562825929374e-05
444 8.220586460083723e-05
445 8.077902748482302e-05
446 7.920174539322034e-05
447 7.8254575782921

In [6]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Variables for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Variable of input data to the Module and it produces
    # a Variable of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Variables containing the predicted and true
    # values of y, and the loss function returns a Variable containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Variables with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Variable, so
    # we can access its data and gradients like we did before.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 689.3182373046875
1 638.02392578125
2 593.3311767578125
3 553.7654418945312
4 518.1622924804688
5 486.12640380859375
6 457.1466064453125
7 430.4496765136719
8 405.54461669921875
9 382.3022766113281
10 360.593505859375
11 340.2492370605469
12 321.18817138671875
13 303.40960693359375
14 286.6517333984375
15 270.75543212890625
16 255.64219665527344
17 241.31967163085938
18 227.74810791015625
19 214.85931396484375
20 202.64576721191406
21 191.02023315429688
22 180.01644897460938
23 169.5679168701172
24 159.65106201171875
25 150.2464141845703
26 141.333251953125
27 132.89772033691406
28 124.92241668701172
29 117.37234497070312
30 110.19620513916016
31 103.41656494140625
32 97.02875518798828
33 91.00948333740234
34 85.34481811523438
35 80.01917266845703
36 75.0257568359375
37 70.33831787109375
38 65.94003295898438
39 61.80942153930664
40 57.93907928466797
41 54.3062629699707
42 50.89249038696289
43 47.697547912597656
44 44.70981979370117
45 41.90272521972656
46 39.27647399902344
47 36.8173

365 0.00018537745927460492
366 0.00018038587586488575
367 0.0001755331177264452
368 0.00017082654812838882
369 0.00016624550335109234
370 0.00016179440717678517
371 0.0001574713533045724
372 0.00015326398715842515
373 0.00014917981752660125
374 0.0001452093420084566
375 0.00014134762750472873
376 0.0001375911815557629
377 0.0001339427981292829
378 0.00013039169425610453
379 0.00012694310862571
380 0.00012358791718725115
381 0.00012032391532557085
382 0.00011715470463968813
383 0.00011406895646359771
384 0.00011107153113698587
385 0.0001081541195162572
386 0.00010531736916163936
387 0.00010256120003759861
388 9.987592056859285e-05
389 9.727147698868066e-05
390 9.473005047766492e-05
391 9.226287511410192e-05
392 8.986131433630362e-05
393 8.752451685722917e-05
394 8.525430166628212e-05
395 8.304322545882314e-05
396 8.089774928521365e-05
397 7.880342309363186e-05
398 7.676869427086785e-05
399 7.478820771211758e-05
400 7.286084291990846e-05
401 7.098599598975852e-05
402 6.916296842973679e-0

In [7]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(size_average=False)

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Variables it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.data[0])

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable weights
    # of the model)
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 664.4680786132812
1 647.805419921875
2 631.7362670898438
3 616.1707763671875
4 601.0084838867188
5 586.29248046875
6 571.958984375
7 558.0211181640625
8 544.4584350585938
9 531.28076171875
10 518.4517211914062
11 505.9383850097656
12 493.722900390625
13 481.8951110839844
14 470.376708984375
15 459.1427917480469
16 448.1701965332031
17 437.537841796875
18 427.1794738769531
19 417.0557861328125
20 407.24517822265625
21 397.6942138671875
22 388.3934631347656
23 379.34527587890625
24 370.51824951171875
25 361.8854064941406
26 353.4512939453125
27 345.2228088378906
28 337.15411376953125
29 329.2489318847656
30 321.524169921875
31 314.0033874511719
32 306.6524658203125
33 299.46490478515625
34 292.42694091796875
35 285.4956359863281
36 278.7000732421875
37 272.064453125
38 265.5989074707031
39 259.2781066894531
40 253.10488891601562
41 247.07418823242188
42 241.15631103515625
43 235.37506103515625
44 229.69004821777344
45 224.1064910888672
46 218.6099853515625
47 213.19754028320312
48 207.

369 0.001164036919362843
370 0.0011172490194439888
371 0.0010722419247031212
372 0.0010289272759109735
373 0.0009872530354186893
374 0.0009471909143030643
375 0.0009086282807402313
376 0.0008715667645446956
377 0.0008359146304428577
378 0.0008016401552595198
379 0.0007686926401220262
380 0.0007370128878392279
381 0.0007065685349516571
382 0.0006773151690140367
383 0.000649201450869441
384 0.0006221838411875069
385 0.000596238358411938
386 0.0005713026621378958
387 0.0005473548080772161
388 0.0005243605119176209
389 0.0005022753030061722
390 0.000481066555948928
391 0.0004607111040968448
392 0.00044116657227277756
393 0.0004224077274557203
394 0.00040440092561766505
395 0.00038712628884240985
396 0.00037054804852232337
397 0.00035463954554870725
398 0.00033938480191864073
399 0.0003247512795496732
400 0.0003107113006990403
401 0.0002972523507196456
402 0.0002843401161953807
403 0.000271965836873278
404 0.0002600968291517347
405 0.00024872770882211626
406 0.00023782353673595935
407 0.000

In [8]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        In the forward function we accept a Variable of input data and we must return
        a Variable of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Variables.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.data[0])

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 688.2852172851562
1 638.2798461914062
2 595.4622802734375
3 558.5010986328125
4 525.97607421875
5 496.7709045410156
6 469.994873046875
7 445.47601318359375
8 422.8056945800781
9 401.6919250488281
10 381.77667236328125
11 362.8035888671875
12 344.6605529785156
13 327.4183654785156
14 310.8876037597656
15 295.17559814453125
16 280.1298522949219
17 265.70916748046875
18 251.86476135253906
19 238.55288696289062
20 225.8187255859375
21 213.58966064453125
22 201.9269256591797
23 190.80712890625
24 180.2139434814453
25 170.1385040283203
26 160.56460571289062
27 151.45753479003906
28 142.80992126464844
29 134.5947265625
30 126.8155517578125
31 119.46200561523438
32 112.50730895996094
33 105.94361877441406
34 99.74138641357422
35 93.87779235839844
36 88.35882568359375
37 83.16580200195312
38 78.26249694824219
39 73.6474609375
40 69.30593872070312
41 65.23048400878906
42 61.40176010131836
43 57.81111145019531
44 54.44020080566406
45 51.27296447753906
46 48.291629791259766
47 45.49197006225586


355 0.0007327256607823074
356 0.0007119267829693854
357 0.000691689201630652
358 0.0006720258970744908
359 0.0006529496167786419
360 0.0006344093126244843
361 0.0006164147052913904
362 0.0005989289493300021
363 0.0005819257930852473
364 0.0005654048873111606
365 0.0005493668140843511
366 0.0005337801412679255
367 0.0005186482449062169
368 0.0005039435927756131
369 0.0004896674654446542
370 0.0004757866554427892
371 0.0004623030836228281
372 0.0004492016742005944
373 0.0004364759079180658
374 0.0004241107380948961
375 0.0004121129750274122
376 0.0004004354414064437
377 0.00038909580325707793
378 0.0003780801489483565
379 0.0003673890314530581
380 0.00035699355066753924
381 0.0003468998475000262
382 0.0003370832419022918
383 0.0003275632916484028
384 0.0003182901709806174
385 0.0003092909755650908
386 0.0003005412872880697
387 0.000292052369331941
388 0.0002838002110365778
389 0.00027578038861975074
390 0.0002679858007468283
391 0.0002604170877020806
392 0.00025306170573458076
393 0.0002

In [9]:
# -*- coding: utf-8 -*-
import random
import torch
from torch.autograd import Variable


class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.
        """
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from Lua
        Torch, where each Module could be used only once.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.data[0])

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 652.5556640625
1 647.4774780273438
2 644.8255615234375
3 640.8162231445312
4 641.3463134765625
5 611.7316284179688
6 683.1651611328125
7 589.841552734375
8 622.74072265625
9 566.6224975585938
10 553.787353515625
11 617.32666015625
12 526.394775390625
13 494.2415771484375
14 498.7276306152344
15 626.13818359375
16 402.4190979003906
17 454.8075866699219
18 606.4920043945312
19 288.094482421875
20 620.8954467773438
21 210.8287811279297
22 583.0838623046875
23 611.5741577148438
24 123.36203002929688
25 325.6608581542969
26 531.875
27 508.4171142578125
28 88.98529052734375
29 83.87355041503906
30 229.34132385253906
31 210.65560913085938
32 374.5656433105469
33 72.56144714355469
34 444.0881652832031
35 67.40788269042969
36 57.10730743408203
37 242.02957153320312
38 132.7821502685547
39 201.78912353515625
40 107.70003509521484
41 252.30601501464844
42 217.8782958984375
43 85.45332336425781
44 79.04580688476562
45 182.4626007080078
46 171.54551696777344
47 95.66666412353516
48 139.8961181640

375 0.6635239720344543
376 0.5894345641136169
377 0.586792528629303
378 0.5839008688926697
379 0.13044673204421997
380 0.4893609285354614
381 0.41751864552497864
382 0.6221040487289429
383 0.6810686588287354
384 0.6565805673599243
385 0.13925859332084656
386 0.34548354148864746
387 0.3276978135108948
388 0.10707710683345795
389 0.0903167799115181
390 0.5218789577484131
391 0.7399966716766357
392 0.2680879235267639
393 0.29644274711608887
394 0.1425321102142334
395 0.2364768385887146
396 0.07660381495952606
397 0.5711069107055664
398 0.7266483902931213
399 0.10363893210887909
400 0.46980008482933044
401 0.10784044861793518
402 0.7883647084236145
403 0.7186522483825684
404 0.045705169439315796
405 0.04827046021819115
406 0.3627696931362152
407 0.6260862350463867
408 0.5257791876792908
409 0.5358779430389404
410 0.5755000114440918
411 0.605871856212616
412 0.43347465991973877
413 0.4216812252998352
414 0.5291188955307007
415 0.11921969056129456
416 0.5392459630966187
417 0.434529393911361