## Numpy nn

In [1]:
import numpy as np

In [5]:
batch_size=64
input_dim = 1000
hidden_dim = 100
output_dim = 10
learning_rate = 1e-6

In [4]:
# Create random input and output data
x = np.random.randn(batch_size, input_dim)
y = np.random.randn(batch_size, output_dim)

In [7]:
# Randomly initialize weights
w1 = np.random.randn(input_dim, hidden_dim)
w2 = np.random.randn(hidden_dim, output_dim)


In [8]:
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 28631161.21938608
1 22197727.82012119
2 20192763.563169774
3 19187981.26965983
4 17554210.24439212
5 14748186.636293663
6 11230257.246386005
7 7829872.339223415
8 5152218.205506105
9 3308986.183355677
10 2144675.172159327
11 1436973.1468359975
12 1010114.2485635405
13 747026.6831791116
14 578778.5944643455
15 465802.7129901863
16 385793.0393950048
17 326246.04179616325
18 280048.14954772696
19 243025.41813849634
20 212568.48040564518
21 187035.72733555498
22 165329.0319730537
23 146702.2618000725
24 130585.82845633746
25 116575.34921696049
26 104333.09381865585
27 93593.22164052192
28 84140.99539213278
29 75790.50900258841
30 68392.72720850859
31 61823.47610282271
32 55977.129021195404
33 50757.94227119356
34 46093.82676883243
35 41915.16693635116
36 38166.890111536864
37 34798.659097202464
38 31764.647221196235
39 29028.62841096585
40 26557.07328388015
41 24322.8529256001
42 22298.8767639067
43 20462.906736435034
44 18795.054957392145
45 17278.26851254759
46 15897.65100267353
47 146

392 0.0002628998103709743
393 0.0002512413923426621
394 0.00024009797943354315
395 0.0002294514549496251
396 0.00021927648048442766
397 0.0002095558364681318
398 0.00020026785347228803
399 0.000191390890436886
400 0.00018290950431818745
401 0.00017480544006057036
402 0.00016706115504339162
403 0.00015966771096368024
404 0.00015259704610804
405 0.00014583939760706005
406 0.00013938300127723535
407 0.00013321367499567776
408 0.00012731707587665826
409 0.00012168194438500295
410 0.00011629853057118935
411 0.0001111542278083523
412 0.0001062366350271676
413 0.00010154021821265096
414 9.705199151588005e-05
415 9.27602937302162e-05
416 8.865884900189486e-05
417 8.474007440972725e-05
418 8.099523224957868e-05
419 7.741586163617233e-05
420 7.399513935840147e-05
421 7.072628772529678e-05
422 6.76022662954578e-05
423 6.461796423507771e-05
424 6.176505330909417e-05
425 5.9037547424988895e-05
426 5.64310933914741e-05
427 5.394043675771356e-05
428 5.15595494771776e-05
429 4.928382656687524e-05
430 

## PyTorch examples

In [11]:

import torch
import torch.nn as nn

In [12]:
batch_size=64
input_dim = 1000
hidden_dim = 100
output_dim = 10
learning_rate = 1e-6

In [13]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Create random input and output data
x = torch.randn(batch_size, input_dim, device=device, dtype=dtype)
y = torch.randn(batch_size, output_dim, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(input_dim, hidden_dim, device=device, dtype=dtype)
w2 = torch.randn(hidden_dim, output_dim, device=device, dtype=dtype)


In [14]:
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 45143464.0
1 51202024.0
2 54903964.0
3 43292300.0
4 23078112.0
5 9023942.0
6 3736988.75
7 2094830.875
8 1493435.375
9 1180273.375
10 969816.125
11 810129.25
12 683477.3125
13 581088.5625
14 497528.5
15 428670.5625
16 371358.8125
17 323345.96875
18 282833.125
19 248471.84375
20 219128.9375
21 193936.15625
22 172217.21875
23 153408.78125
24 137048.15625
25 122755.859375
26 110227.703125
27 99205.203125
28 89474.078125
29 80868.890625
30 73224.3125
31 66419.5234375
32 60348.78125
33 54922.5
34 50057.0703125
35 45688.21875
36 41759.15625
37 38217.9296875
38 35020.71484375
39 32128.408203125
40 29506.46484375
41 27125.662109375
42 24965.5
43 22999.2265625
44 21205.556640625
45 19569.96484375
46 18077.763671875
47 16711.833984375
48 15460.9296875
49 14314.2001953125
50 13261.8955078125
51 12296.119140625
52 11408.5732421875
53 10591.953125
54 9840.283203125
55 9147.1806640625
56 8507.8798828125
57 7917.9345703125
58 7373.353515625
59 6870.53759765625
60 6405.25732421875
61 5974.54541015625

378 0.0030396836809813976
379 0.0029377478640526533
380 0.0028406621422618628
381 0.0027464451268315315
382 0.002655335236340761
383 0.0025670581962913275
384 0.0024835686199367046
385 0.002401078585535288
386 0.002319528954103589
387 0.0022454706486314535
388 0.0021712377201765776
389 0.0021010541822761297
390 0.0020323721691966057
391 0.0019673448987305164
392 0.0019046784145757556
393 0.0018419533735141158
394 0.0017850095173344016
395 0.0017304078210145235
396 0.0016753755044192076
397 0.0016222045524045825
398 0.0015722067328169942
399 0.0015224460512399673
400 0.0014768141554668546
401 0.0014324082294479012
402 0.0013877797173336148
403 0.001349163125269115
404 0.0013086075196042657
405 0.0012681756634265184
406 0.0012306474382057786
407 0.0011952304048463702
408 0.0011605907930061221
409 0.0011260902974754572
410 0.0010932747973129153
411 0.0010622998233884573
412 0.0010315238032490015
413 0.0010010921396315098
414 0.0009726813295856118
415 0.000944778323173523
416 0.00091793207

In [15]:
w1 = torch.randn(input_dim, hidden_dim, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(hidden_dim, output_dim, device=device, dtype=dtype, requires_grad=True)

In [16]:
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 34531312.0
1 29785736.0
2 27738524.0
3 24380654.0
4 18855230.0
5 12686971.0
6 7739644.0
7 4557467.0
8 2762474.75
9 1789224.125
10 1251928.75
11 936851.375
12 736992.5
13 599729.5625
14 499081.75
15 421611.25
16 360165.0
17 310202.21875
18 268923.875
19 234553.796875
20 205547.015625
21 180877.203125
22 159742.640625
23 141572.875
24 125868.0703125
25 112229.8125
26 100319.0078125
27 89889.0078125
28 80729.9140625
29 72658.9765625
30 65539.6328125
31 59236.203125
32 53642.4453125
33 48652.78125
34 44200.859375
35 40218.22265625
36 36649.9453125
37 33450.30078125
38 30575.51171875
39 27989.33984375
40 25654.171875
41 23541.689453125
42 21623.5859375
43 19883.49609375
44 18304.185546875
45 16867.544921875
46 15560.310546875
47 14368.34765625
48 13280.9365234375
49 12286.408203125
50 11376.7880859375
51 10543.865234375
52 9779.5263671875
53 9077.95703125
54 8433.232421875
55 7840.322265625
56 7294.947265625
57 6792.49462890625
58 6328.98974609375
59 5899.7421875
60 5503.1826171875
61 513

379 0.003205407178029418
380 0.0030923315789550543
381 0.002985327271744609
382 0.0028836752753704786
383 0.0027830598410218954
384 0.002688275882974267
385 0.002597820945084095
386 0.002508633304387331
387 0.002425374696031213
388 0.002343389205634594
389 0.0022633131593465805
390 0.00218779593706131
391 0.00211345124989748
392 0.0020442786626517773
393 0.0019790788646787405
394 0.0019125549588352442
395 0.001849759486503899
396 0.001789786503650248
397 0.0017329362453892827
398 0.0016798072028905153
399 0.001624282798729837
400 0.001573839457705617
401 0.0015208232216536999
402 0.0014739768812432885
403 0.0014290327671915293
404 0.0013853521086275578
405 0.0013423720374703407
406 0.0012999941827729344
407 0.0012591048143804073
408 0.0012202493380755186
409 0.001183751504868269
410 0.0011483118869364262
411 0.0011136020766571164
412 0.0010807253420352936
413 0.0010487277759239078
414 0.0010189750464633107
415 0.000988585059531033
416 0.0009591634734533727
417 0.0009304715204052627
418

In [17]:
# Create random Tensors to hold inputs and outputs
x = torch.randn(batch_size, input_dim)
y = torch.randn(batch_size, output_dim)

In [21]:
model = torch.nn.Sequential(
    torch.nn.Linear(input_dim, hidden_dim),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_dim, output_dim),
)


In [22]:
loss_fn = torch.nn.MSELoss(reduction='sum')

In [23]:
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 619.0841674804688
1 618.5880737304688
2 618.0925903320312
3 617.59765625
4 617.103271484375
5 616.6098022460938
6 616.1168212890625
7 615.6245727539062
8 615.1329345703125
9 614.6419677734375
10 614.151611328125
11 613.6618041992188
12 613.1724853515625
13 612.6839599609375
14 612.1962890625
15 611.7094116210938
16 611.22314453125
17 610.7374267578125
18 610.2522583007812
19 609.7681884765625
20 609.284912109375
21 608.8026123046875
22 608.3211059570312
23 607.840576171875
24 607.3605346679688
25 606.881103515625
26 606.402099609375
27 605.9238891601562
28 605.4461669921875
29 604.9691162109375
30 604.49267578125
31 604.0167236328125
32 603.54150390625
33 603.0677490234375
34 602.594482421875
35 602.121826171875
36 601.649658203125
37 601.178466796875
38 600.7079467773438
39 600.2379760742188
40 599.7686157226562
41 599.300048828125
42 598.8319702148438
43 598.3644409179688
44 597.8973999023438
45 597.4308471679688
46 596.96484375
47 596.4993286132812
48 596.0343627929688
49 595.5701

403 463.6087646484375
404 463.305908203125
405 463.0032958984375
406 462.70135498046875
407 462.3996276855469
408 462.09820556640625
409 461.7972717285156
410 461.4965515136719
411 461.19598388671875
412 460.895751953125
413 460.59588623046875
414 460.2962341308594
415 459.99676513671875
416 459.6977233886719
417 459.3990173339844
418 459.1006164550781
419 458.80267333984375
420 458.5048522949219
421 458.2073059082031
422 457.9098205566406
423 457.6125183105469
424 457.3155822753906
425 457.0188903808594
426 456.7223815917969
427 456.4264831542969
428 456.130859375
429 455.8354797363281
430 455.5408020019531
431 455.24652099609375
432 454.95220947265625
433 454.658447265625
434 454.3649597167969
435 454.0716247558594
436 453.77850341796875
437 453.4858703613281
438 453.193115234375
439 452.9008483886719
440 452.60906982421875
441 452.31768798828125
442 452.0263977050781
443 451.7359924316406
444 451.44561767578125
445 451.15545654296875
446 450.8655700683594
447 450.5760192871094
448 4

In [24]:
# Create random Tensors to hold inputs and outputs
x = torch.randn(batch_size, input_dim)
y = torch.randn(batch_size, output_dim)

In [26]:
model = torch.nn.Sequential(
    torch.nn.Linear(input_dim, hidden_dim),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_dim, output_dim),
)

In [27]:
loss_fn = torch.nn.MSELoss(reduction='sum')

In [28]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [29]:
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 654.2635498046875
1 654.0885620117188
2 653.9136352539062
3 653.7387084960938
4 653.5638427734375
5 653.3890991210938
6 653.2144165039062
7 653.039794921875
8 652.8652954101562
9 652.6907958984375
10 652.516357421875
11 652.3419189453125
12 652.1675415039062
13 651.9933471679688
14 651.819091796875
15 651.6448974609375
16 651.470703125
17 651.2965698242188
18 651.1224975585938
19 650.9484252929688
20 650.7744140625
21 650.6004638671875
22 650.426513671875
23 650.2526245117188
24 650.0787963867188
25 649.9050903320312
26 649.7315063476562
27 649.5579223632812
28 649.3843383789062
29 649.2108154296875
30 649.037353515625
31 648.8639526367188
32 648.6905517578125
33 648.5171508789062
34 648.3438720703125
35 648.170654296875
36 647.9974365234375
37 647.82421875
38 647.6510620117188
39 647.4779663085938
40 647.304931640625
41 647.1318969726562
42 646.9589233398438
43 646.7859497070312
44 646.6130981445312
45 646.4402465820312
46 646.2674560546875
47 646.0947875976562
48 645.9220581054688


415 586.1666870117188
416 586.0142822265625
417 585.8618774414062
418 585.7095336914062
419 585.55712890625
420 585.4049072265625
421 585.2527465820312
422 585.1006469726562
423 584.9486083984375
424 584.796630859375
425 584.6444702148438
426 584.492431640625
427 584.340576171875
428 584.1886596679688
429 584.0368041992188
430 583.885009765625
431 583.7332763671875
432 583.58154296875
433 583.4298095703125
434 583.278076171875
435 583.1265869140625
436 582.9751586914062
437 582.8237915039062
438 582.6723022460938
439 582.5208740234375
440 582.3695068359375
441 582.2181396484375
442 582.0669555664062
443 581.9157104492188
444 581.7645874023438
445 581.613525390625
446 581.46240234375
447 581.3114013671875
448 581.160400390625
449 581.0095825195312
450 580.858642578125
451 580.707763671875
452 580.5570068359375
453 580.4063110351562
454 580.255615234375
455 580.1049194335938
456 579.9542236328125
457 579.8037109375
458 579.6531982421875
459 579.5028686523438
460 579.3523559570312
461 579

In [30]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(input_dim, hidden_dim)
        self.linear2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred



# Create random Tensors to hold inputs and outputs
x = torch.randn(batch_size, input_dim)
y = torch.randn(batch_size, output_dim)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(input_dim, hidden_dim, output_dim)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 792.139404296875
1 733.0504150390625
2 682.4179077148438
3 638.3485717773438
4 599.0303955078125
5 563.7425537109375
6 531.515869140625
7 501.8516540527344
8 474.2546081542969
9 448.5835876464844
10 424.53607177734375
11 401.9566955566406
12 380.6769104003906
13 360.6289367675781
14 341.69122314453125
15 323.6953430175781
16 306.6184997558594
17 290.3197021484375
18 274.7073059082031
19 259.656982421875
20 245.2937469482422
21 231.6143035888672
22 218.5709991455078
23 206.15737915039062
24 194.32659912109375
25 183.0753173828125
26 172.39547729492188
27 162.28619384765625
28 152.69544982910156
29 143.6285400390625
30 135.048828125
31 126.9215316772461
32 119.25977325439453
33 112.0439224243164
34 105.25660705566406
35 98.87010955810547
36 92.83873748779297
37 87.17607879638672
38 81.85491180419922
39 76.85790252685547
40 72.16357421875
41 67.75545501708984
42 63.61322784423828
43 59.73256301879883
44 56.08938980102539
45 52.658512115478516
46 49.4407958984375
47 46.42708969116211
48 

375 0.00012305733980610967
376 0.00011927060404559597
377 0.0001156047364929691
378 0.00011205022019566968
379 0.00010860854672500864
380 0.000105269682535436
381 0.00010203843703493476
382 9.892821981338784e-05
383 9.590686386218295e-05
384 9.298405348090455e-05
385 9.015003888634965e-05
386 8.740356133785099e-05
387 8.473949128529057e-05
388 8.216092828661203e-05
389 7.965771510498598e-05
390 7.72370767663233e-05
391 7.48891252442263e-05
392 7.26176003809087e-05
393 7.041213393677026e-05
394 6.827577453805134e-05
395 6.620574276894331e-05
396 6.419856799766421e-05
397 6.224818207556382e-05
398 6.0362704971339554e-05
399 5.853378024767153e-05
400 5.676051659975201e-05
401 5.504591899807565e-05
402 5.33813945367001e-05
403 5.176584454602562e-05
404 5.0200560508528724e-05
405 4.868550968240015e-05
406 4.7214460209943354e-05
407 4.5788983698002994e-05
408 4.44089419033844e-05
409 4.307184644858353e-05
410 4.1773215343710035e-05
411 4.051451833220199e-05
412 3.9290989661822096e-05
413 3.8

In [2]:
X = torch.tensor(([2, 9], [1, 5], [3, 6]), dtype=torch.float) # 3 X 2 tensor
y = torch.tensor(([92], [100], [89]), dtype=torch.float) # 3 X 1 tensor
xPredicted = torch.tensor(([4, 8]), dtype=torch.float) # 1 X 2 tensor

In [3]:

print(X.size())
print(y.size())

torch.Size([3, 2])
torch.Size([3, 1])


In [4]:
# max function returns both a tensor and the corresponding indices. 
#We use _ to capture the indices which we won't use here 
#because we are only interested in the max values to conduct the scaling
X_max, _ = torch.max(X, 0)
xPredicted_max, _ = torch.max(xPredicted, 0)

X = torch.div(X, X_max)
xPredicted = torch.div(xPredicted, xPredicted_max)
y = y / 100  # max test score is 100

In [5]:
class Neural_Network(nn.Module):
    def __init__(self, ):
        super(Neural_Network, self).__init__()
        # parameters
        # TODO: parameters can be parameterized instead of declaring them here
        self.inputSize = 2
        self.outputSize = 1
        self.hiddenSize = 3
        
        # weights
        self.W1 = torch.randn(self.inputSize, self.hiddenSize) # 3 X 2 tensor
        self.W2 = torch.randn(self.hiddenSize, self.outputSize) # 3 X 1 tensor
        
    def forward(self, X):
        self.z = torch.matmul(X, self.W1) # 3 X 3 ".dot" does not broadcast in PyTorch
        self.z2 = self.sigmoid(self.z) # activation function
        self.z3 = torch.matmul(self.z2, self.W2)
        o = self.sigmoid(self.z3) # final activation function
        return o
        
    def sigmoid(self, s):
        return 1 / (1 + torch.exp(-s))
    
    def sigmoidPrime(self, s):
        # derivative of sigmoid
        return s * (1 - s)
    
    def backward(self, X, y, o):
        self.o_error = y - o # error in output
        self.o_delta = self.o_error * self.sigmoidPrime(o) # derivative of sig to error
        self.z2_error = torch.matmul(self.o_delta, torch.t(self.W2))
        self.z2_delta = self.z2_error * self.sigmoidPrime(self.z2)
        self.W1 += torch.matmul(torch.t(X), self.z2_delta)
        self.W2 += torch.matmul(torch.t(self.z2), self.o_delta)
        
    def train(self, X, y):
        # forward + backward pass for training
        o = self.forward(X)
        self.backward(X, y, o)
        
    def saveWeights(self, model):
        # we will use the PyTorch internal storage functions
        torch.save(model, "NN")
        # you can reload model with all the weights and so forth with:
        # torch.load("NN")
        
    def predict(self):
        print ("Predicted data based on trained weights: ")
        print ("Input (scaled): \n" + str(xPredicted))
        print ("Output: \n" + str(self.forward(xPredicted)))

In [6]:
def __init__(self, ):
    super(Neural_Network, self).__init__()
    # parameters
    # TODO: parameters can be parameterized instead of declaring them here
    self.inputSize = 2
    self.outputSize = 1
    self.hiddenSize = 3
# weights
    self.W1 = torch.randn(self.inputSize, self.hiddenSize) 
    self.W2 = torch.randn(self.hiddenSize, self.outputSize)

In [7]:
def forward(self, X):
    self.z = torch.matmul(X, self.W1) 
    self.z2 = self.sigmoid(self.z) # activation function
    self.z3 = torch.matmul(self.z2, self.W2)
    o = self.sigmoid(self.z3) # final activation function
    return o

In [8]:
def backward(self, X, y, o):
    self.o_error = y - o # error in output
    self.o_delta = self.o_error * self.sigmoidPrime(o) 
    self.z2_error = torch.matmul(self.o_delta, torch.t(self.W2))
    self.z2_delta = self.z2_error * self.sigmoidPrime(self.z2)
    self.W1 += torch.matmul(torch.t(X), self.z2_delta)
    self.W2 += torch.matmul(torch.t(self.z2), self.o_delta)

In [9]:
NN = Neural_Network()

In [10]:
for i in range(1000):  # trains the NN 1,000 times
    print ("#" + str(i) + " Loss: " + str(torch.mean((y - NN(X))**2).detach().item()))  # mean sum squared loss
    NN.train(X, y)
NN.saveWeights(NN)
NN.predict()

#0 Loss: 0.10512202978134155
#1 Loss: 0.08384952694177628
#2 Loss: 0.06776101142168045
#3 Loss: 0.05552127957344055
#4 Loss: 0.046129703521728516
#5 Loss: 0.03884904459118843
#6 Loss: 0.03314093127846718
#7 Loss: 0.028613435104489326
#8 Loss: 0.024980681017041206
#9 Loss: 0.022033102810382843
#10 Loss: 0.019615909084677696
#11 Loss: 0.017613735049962997
#12 Loss: 0.015939762815833092
#13 Loss: 0.014527995139360428
#14 Loss: 0.013327733613550663
#15 Loss: 0.012299705296754837
#16 Loss: 0.011413104832172394
#17 Loss: 0.01064358651638031
#18 Loss: 0.009971750900149345
#19 Loss: 0.009381978772580624
#20 Loss: 0.008861619047820568
#21 Loss: 0.008400346152484417
#22 Loss: 0.007989642210304737
#23 Loss: 0.00762246735394001
#24 Loss: 0.007292961701750755
#25 Loss: 0.006996193900704384
#26 Loss: 0.0067280251532793045
#27 Loss: 0.006484937388449907
#28 Loss: 0.006263914052397013
#29 Loss: 0.006062424276024103
#30 Loss: 0.005878235679119825
#31 Loss: 0.00570945767685771
#32 Loss: 0.00555443344637

#503 Loss: 0.0023963304702192545
#504 Loss: 0.00239451858215034
#505 Loss: 0.002392713213339448
#506 Loss: 0.0023909055162221193
#507 Loss: 0.002389096887782216
#508 Loss: 0.002387288259342313
#509 Loss: 0.002385478001087904
#510 Loss: 0.0023836686741560698
#511 Loss: 0.002381860977038741
#512 Loss: 0.0023800514172762632
#513 Loss: 0.002378236735239625
#514 Loss: 0.002376429969444871
#515 Loss: 0.002374622505158186
#516 Loss: 0.002372811548411846
#517 Loss: 0.0023710010573267937
#518 Loss: 0.0023691861424595118
#519 Loss: 0.0023673747200518847
#520 Loss: 0.00236556027084589
#521 Loss: 0.002363750943914056
#522 Loss: 0.0023619404528290033
#523 Loss: 0.002360125770792365
#524 Loss: 0.002358313649892807
#525 Loss: 0.002356501528993249
#526 Loss: 0.0023546821903437376
#527 Loss: 0.0023528714664280415
#528 Loss: 0.0023510565515607595
#529 Loss: 0.002349247457459569
#530 Loss: 0.002347423927858472
#531 Loss: 0.0023456162307411432
#532 Loss: 0.0023437992203980684
#533 Loss: 0.0023419824428856

#771 Loss: 0.0019125706749036908
#772 Loss: 0.001910814899019897
#773 Loss: 0.0019090613350272179
#774 Loss: 0.0019073073053732514
#775 Loss: 0.0019055576995015144
#776 Loss: 0.0019038012251257896
#777 Loss: 0.0019020545296370983
#778 Loss: 0.0019003035267814994
#779 Loss: 0.0018985526403412223
#780 Loss: 0.0018968075746670365
#781 Loss: 0.0018950592493638396
#782 Loss: 0.0018933146493509412
#783 Loss: 0.0018915646942332387
#784 Loss: 0.001889820327050984
#785 Loss: 0.001888072700239718
#786 Loss: 0.0018863306613638997
#787 Loss: 0.0018845921149477363
#788 Loss: 0.001882848679088056
#789 Loss: 0.0018811059417203069
#790 Loss: 0.0018793646013364196
#791 Loss: 0.0018776260549202561
#792 Loss: 0.0018758842488750815
#793 Loss: 0.001874149776995182
#794 Loss: 0.0018724128603935242
#795 Loss: 0.0018706744303926826
#796 Loss: 0.0018689380958676338
#797 Loss: 0.0018672067672014236
#798 Loss: 0.0018654754385352135
#799 Loss: 0.0018637403845787048
#800 Loss: 0.0018620073096826673
#801 Loss: 0.00

  "type " + obj.__name__ + ". It won't be checked "
