source : https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

**Implementing a network using numpy**

In [1]:
# Fully connected network with one hidden layer and a ReLU activation,
# trained by hand-written gradient descent in plain NumPy.
# ---------------------------------------------------------------------

import numpy as np

# Problem sizes:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights for the two linear layers (no bias terms).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.clip(h, 0, None)
    y_pred = h_relu @ w2

    # Loss is the sum of squared errors over the whole batch.
    loss = np.sum((y_pred - y) ** 2)
    print(t, loss)

    # Backward pass: gradients derived by hand with the chain rule
    # (work them out on paper to see where each formula comes from).
    grad_y_pred = 2 * (y_pred - y)
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29551108.511124745
1 23935964.539800826
2 21519345.489000596
3 19335484.333840072
4 16196407.211910473
5 12377271.192525791
6 8653124.451074753
7 5709009.827877121
8 3684925.0165451607
9 2414218.4221959184
10 1645357.7786513176
11 1179296.2784598884
12 888572.552378549
13 698738.9261924803
14 567732.7206531004
15 472324.5032067926
16 399526.4883436852
17 341929.23250627005
18 295036.79159262095
19 256176.66785078513
20 223600.46962920477
21 196009.3923370939
22 172454.69548786635
23 152244.94337021548
24 134790.89423167898
25 119653.06603267323
26 106488.89795270059
27 95011.18399671177
28 84968.13807557675
29 76150.31364551933
30 68382.84604562736
31 61522.256088432965
32 55459.595754935784
33 50091.17332563544
34 45314.256111426155
35 41058.28553833549
36 37258.8277710454
37 33860.74575785809
38 30818.13060426426
39 28088.625665662163
40 25633.888281971507
41 23424.880077316484
42 21439.383347158848
43 19645.26924358641
44 18022.27271539608
45 16552.123968522465
46 15223.8209595826

## NumPy is great, but it can't use GPUs; PyTorch tensors can

In [6]:
# Same two-layer network as the NumPy version, written with PyTorch tensors
# and hand-written backprop. Tensors can live on a GPU, NumPy arrays cannot.
import torch

dtype = torch.float
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# cell still works on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# N is batch size ;
# D_in is dimensions of input
# H is hidden dimension
# D_out is dimension of output
N, D_in, H, D_out = 64, 1000, 100, 10

# random i/p and o/p
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# random init weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # fwd pass: linear -> ReLU (clamp at 0) -> linear
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # compute loss; .item() extracts the value of the one-element tensor
    # as a plain Python number (not a NumPy object)
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # backprop: manual chain rule, mirroring the NumPy implementation
    grad_y_pred = 2 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    # ReLU passes no gradient where its input was negative
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # update weights (plain gradient descent, in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 25485804.0
1 19858852.0
2 18547880.0
3 18936574.0
4 19243104.0
5 18332210.0
6 15643934.0
7 11918927.0
8 8170188.0
9 5239007.5
10 3263819.25
11 2055883.25
12 1345048.875
13 930510.625
14 682681.6875
15 528005.75
16 425712.75
17 353937.0
18 300651.375
19 259185.1875
20 225817.09375
21 198204.0625
22 174930.15625
23 155066.484375
24 137962.109375
25 123147.3125
26 110238.0078125
27 98936.890625
28 88991.484375
29 80216.140625
30 72454.0625
31 65560.65625
32 59431.53515625
33 53970.4765625
34 49089.79296875
35 44718.5703125
36 40794.12890625
37 37267.7734375
38 34091.54296875
39 31225.1953125
40 28633.4140625
41 26287.130859375
42 24160.3828125
43 22229.2734375
44 20472.626953125
45 18872.7578125
46 17414.796875
47 16084.833984375
48 14871.392578125
49 13760.935546875
50 12743.42578125
51 11810.470703125
52 10953.6904296875
53 10166.298828125
54 9442.1337890625
55 8775.7001953125
56 8161.42041015625
57 7594.6005859375
58 7071.50146484375
59 6589.0498046875
60 6142.9521484375
61 5730.3867

### Now, utilising the power of PyTorch's autograd

In [7]:
# Two-layer network again, but the backward pass is left to autograd.
import torch

dtype = torch.float
device = torch.device("cpu")

# Problem sizes:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets: plain tensors, no gradients tracked for them.
x = torch.randn((N, D_in), device=device, dtype=dtype)
y = torch.randn((N, D_out), device=device, dtype=dtype)

# Weights are created with requires_grad=True so autograd records every
# operation on them and can fill in their .grad during backward().
w1 = torch.randn((D_in, H), device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass in one expression: linear -> ReLU (clamp at 0) -> linear.
    y_pred = torch.mm(torch.mm(x, w1).clamp(min=0), w2)

    # Sum of squared errors; kept as a tensor so backward() can be called.
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Autograd computes d(loss)/d(w) for every tensor that was created
    # with requires_grad=True and stores it in that tensor's .grad.
    loss.backward()

    # The update itself must not be tracked by autograd.
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Gradients accumulate across backward() calls, so reset them.
        w1.grad.zero_()
        w2.grad.zero_()
        

0 23622586.0
1 19423820.0
2 19403704.0
3 21335138.0
4 23196608.0
5 23058904.0
6 19778126.0
7 14474153.0
8 9163596.0
9 5323730.0
10 3018281.5
11 1781307.75
12 1135863.25
13 793935.0625
14 601757.8125
15 484606.9375
16 406264.03125
17 349232.6875
18 304963.0625
19 268997.5625
20 238933.8125
21 213372.75
22 191330.84375
23 172168.0625
24 155399.671875
25 140639.015625
26 127596.140625
27 116022.5546875
28 105728.328125
29 96535.6640625
30 88300.2265625
31 80899.5234375
32 74235.0234375
33 68220.265625
34 62779.2734375
35 57850.5
36 53381.125
37 49315.6171875
38 45609.453125
39 42227.98046875
40 39137.09765625
41 36307.01953125
42 33713.234375
43 31334.431640625
44 29147.8359375
45 27133.892578125
46 25278.8515625
47 23568.33203125
48 21988.765625
49 20528.48046875
50 19178.3828125
51 17928.53515625
52 16770.884765625
53 15696.6728515625
54 14699.33984375
55 13773.4833984375
56 12913.3525390625
57 12112.462890625
58 11367.1669921875
59 10672.9326171875
60 10025.984375
61 9422.7265625
62 88

### Now let's use PyTorch's Keras-like high-level API: the `nn` module

In [8]:
# Two-layer network built from torch.nn layers; the weight update is still
# written by hand (plain gradient descent over model.parameters()).
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Sequential chains the layers; each Linear holds its own weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)

# reduction='sum' gives the summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass and loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, then backpropagate the new loss.
    model.zero_grad()
    loss.backward()

    # Manual gradient-descent step; no_grad keeps it out of the graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

0 704.0001220703125
1 645.2449340820312
2 595.6692504882812
3 553.0368041992188
4 515.7166137695312
5 482.5581970214844
6 452.6296081542969
7 425.54693603515625
8 400.7381896972656
9 377.9009704589844
10 356.6917419433594
11 336.98291015625
12 318.6568298339844
13 301.4396667480469
14 285.1703796386719
15 269.87799072265625
16 255.4132537841797
17 241.6710205078125
18 228.68649291992188
19 216.37405395507812
20 204.68751525878906
21 193.5727081298828
22 182.98512268066406
23 172.94448852539062
24 163.35903930664062
25 154.2490692138672
26 145.60406494140625
27 137.38015747070312
28 129.5695343017578
29 122.17377471923828
30 115.15148162841797
31 108.4977798461914
32 102.20355224609375
33 96.23759460449219
34 90.60570526123047
35 85.28981018066406
36 80.26959228515625
37 75.54010009765625
38 71.09806823730469
39 66.90679168701172
40 62.96592330932617
41 59.25621795654297
42 55.762210845947266
43 52.482852935791016
44 49.39862060546875
45 46.49058532714844
46 43.754310607910156
47 41.179

In [9]:
# Same torch.nn model, but the weight update is delegated to a torch.optim
# optimizer (Adam) instead of being written by hand.
import torch

# Batch size, input dimension, hidden dimension, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, D_out)
)

# reduction='sum' gives the summed (not averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-4
# The optimizer is handed every model parameter and updates them in step().
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):

    # Forward pass and loss.
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients of exactly the tensors the optimizer will update;
    # they would otherwise accumulate across backward() calls.
    optimizer.zero_grad()

    loss.backward()

    # Apply one Adam update using the freshly computed gradients.
    optimizer.step()

0 655.1982421875
1 638.2799072265625
2 621.8156127929688
3 605.819580078125
4 590.2789306640625
5 575.23095703125
6 560.5726928710938
7 546.2724609375
8 532.3982543945312
9 518.9382934570312
10 505.88525390625
11 493.2015380859375
12 480.8591613769531
13 468.900390625
14 457.3102111816406
15 446.1036071777344
16 435.3158874511719
17 424.8406677246094
18 414.6940002441406
19 404.7720642089844
20 395.0428466796875
21 385.55841064453125
22 376.39251708984375
23 367.4981994628906
24 358.825927734375
25 350.3348388671875
26 342.03369140625
27 333.9560546875
28 326.05914306640625
29 318.367431640625
30 310.8645935058594
31 303.55218505859375
32 296.42559814453125
33 289.46234130859375
34 282.6187438964844
35 275.933349609375
36 269.395263671875
37 262.9846496582031
38 256.7025451660156
39 250.52943420410156
40 244.48902893066406
41 238.58958435058594
42 232.79098510742188
43 227.11453247070312
44 221.5706329345703
45 216.12948608398438
46 210.80398559570312
47 205.59178161621094
48 200.47567