# Torch Study
## Three main features of PyTorch
 - Tensor: similar to numpy.array but can run on GPUs
 - Autograd: automatic differentiation for all operations on Tensors
 - NN: nn.module, framework to build neural network easily

## Materials
 - [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)
 - [PyTorch Examples](https://github.com/jcjohnson/pytorch-examples)

## Examples
 - 3 Layers Neural Network
   - Input Layer: 1000 neurons
   - Hidden Layer: 100 neurons
      - ReLU Activation Function
   - Output Layer: 10 neurons
- Training Data: 100 samples
   - Learning Rate: 1e-6
   - Training Iterations: 500

In [1]:
# Numpy Version
import numpy as np

N, D_in, D_h, D_out = 100, 1000, 100, 1

# generate the training data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# init the weights
w1 = np.random.randn(D_in, D_h)
w2 = np.random.randn(D_h, D_out)

learning_rate = 1e-6

for t in range(500):
    # forward inference
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    # calculate the loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)
    # back-propagation
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    # update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 4578347.941021423
1 2471386.0550409984
2 1865985.8169652259
3 3518134.9923215457
4 16376692.467875663
5 93889750.51186724
6 474853966.242233
7 993316544.0218148
8 38490045.170285694
9 17028018.9979011
10 10448400.295304652
11 7196131.101538438
12 5282199.816150585
13 4033626.3417542824
14 3167301.049943808
15 2538802.706842456
16 2066920.0160529031
17 1703388.0161938835
18 1417767.3379489803
19 1189900.9767300896
20 1005686.9477772282
21 855074.441256687
22 730853.337463073
23 627655.512285466
24 541338.3082262024
25 468750.69251051976
26 407510.7111747361
27 355578.9804186145
28 311362.79884176777
29 273606.5310706841
30 241234.08519978856
31 213426.0353554436
32 189469.81166678556
33 168766.03131946674
34 150850.84380896413
35 135312.27452102417
36 121808.48789865169
37 110046.90297558189
38 99775.54929768015
39 90793.07504443655
40 82917.42556246182
41 75991.90623918288
42 69892.15909631057
43 64508.966622447435
44 59742.53656215685
45 55509.59114687612
46 51740.3089052528
47 4837

In [2]:
# Tensor Version
import torch

# cpu or cuda
device = torch.device('cpu')

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate the training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights
w1 = torch.randn(D_in, D_h, device=device)
w2 = torch.randn(D_h, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # forward inference
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    # calculate the loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    # back-propagation
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    # update the weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 45237520.0
1 98879256.0
2 487360192.0
3 1159751680.0
4 35282896.0
5 17066524.0
6 11218501.0
7 8219145.0
8 6383037.0
9 5152704.5
10 4273749.5
11 3620350.25
12 3117377.5
13 2719061.25
14 2398089.25
15 2135568.25
16 1916720.375
17 1732083.5
18 1574517.5
19 1439053.5
20 1320940.0
21 1217540.0
22 1126268.0
23 1045177.75
24 973052.6875
25 908549.375
26 850891.375
27 798805.375
28 751480.125
29 708353.75
30 668874.0625
31 632678.0625
32 599309.75
33 568488.5625
34 539964.875
35 513484.4375
36 488958.15625
37 466154.5
38 444889.6875
39 425082.625
40 406638.5
41 389375.5625
42 373156.53125
43 357935.625
44 343618.96875
45 330108.59375
46 317370.03125
47 305305.875
48 293919.0625
49 283175.0625
50 273020.21875
51 263383.0
52 254226.5625
53 245538.375
54 237294.84375
55 229470.359375
56 222007.234375
57 214884.03125
58 208096.859375
59 201628.265625
60 195466.71875
61 189572.015625
62 183930.78125
63 178536.78125
64 173373.296875
65 168424.8125
66 163680.75
67 159119.984375
68 154747.09375
69 1

In [3]:
# Autograd Version
import torch

# cpu or cuda
device = torch.device('cpu')

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights
w1 = torch.randn(D_in, D_h, device=device, requires_grad=True)
w2 = torch.randn(D_h, D_out, device=device, requires_grad=True)

# training
learning_rate = 1e-6
for t in range(500):
    # forward inference
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    # calculate the loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)
    # use autograd to do the back-propagation
    loss.backward()
    # update the weights, prevent torch do autograd for this part
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # zero grads
        w1.grad.zero_()
        w2.grad.zero_()

0 tensor(52220128., grad_fn=<SumBackward0>)
1 tensor(1.1992e+08, grad_fn=<SumBackward0>)
2 tensor(4.7306e+08, grad_fn=<SumBackward0>)
3 tensor(8.2283e+08, grad_fn=<SumBackward0>)
4 tensor(28214168., grad_fn=<SumBackward0>)
5 tensor(14514134., grad_fn=<SumBackward0>)
6 tensor(9282458., grad_fn=<SumBackward0>)
7 tensor(6531954., grad_fn=<SumBackward0>)
8 tensor(4863534., grad_fn=<SumBackward0>)
9 tensor(3760603.5000, grad_fn=<SumBackward0>)
10 tensor(2989122., grad_fn=<SumBackward0>)
11 tensor(2428747.2500, grad_fn=<SumBackward0>)
12 tensor(2009219.5000, grad_fn=<SumBackward0>)
13 tensor(1687784.5000, grad_fn=<SumBackward0>)
14 tensor(1436648., grad_fn=<SumBackward0>)
15 tensor(1237514.3750, grad_fn=<SumBackward0>)
16 tensor(1077442.8750, grad_fn=<SumBackward0>)
17 tensor(947199.6250, grad_fn=<SumBackward0>)
18 tensor(840193.3750, grad_fn=<SumBackward0>)
19 tensor(751451.7500, grad_fn=<SumBackward0>)
20 tensor(677170.1250, grad_fn=<SumBackward0>)
21 tensor(614356.6875, grad_fn=<SumBackwa

In [3]:
# MyReLU Version
import torch
from my_relu import MyReLU

# cpu or cuda
device = torch.device('cpu')

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights
w1 = torch.randn(D_in, D_h, device=device, requires_grad=True)
w2 = torch.randn(D_h, D_out, device=device, requires_grad=True)

# training
learning_rate = 1e-6
for t in range(500):
    # forward inference, with MyReLU
    y_pred = MyReLU.apply(x.mm(w1)).mm(w2)
    # calculate the loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)
    # use autograd to do the back-propagation
    # gonna call MyReLU.backward()
    loss.backward()
    # update the weights, prevent torch do autograd for this part
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # zero grads
        w1.grad.zero_()
        w2.grad.zero_()

0 tensor(64044740., grad_fn=<SumBackward0>)
1 tensor(1.8535e+08, grad_fn=<SumBackward0>)
2 tensor(6.4584e+08, grad_fn=<SumBackward0>)
3 tensor(4.4602e+08, grad_fn=<SumBackward0>)
4 tensor(5713088., grad_fn=<SumBackward0>)
5 tensor(4366853.5000, grad_fn=<SumBackward0>)
6 tensor(3461743.7500, grad_fn=<SumBackward0>)
7 tensor(2821221.5000, grad_fn=<SumBackward0>)
8 tensor(2351187.7500, grad_fn=<SumBackward0>)
9 tensor(1996174.5000, grad_fn=<SumBackward0>)
10 tensor(1722102., grad_fn=<SumBackward0>)
11 tensor(1506259.5000, grad_fn=<SumBackward0>)
12 tensor(1333113.2500, grad_fn=<SumBackward0>)
13 tensor(1191878.3750, grad_fn=<SumBackward0>)
14 tensor(1074925.5000, grad_fn=<SumBackward0>)
15 tensor(976912.8750, grad_fn=<SumBackward0>)
16 tensor(893680.7500, grad_fn=<SumBackward0>)
17 tensor(822140.6250, grad_fn=<SumBackward0>)
18 tensor(760085., grad_fn=<SumBackward0>)
19 tensor(705818.1875, grad_fn=<SumBackward0>)
20 tensor(657946., grad_fn=<SumBackward0>)
21 tensor(615333.1250, grad_fn=<S

In [6]:
# NN Version
import torch

# cpu or cuda
device = torch.device('cpu')

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# define the model
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, D_h),
    torch.nn.ReLU(),
    torch.nn.Linear(D_h, D_out),
).to(device)

# define the loss function
loss_fn = torch.nn.MSELoss(reduction='sum')

# training, use bigger learning rate
learning_rate = 1e-4
for t in range(500):
    # forward inference
    y_pred = model(x)
    # calculate the loss
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    # back-propagation
    model.zero_grad()
    loss.backward()
    # update the model parameters(weights)
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 1055.11572265625
1 979.7573852539062
2 914.0875244140625
3 856.3018798828125
4 804.7339477539062
5 758.6033325195312
6 716.502685546875
7 677.8682861328125
8 641.9003295898438
9 608.1055908203125
10 576.0472412109375
11 545.6200561523438
12 516.5584716796875
13 488.8470153808594
14 462.3072814941406
15 436.818603515625
16 412.47161865234375
17 389.1689453125
18 366.92864990234375
19 345.71099853515625
20 325.39764404296875
21 306.02178955078125
22 287.5928039550781
23 270.0403137207031
24 253.38345336914062
25 237.55694580078125
26 222.5810089111328
27 208.38523864746094
28 194.99539184570312
29 182.37692260742188
30 170.48968505859375
31 159.3143310546875
32 148.81051635742188
33 138.978271484375
34 129.75088500976562
35 121.11564636230469
36 113.04875183105469
37 105.51568603515625
38 98.45283508300781
39 91.86031341552734
40 85.70729064941406
41 79.97239685058594
42 74.62997436523438
43 69.65254211425781
44 65.01960754394531
45 60.70447540283203
46 56.688720703125
47 52.9558029174

In [7]:
# Custom Module Version
import torch
from my_nn import TwoLayerNet

# cpu or cuda
device = torch.device('cpu')

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# define the model
model = TwoLayerNet(D_in, D_h, D_out).to(device)

# define the loss function
loss_fn = torch.nn.MSELoss(reduction='sum')

# training, use bigger learning rate
learning_rate = 1e-4
for t in range(500):
    # forward inference
    y_pred = model(x)
    # calculate the loss
    loss = loss_fn(y_pred, y)
    print(t, loss.item())
    # back-propagation
    model.zero_grad()
    loss.backward()
    # update the model parameters(weights)
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 1163.261962890625
1 1077.5340576171875
2 1005.70556640625
3 943.58447265625
4 888.6766357421875
5 838.9414672851562
6 793.19921875
7 750.942626953125
8 711.2891845703125
9 673.778076171875
10 638.21435546875
11 604.2950439453125
12 571.9231567382812
13 540.854248046875
14 511.00775146484375
15 482.36749267578125
16 454.7848815917969
17 428.347412109375
18 402.93359375
19 378.5995178222656
20 355.3414001464844
21 333.22320556640625
22 312.112060546875
23 291.99652099609375
24 272.922119140625
25 254.904052734375
26 237.93966674804688
27 221.96177673339844
28 206.92666625976562
29 192.80487060546875
30 179.54898071289062
31 167.16567993164062
32 155.61251831054688
33 144.82412719726562
34 134.79347229003906
35 125.4600830078125
36 116.77212524414062
37 108.69270324707031
38 101.19742584228516
39 94.22679138183594
40 87.76121520996094
41 81.7593002319336
42 76.1905746459961
43 71.01890563964844
44 66.228271484375
45 61.783023834228516
46 57.63160705566406
47 53.77770233154297
48 50.2071