# Torch Study
## Two main features of PyTorch
 - Tensor: similar to numpy.array but can run on GPUs
 - Autograd: automatic differentiation for all operations on Tensors

## Materials
 - [PyTorch Documentation](https://pytorch.org/docs/stable/index.html)
 - [PyTorch Examples](https://github.com/jcjohnson/pytorch-examples)

## Examples
 - 3 Layers Neural Network
   - Input Layer: 1000 neurons
   - Hidden Layer: 100 neurons
      - ReLU Activation Function
   - Output Layer: 10 neurons
- Training Data: 100 samples
   - Learning Rate: 1e-6
   - Training Iterations: 500

In [1]:
# Numpy Version
import numpy as np

N, D_in, D_h, D_out = 100, 1000, 100, 1

# generate the training data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# init the weights
w1 = np.random.randn(D_in, D_h)
w2 = np.random.randn(D_h, D_out)

learning_rate = 1e-6

for t in range(500):
    # forward inference
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    # calculate the loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)
    # back-propagation
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    # update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 2809927.4984153016
1 2895192.164057741
2 9195337.62595426
3 52672674.94903434
4 305856609.5553557
5 1045698768.0498496
6 199561235.26916894
7 25670508.36144746
8 14668513.15366512
9 9938381.689097475
10 7305750.080500455
11 5639376.168415225
12 4516688.973984517
13 3713291.893420212
14 3109781.9489851957
15 2646019.473735994
16 2279626.8021711316
17 1983933.162397087
18 1742160.4876639124
19 1542133.7768047203
20 1374561.2308388902
21 1233790.2257218708
22 1113270.240378296
23 1008865.8361852948
24 918241.0627890414
25 840077.572144227
26 771346.8382308907
27 710486.5453011404
28 656157.9888915124
29 607437.4684917162
30 563553.4739288153
31 523936.1619861584
32 487896.0999278911
33 455015.8810166172
34 424950.28932338505
35 397828.61352782807
36 373114.01286175643
37 350586.7383271787
38 330076.21784706844
39 311372.4925737009
40 294105.3357829785
41 278128.95856948197
42 263351.43019166414
43 249582.38174369294
44 236842.7945321482
45 225021.53143765056
46 214055.88220353192
47 203

In [2]:
# Tensor Version
import torch

# cpu or cuda
device = torch.device('cpu')

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate the training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights
w1 = torch.randn(D_in, D_h, device=device)
w2 = torch.randn(D_h, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # forward inference
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    # calculate the loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())
    # back-propagation
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    # update the weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 39354416.0
1 69728016.0
2 301565664.0
3 923355520.0
4 183074496.0
5 25839176.0
6 15219064.0
7 10511831.0
8 7848824.5
9 6157910.0
10 4996657.5
11 4156885.0
12 3524331.25
13 3033465.75
14 2642229.0
15 2325109.5
16 2063716.25
17 1845775.125
18 1661550.0
19 1503752.5
20 1367599.0
21 1249529.0
22 1146554.625
23 1056163.375
24 976136.3125
25 904936.25
26 841283.3125
27 784326.25
28 733280.8125
29 687390.0
30 645892.625
31 608086.9375
32 573518.625
33 541881.875
34 512836.5625
35 486059.75
36 461372.15625
37 438537.875
38 417399.1875
39 397815.375
40 379584.1875
41 362584.40625
42 346714.1875
43 331869.0
44 317965.6875
45 304922.25
46 292671.0
47 281178.21875
48 270373.125
49 260199.8125
50 250595.40625
51 241511.3125
52 232917.46875
53 224778.8125
54 217060.65625
55 209739.65625
56 202787.640625
57 196178.828125
58 189892.375
59 183890.234375
60 178138.84375
61 172657.3125
62 167426.96875
63 162432.53125
64 157664.234375
65 153113.453125
66 148768.953125
67 144601.5
68 140622.46875
69 1368

In [3]:
# Autograd Version
import torch

# cpu or cuda
device = torch.device('cpu')

N, D_in, D_h, D_out = 100, 1000, 100, 10

# generate training data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# init the weights
w1 = torch.randn(D_in, D_h, device=device, requires_grad=True)
w2 = torch.randn(D_h, D_out, device=device, requires_grad=True)

# training
learning_rate = 1e-6
for t in range(500):
    # forward inference
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    # calculate the loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss)
    # use autograd to do the back-propagation
    loss.backward()
    # update the weights, prevent torch do autograd for this part
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # zero grads
        w1.grad.zero_()
        w2.grad.zero_()

0 tensor(38188340., grad_fn=<SumBackward0>)
1 tensor(49341676., grad_fn=<SumBackward0>)
2 tensor(1.6171e+08, grad_fn=<SumBackward0>)
3 tensor(5.3576e+08, grad_fn=<SumBackward0>)
4 tensor(4.8901e+08, grad_fn=<SumBackward0>)
5 tensor(2362256.5000, grad_fn=<SumBackward0>)
6 tensor(2021672.2500, grad_fn=<SumBackward0>)
7 tensor(1762249.7500, grad_fn=<SumBackward0>)
8 tensor(1557750., grad_fn=<SumBackward0>)
9 tensor(1390408., grad_fn=<SumBackward0>)
10 tensor(1250695.7500, grad_fn=<SumBackward0>)
11 tensor(1132055.5000, grad_fn=<SumBackward0>)
12 tensor(1029826.6250, grad_fn=<SumBackward0>)
13 tensor(940923.1875, grad_fn=<SumBackward0>)
14 tensor(862830.5000, grad_fn=<SumBackward0>)
15 tensor(793863.1250, grad_fn=<SumBackward0>)
16 tensor(732542.1875, grad_fn=<SumBackward0>)
17 tensor(677760., grad_fn=<SumBackward0>)
18 tensor(628659.5625, grad_fn=<SumBackward0>)
19 tensor(584430., grad_fn=<SumBackward0>)
20 tensor(544439.7500, grad_fn=<SumBackward0>)
21 tensor(508163.7812, grad_fn=<SumBac