### Warm-up: numpy

In [1]:
# Code in file tensor/two_layer_net_numpy.py
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed NumPy's global generator so that re-running this cell on a fresh
# kernel draws the same data and the same initial weights every time.
SEED = 0
np.random.seed(SEED)

# Create random input and output data
x = np.random.randn(N, D_in)   # inputs, shape (N, D_in)
y = np.random.randn(N, D_out)  # targets, shape (N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)   # input -> hidden, shape (D_in, H)
w2 = np.random.randn(H, D_out)  # hidden -> output, shape (H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)                # hidden pre-activation
    h_relu = np.maximum(h, 0)    # ReLU: negative entries become 0
    y_pred = h_relu.dot(w2)

    # Compute and print loss (sum of squared errors over all entries)
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of the loss with respect to w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)        # derivative of the squared error
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    # Copy before masking so grad_h_relu itself is left untouched.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0                       # no gradient flows where h was negative
    grad_w1 = x.T.dot(grad_h)

    # Update weights with one plain gradient-descent step
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 35085673.5744
1 29152320.9546
2 25719684.9325
3 21270186.6668
4 15828714.7687
5 10572329.3388
6 6660169.42016
7 4150003.44604
8 2689480.4031
9 1852591.94893
10 1362372.1164
11 1057579.53626
12 854921.702839
13 710857.047815
14 602503.819072
15 517260.634328
16 448319.045746
17 391324.169916
18 343486.084949
19 302922.148328
20 268182.948197
21 238225.751713
22 212271.752935
23 189697.833477
24 169976.02322
25 152683.956417
26 137453.510157
27 123994.015395
28 112067.220889
29 101474.947068
30 92042.8291108
31 83628.9427811
32 76110.6034995
33 69369.4931511
34 63318.9071856
35 57870.2845482
36 52956.6638644
37 48522.3325789
38 44508.474197
39 40868.8768494
40 37564.6362527
41 34562.134672
42 31830.8405168
43 29343.3164225
44 27074.5845803
45 25001.5573751
46 23105.9826738
47 21370.7079083
48 19780.0159832
49 18320.6785824
50 16980.6690085
51 15748.8715034
52 14615.9564306
53 13572.6996569
54 12611.3042657
55 11724.9292348
56 10906.865741
57 10151.9246323
58 9453.95090619
59 8808.66967

### PyTorch: Tensors

In [2]:
# Code in file tensor/two_layer_net_tensor.py
import torch

# Pick the tensor type at run time: the CUDA float tensor when a GPU is
# usable, otherwise the CPU float tensor. This lets the cell run unchanged
# on machines without CUDA instead of failing on the first .type(dtype).
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

# Seed PyTorch's generator so that re-running this cell draws the same
# data and the same initial weights.
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in).type(dtype)   # inputs, shape (N, D_in)
y = torch.randn(N, D_out).type(dtype)  # targets, shape (N, D_out)

# Randomly initialize weights
w1 = torch.randn(D_in, H).type(dtype)   # input -> hidden, shape (D_in, H)
w2 = torch.randn(H, D_out).type(dtype)  # hidden -> output, shape (H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)                 # hidden pre-activation
    h_relu = h.clamp(min=0)      # ReLU: negative entries become 0
    y_pred = h_relu.mm(w2)

    # Compute and print loss (sum of squared errors over all entries).
    # float(...) prints a plain number whether .sum() returned a Python
    # scalar or a zero-dimensional tensor.
    loss = (y_pred - y).pow(2).sum()
    print(t, float(loss))

    # Backprop to compute gradients of the loss with respect to w1 and w2
    grad_y_pred = 2.0 * (y_pred - y)        # derivative of the squared error
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # Clone before masking so grad_h_relu itself is left untouched.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0                       # no gradient flows where h was negative
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 32716396.342311546
1 33360728.37036922
2 40313190.379545376
3 46498612.410452895
4 42691067.608481586
5 28236465.801740695
6 13367051.088867143
7 5492353.694997198
8 2531653.7522917725
9 1506598.3302157093
10 1089902.4753683647
11 868219.6526528259
12 719797.4998409636
13 607482.8406376198
14 517734.0845698322
15 444334.61918703746
16 383590.55065242946
17 332825.2342837356
18 290119.7129699178
19 254003.2341007283
20 223318.07722719293
21 197063.88268792751
22 174471.12528328493
23 154953.85760968237
24 138038.02391760668
25 123305.95853644265
26 110422.08348418103
27 99129.00378606629
28 89195.74946903056
29 80439.36936812001
30 72705.02924448234
31 65840.98598021711
32 59741.91318577483
33 54306.8127211347
34 49461.029423600354
35 45117.94018791103
36 41217.60180261389
37 37708.174874183824
38 34544.1324386806
39 31686.659149767554
40 29101.52694210039
41 26758.944002967968
42 24633.641337979232
43 22701.16833872527
44 20940.983677557604
45 19335.83955202018
46 17870.189361900848


479 5.0801539514422345e-05
480 5.0097543726479454e-05
481 4.905280934180967e-05
482 4.849265099365896e-05
483 4.77289010404891e-05
484 4.680417489189015e-05
485 4.631166259227992e-05
486 4.545718589882286e-05
487 4.475031964956688e-05
488 4.43065281512528e-05
489 4.358540770416086e-05
490 4.280942379911168e-05
491 4.225016960880679e-05
492 4.170284934834623e-05
493 4.106665512437768e-05
494 4.065601244787609e-05
495 4.0273195601139844e-05
496 3.96670426549145e-05
497 3.899766866367893e-05
498 3.8494727682913954e-05
499 3.803224431286889e-05
