We will use a fully-connected ReLU network as our running example. The network will have a single hidden layer, and will be trained with gradient descent to fit random data by minimizing the Euclidean distance between the network output and the true output.

# Using numpy


In [1]:
import numpy as np

`N` is batch size <br>
`D_in` is input dimension <br>
`H` is hidden dimension <br>
`D_out` is output dimension

In [2]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [3]:
# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [4]:
# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [5]:
print(f'Shapes\n x = {x.shape}\n y = {y.shape}\n w1 = {w1.shape}\n w2 = {w2.shape}')

Shapes
 x = (64, 1000)
 y = (64, 10)
 w1 = (1000, 100)
 w2 = (100, 10)


In [6]:
# initialize learning rate
learning_rate = 1e-6

In [7]:
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)
    
    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)
    
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 30322437.05872044
1 27195902.61655095
2 27431147.102161624
3 26907588.089192525
4 23487251.911045786
5 17370302.346208066
6 11113889.805449737
7 6441166.422343137
8 3686053.5100525995
9 2209027.4572868785
10 1445706.430116514
11 1033880.3202904585
12 795634.5745740973
13 644210.3782577271
14 539010.1202979372
15 460498.68496492587
16 398809.9231058834
17 348562.19938656234
18 306687.949365409
19 271234.7944467956
20 240917.91932071364
21 214752.15001697085
22 192040.90590784128
23 172247.93360554264
24 154931.08160505304
25 139688.960343489
26 126222.70300593541
27 114298.01113966902
28 103718.3122026331
29 94293.81173106765
30 85871.75778641895
31 78329.10678567778
32 71554.07391972185
33 65457.85460422786
34 59965.41580059127
35 55004.90096246543
36 50515.06238811141
37 46446.307703189545
38 42751.97487546872
39 39390.10791121876
40 36331.86183184168
41 33541.69872026457
42 30993.088242077996
43 28661.819718167877
44 26526.526045303275
45 24569.365325404942
46 22773.469848903733
47

453 0.000538758450100964
454 0.0005204788752116309
455 0.0005028216423608956
456 0.0004857654175927583
457 0.0004692896984830314
458 0.0004533748121647753
459 0.00043800117626280013
460 0.000423150543474296
461 0.00040880502260740807
462 0.0003949472892893777
463 0.0003815609263998911
464 0.000368629533773519
465 0.0003561377512522394
466 0.0003440705688546924
467 0.00033241351006532374
468 0.0003211525942404148
469 0.00031027426016992296
470 0.00029976545040757887
471 0.00028961359928637593
472 0.00027980649241413584
473 0.0002703324454510162
474 0.0002611800804231428
475 0.0002523384741813029
476 0.00024379691617630058
477 0.00023554535041542897
478 0.0002275739178296034
479 0.00021987286074120498
480 0.0002124330927107682
481 0.00020524566682143198
482 0.00019830211028137352
483 0.00019159401680797768
484 0.00018511343446233285
485 0.0001788526109207211
486 0.00017280410511342635
487 0.00016696061364721693
488 0.0001613151753486343
489 0.0001558611251900891
490 0.0001505919083322217

# Using PyTorch

We use PyTorch in place of numpy because numpy cannot utilize GPU.

In [8]:
import torch

In [9]:
dtype = torch.float
device = torch.device("cuda:0")

`N` is batch size <br>
`D_in` is input dimension <br>
`H` is hidden dimension <br>
`D_out` is output dimension

In [10]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [11]:
# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [12]:
# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

In [13]:
# Initialize the learning rate
learning_rate = 1e-6

In [14]:
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    
    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)
    
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33405724.0
1 26495738.0
2 21502302.0
3 16441409.0
4 11684807.0
5 7810682.0
6 5126099.5
7 3412334.75
8 2368591.0
9 1727054.75
10 1319417.0
11 1047116.25
12 855030.125
13 712599.25
14 602697.1875
15 515263.78125
16 444162.0
17 385490.59375
18 336425.21875
19 295022.1875
20 259827.859375
21 229661.765625
22 203697.03125
23 181230.609375
24 161692.515625
25 144624.84375
26 129704.359375
27 116602.4765625
28 105053.5859375
29 94837.359375
30 85777.9296875
31 77724.765625
32 70546.5390625
33 64130.71875
34 58389.4765625
35 53239.95703125
36 48609.57421875
37 44439.23046875
38 40674.68359375
39 37270.19140625
40 34187.3671875
41 31391.828125
42 28854.591796875
43 26547.6171875
44 24446.03515625
45 22529.02734375
46 20778.947265625
47 19179.466796875
48 17715.625
49 16375.072265625
50 15147.2021484375
51 14021.125
52 12986.771484375
53 12036.18359375
54 11163.7158203125
55 10361.546875
56 9622.2861328125
57 8940.3720703125
58 8310.9443359375
59 7729.71484375
60 7192.4833984375
61 6695.591308

431 9.6113835752476e-05
432 9.450370271224529e-05
433 9.243766544386744e-05
434 9.086668433155864e-05
435 8.892547339200974e-05
436 8.712476846994832e-05
437 8.550543134333566e-05
438 8.378117490792647e-05
439 8.216901187552139e-05
440 8.063505083555356e-05
441 7.915173046058044e-05
442 7.775933772791177e-05
443 7.629372703377157e-05
444 7.491376891266555e-05
445 7.383703632513061e-05
446 7.230255141621456e-05
447 7.071552681736648e-05
448 6.972565461182967e-05
449 6.857030530227348e-05
450 6.737429794156924e-05
451 6.643834058195353e-05
452 6.50984002277255e-05
453 6.393512740032747e-05
454 6.299687811406329e-05
455 6.197483162395656e-05
456 6.107919762143865e-05
457 5.987777331029065e-05
458 5.9141158999409527e-05
459 5.803651947644539e-05
460 5.722507194150239e-05
461 5.6168457376770675e-05
462 5.523453728528693e-05
463 5.465642607305199e-05
464 5.362771116779186e-05
465 5.267601954983547e-05
466 5.2013863751199096e-05
467 5.1179413276258856e-05
468 5.03247429151088e-05
469 4.967510

# Autograd

In [20]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

In [21]:
# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do
# not need to compute gradients wrt to these Tensors
# during the backward pass.

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [22]:
# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want
# to compute gradients with resoect to these Tensors
# during the backward pass.

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [23]:
# Initialize the learning rate
learning_rate = 1e-6

In [24]:
for t in range(500):
    # Forward pass compute predicted y using operations
    # on Tensors; these are exactly the same operations
    # we used to compute the forward pass using Tensor, 
    # but we do not need to keep references to intermediate
    # values since we are not implementing the backward pass
    # by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,) loss.item() gets
    # the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

0 26620752.0
1 26620752.0
2 26620752.0
3 26620752.0
4 26620752.0
5 26620752.0
6 26620752.0
7 26620752.0
8 26620752.0
9 26620752.0
10 26620752.0
11 26620752.0
12 26620752.0
13 26620752.0
14 26620752.0
15 26620752.0
16 26620752.0
17 26620752.0
18 26620752.0
19 26620752.0
20 26620752.0
21 26620752.0
22 26620752.0
23 26620752.0
24 26620752.0
25 26620752.0
26 26620752.0
27 26620752.0
28 26620752.0
29 26620752.0
30 26620752.0
31 26620752.0
32 26620752.0
33 26620752.0
34 26620752.0
35 26620752.0
36 26620752.0
37 26620752.0
38 26620752.0
39 26620752.0
40 26620752.0
41 26620752.0
42 26620752.0
43 26620752.0
44 26620752.0
45 26620752.0
46 26620752.0
47 26620752.0
48 26620752.0
49 26620752.0
50 26620752.0
51 26620752.0
52 26620752.0
53 26620752.0
54 26620752.0
55 26620752.0
56 26620752.0
57 26620752.0
58 26620752.0
59 26620752.0
60 26620752.0
61 26620752.0
62 26620752.0
63 26620752.0
64 26620752.0
65 26620752.0
66 26620752.0
67 26620752.0
68 26620752.0
69 26620752.0
70 26620752.0
71 26620752.0
72

In [25]:
for t in range(500):
    # Forward pass compute predicted y using operations
    # on Tensors; these are exactly the same operations
    # we used to compute the forward pass using Tensor, 
    # but we do not need to keep references to intermediate
    # values since we are not implementing the backward pass
    # by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,) loss.item() gets
    # the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

# Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()


0 26620752.0
1 405656775426048.0
2 405656775426048.0
3 405656775426048.0
4 405656775426048.0
5 405656775426048.0
6 405656775426048.0
7 405656775426048.0
8 405656775426048.0
9 405656775426048.0
10 405656775426048.0
11 405656775426048.0
12 405656775426048.0
13 405656775426048.0
14 405656775426048.0
15 405656775426048.0
16 405656775426048.0
17 405656775426048.0
18 405656775426048.0
19 405656775426048.0
20 405656775426048.0
21 405656775426048.0
22 405656775426048.0
23 405656775426048.0
24 405656775426048.0
25 405656775426048.0
26 405656775426048.0
27 405656775426048.0
28 405656775426048.0
29 405656775426048.0
30 405656775426048.0
31 405656775426048.0
32 405656775426048.0
33 405656775426048.0
34 405656775426048.0
35 405656775426048.0
36 405656775426048.0
37 405656775426048.0
38 405656775426048.0
39 405656775426048.0
40 405656775426048.0
41 405656775426048.0
42 405656775426048.0
43 405656775426048.0
44 405656775426048.0
45 405656775426048.0
46 405656775426048.0
47 405656775426048.0
48 405656

386 405656775426048.0
387 405656775426048.0
388 405656775426048.0
389 405656775426048.0
390 405656775426048.0
391 405656775426048.0
392 405656775426048.0
393 405656775426048.0
394 405656775426048.0
395 405656775426048.0
396 405656775426048.0
397 405656775426048.0
398 405656775426048.0
399 405656775426048.0
400 405656775426048.0
401 405656775426048.0
402 405656775426048.0
403 405656775426048.0
404 405656775426048.0
405 405656775426048.0
406 405656775426048.0
407 405656775426048.0
408 405656775426048.0
409 405656775426048.0
410 405656775426048.0
411 405656775426048.0
412 405656775426048.0
413 405656775426048.0
414 405656775426048.0
415 405656775426048.0
416 405656775426048.0
417 405656775426048.0
418 405656775426048.0
419 405656775426048.0
420 405656775426048.0
421 405656775426048.0
422 405656775426048.0
423 405656775426048.0
424 405656775426048.0
425 405656775426048.0
426 405656775426048.0
427 405656775426048.0
428 405656775426048.0
429 405656775426048.0
430 405656775426048.0
431 405656