# 热身：用numpy实现两层神经网络

In [1]:
import numpy as np
N, D_in, H, D_out = 64, 1000, 100, 10

随机生成数据

In [2]:
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for it in range(500):
    # Foward pass
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # compute loss
    loss = np.square(y_pred - y).sum()
    print(it, loss)

    # Backward pass
    # compute the gradient

    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # update w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 26731524.462978467
1 24008421.192501403
2 26441405.09657031
3 30525690.43269492
4 32490758.93088705
5 28810408.819717217
6 20428581.987930283
7 11694125.239634808
8 5955409.816668361
9 3022198.7359937434
10 1696704.66526286
11 1095159.2093514712
12 799539.0729277269
13 633628.0528335985
14 526394.6427945253
15 448655.873819664
16 387997.2493008209
17 338601.7307587244
18 297390.55779964157
19 262494.66466593405
20 232612.05009738117
21 206883.53899583424
22 184646.181952895
23 165295.33910023258
24 148369.61470770813
25 133510.51040626626
26 120418.73396530075
27 108848.57831753294
28 98590.6500335294
29 89471.14110249619
30 81342.79377200868
31 74077.54221263199
32 67593.5676973424
33 61776.79202748086
34 56544.089947275614
35 51823.4933782132
36 47557.61240848467
37 43695.9277521845
38 40196.88891903272
39 37017.12608460803
40 34124.22179312024
41 31488.86224741836
42 29084.61457340001
43 26888.35733666576
44 24883.63487798554
45 23047.781715870544
46 21364.75852540804
47 19819.497

## Pytorch实现神经网络

In [4]:
import torch

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6
for it in range(500):
    # Forward pass
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # compute loss
    loss = (y_pred - y).pow(2).sum().item()
    print(it, loss)

    # Backward pass
    # compute the gradient
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # update w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2



033818460.0
134976420.0
242059200.0
345955408.0
438873100.0
523168220.0
610323332.0
74205334.0
82015491.625
91237400.375
10908700.875
11728858.5625
12607653.875
13516028.3125
14442962.25
15382928.125
16332958.0
17290994.40625
18255472.59375
19225215.15625
20199271.390625
21176915.046875
22157548.34375
23140726.453125
24126037.890625
25113155.265625
26101820.3203125
2791834.734375
2883005.4140625
2975164.4609375
3068185.7109375
3161959.1875
3256397.67578125
3351428.8515625
3446969.33203125
3542967.4296875
3639363.1015625
3736110.2109375
3833163.328125
3930491.076171875
4028066.6875
4125862.990234375
4223855.7890625
4322025.27734375
4420353.111328125
4518824.06640625
4617424.203125
4716141.56640625
4814964.6884765625
4913883.654296875
5012889.9931640625
5111975.43359375
5211133.1201171875
5310356.814453125
549640.787109375
558979.2626953125
568367.8720703125
577802.3681640625
587279.0185546875
596795.40576171875
606346.31103515625
615929.71826171875
625543.09033203125
635184.13525390625


autograd

In [5]:
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)

y = w * x + b
y.backward()

print(w.grad)
print(x.grad)
print(b.grad)

tensor(1.)
tensor(2.)
tensor(1.)


In [22]:
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for it in range(500):
    # Forward pass
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # compute loss
    loss = (y_pred - y).pow(2).sum()
    print(it, loss.item())

    # Backward pass
    loss.backward()

    # print(w1.grad)
    # update w1 and w2
    with torch.no_grad():
        w1 = w1 - learning_rate * w1.grad
        w2 = w2 - learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

030830464.0


AttributeError: 'NoneType' object has no attribute 'zero_'

## Pytorch的nn

In [28]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),
)

loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-3

for it in range(500):
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    model.zero_grad()
    loss.backward()

    for param in model.parameters():
        param = param - learning_rate * param.grad



0663.044677734375
1663.044677734375
2663.044677734375
3663.044677734375
4663.044677734375
5663.044677734375
6663.044677734375
7663.044677734375
8663.044677734375
9663.044677734375
10663.044677734375
11663.044677734375
12663.044677734375
13663.044677734375
14663.044677734375
15663.044677734375
16663.044677734375
17663.044677734375
18663.044677734375
19663.044677734375
20663.044677734375
21663.044677734375
22663.044677734375
23663.044677734375
24663.044677734375
25663.044677734375
26663.044677734375
27663.044677734375
28663.044677734375
29663.044677734375
30663.044677734375
31663.044677734375
32663.044677734375
33663.044677734375
34663.044677734375
35663.044677734375
36663.044677734375
37663.044677734375
38663.044677734375
39663.044677734375
40663.044677734375
41663.044677734375
42663.044677734375
43663.044677734375
44663.044677734375
45663.044677734375
46663.044677734375
47663.044677734375
48663.044677734375
49663.044677734375
50663.044677734375
51663.044677734375
52663.044677734375
536

In [29]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),
)

loss_fn = nn.MSELoss(reduction='sum')

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
learning_rate = 1e-3

for it in range(500):
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    optimizer.zero_grad()
    loss.backward()

    optimizer.step()

0690.0714721679688
1537.9089965820312
2425.00787353515625
3341.126220703125
4275.0350341796875
5221.705810546875
6177.60125732421875
7140.99127197265625
8111.24031066894531
987.48314666748047
1068.94189453125
1154.993988037109375
1244.827510833740234
1337.66499328613281
1432.778465270996094
1529.452266693115234
1627.125776290893555
1725.342111587524414
1823.757286071777344
1922.179317474365234
2020.503042221069336
2118.69511604309082
2216.788616180419922
2314.873051643371582
2413.025961875915527
2511.331676483154297
269.833691596984863
278.563224792480469
287.495209693908691
296.607138156890869
305.8777618408203125
315.2862653732299805
324.8056488037109375
334.411730766296387
344.087886333465576
353.7976183891296387
363.520390510559082
373.242380142211914
382.963716745376587
392.6917827129364014
402.4367969036102295
412.206416606903076
422.0033175945281982
431.8244750499725342
441.6645753383636475
451.5134239196777344
461.3648617267608643
471.216603398323059
481.0712846517562866
490.93

In [31]:
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

class TwoLayerNet(nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H, bias=False)
        self.linear2 = torch.nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        y_pred = self.linear2(self.linear1(x).clamp(min=0))
        return y_pred

model = TwoLayerNet(D_in, H, D_out)

loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-4

for it in range(500):
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    model.zero_grad()
    loss.backward()

    for param in model.parameters():
        param = param - learning_rate * param.grad

0677.0934448242188
1677.0934448242188
2677.0934448242188
3677.0934448242188
4677.0934448242188
5677.0934448242188
6677.0934448242188
7677.0934448242188
8677.0934448242188
9677.0934448242188
10677.0934448242188
11677.0934448242188
12677.0934448242188
13677.0934448242188
14677.0934448242188
15677.0934448242188
16677.0934448242188
17677.0934448242188
18677.0934448242188
19677.0934448242188
20677.0934448242188
21677.0934448242188
22677.0934448242188
23677.0934448242188
24677.0934448242188
25677.0934448242188
26677.0934448242188
27677.0934448242188
28677.0934448242188
29677.0934448242188
30677.0934448242188
31677.0934448242188
32677.0934448242188
33677.0934448242188
34677.0934448242188
35677.0934448242188
36677.0934448242188
37677.0934448242188
38677.0934448242188
39677.0934448242188
40677.0934448242188
41677.0934448242188
42677.0934448242188
43677.0934448242188
44677.0934448242188
45677.0934448242188
46677.0934448242188
47677.0934448242188
48677.0934448242188
49677.0934448242188
50677.0934