## numpy版のNN

In [6]:
import numpy as np

# Two-layer fully connected network (linear -> ReLU -> linear) trained with
# hand-written gradients in plain NumPy.
# N: batch size, D_in: input dim, H: hidden dim, D_out: output dim
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs/targets followed by random initial weights.
x = np.random.randn(N, D_in)   # (64, 1000)
y = np.random.randn(N, D_out)  # (64, 10)
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass.
    h = np.dot(x, w1)                # (64, 1000) x (1000, 100) -> (64, 100)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)      # (64, 100) x (100, 10) -> (64, 10)

    # Sum-of-squares loss.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: chain rule applied layer by layer, output to input.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step (in place).
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 32757066.0214
1 31592742.8799
2 35457112.0985
3 37779330.1688
4 33351651.0168
5 22702054.3509
6 12124512.7044
7 5724138.93169
8 2827098.35565
9 1637826.53972
10 1123090.87116
11 862480.624867
12 703663.47134
13 591248.819214
14 504331.184857
15 434005.94821
16 375807.802794
17 326875.918559
18 285497.953608
19 250246.140994
20 220067.870583
21 194112.992556
22 171676.117389
23 152240.021369
24 135363.982324
25 120628.157731
26 107723.493533
27 96385.6866235
28 86400.3615529
29 77572.777417
30 69761.9243962
31 62826.9461829
32 56657.9767638
33 51161.7501471
34 46252.8366224
35 41863.9361727
36 37934.3624612
37 34410.2253135
38 31247.6314374
39 28402.9882934
40 25840.6186223
41 23530.4546728
42 21448.2540177
43 19566.1061876
44 17862.5104198
45 16319.7807051
46 14920.7084017
47 13650.9504462
48 12497.9545937
49 11449.5371161
50 10495.551557
51 9627.02646568
52 8835.49424324
53 8113.57177192
54 7454.72820115
55 6853.08242569
56 6303.23374228
57 5800.51633071
58 5340.52125251
59 4919.286

## PyTorch版のNN

- dot()はmm()
- Tはt()

In [30]:
import torch

# Same two-layer network as the NumPy cell, written with torch tensors and
# still using hand-derived gradients.
dtype = torch.FloatTensor
#dtype = torch.cuda.FloatTensor  # use this instead to run on the GPU

N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs/targets followed by random initial weights.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss)

    # Backward pass by hand.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU blocks the gradient where its input was negative
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step (in place).
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33519802.33498156
1 29872658.72228745
2 28232201.35337338
3 24703782.486548424
4 18743259.388961434
5 12301520.46972239
6 7339225.924230867
7 4289060.262792304
8 2623948.781733956
9 1738355.4788811116
10 1250364.4825149775
11 960000.0030733151
12 770809.2800763324
13 637018.3387828094
14 536106.0812650071
15 456549.16201951914
16 392037.97327633994
17 338774.421579702
18 294280.9317381763
19 256822.27168787987
20 225008.64031713532
21 197854.1979252888
22 174563.00484699442
23 154477.65188670252
24 137110.3964615895
25 122028.55731431302
26 108908.6724899185
27 97435.66606277274
28 87368.50717904617
29 78514.50379502052
30 70700.82946323616
31 63785.81109264992
32 57654.10851306887
33 52201.84758821846
34 47346.202351426065
35 43011.421975365025
36 39135.1979157662
37 35668.22037300737
38 32557.61342888174
39 29760.179726280614
40 27238.576571210047
41 24961.97025868032
42 22901.39659521862
43 21034.524166789844
44 19341.120538000076
45 17803.695989330685
46 16405.221539200844
47 151

In [20]:
# Matrix product demo: the function form torch.mm(a, b) and the method form
# a.mm(b) are printed one after the other; (2, 3) x (3, 3) -> (2, 3).
mat1 = torch.randn(2, 3)
mat2 = torch.randn(3, 3)
for product in (torch.mm(mat1, mat2), mat1.mm(mat2)):
    print(product)


 1.3635  3.2792 -2.0203
 0.7865  1.0616 -0.5123
[torch.FloatTensor of size 2x3]


 1.3635  3.2792 -2.0203
 0.7865  1.0616 -0.5123
[torch.FloatTensor of size 2x3]



In [23]:
# clamp demo: print a random vector, then the same vector with every element
# limited to the range [-0.5, 0.5].
a = torch.randn(4)
print(a)
clamped = torch.clamp(a, min=-0.5, max=0.5)
print(clamped)


 1.3481
 0.5239
-0.4739
 0.1594
[torch.FloatTensor of size 4]


 0.5000
 0.5000
-0.4739
 0.1594
[torch.FloatTensor of size 4]



In [25]:
# pow demo: print a random vector, then its element-wise square.
a = torch.randn(4)
print(a)
squared = torch.pow(a, 2)
print(squared)


-1.2419
-1.1250
-0.3517
-1.1164
[torch.FloatTensor of size 4]


 1.5423
 1.2656
 0.1237
 1.2462
[torch.FloatTensor of size 4]



In [29]:
# Transpose demo: print a (2, 3) matrix, then its transpose obtained through
# the function form torch.t(a) and the method form a.t().
a = torch.randn(2, 3)
print(a)
for transposed in (torch.t(a), a.t()):
    print(transposed)


-1.8005  0.2771 -0.2920
-0.2676 -0.0230 -1.3677
[torch.FloatTensor of size 2x3]


-1.8005 -0.2676
 0.2771 -0.0230
-0.2920 -1.3677
[torch.FloatTensor of size 3x2]


-1.8005 -0.2676
 0.2771 -0.0230
-0.2920 -1.3677
[torch.FloatTensor of size 3x2]



## Autograd

- Tensorはnumpy.arrayに当たる
- TensorをくるんだVariableは計算グラフのノードに当たる
- `x`がVariableだとすると`x.data`がTensor、`x.grad`が勾配になる
- TensorのメソッドはほとんどVariableでも使える
- Tensorとの違いは勾配を自動計算できる点

In [66]:
import torch
from torch.autograd import Variable

# Two-layer network where autograd computes the gradients instead of the
# hand-written backward pass used in the earlier cells.
dtype = torch.FloatTensor
#dtype = torch.cuda.FloatTensor  # use this instead to run on the GPU

# N: batch size, D_in: input dim, H: hidden dim, D_out: output dim
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets do not need gradients.
# (Author's open question: set requires_grad=True on x when visualising a CNN?)
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# The weights are what we optimise, so gradients are requested for them.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum-of-squares loss; it holds exactly one value.
    loss = (y_pred - y).pow(2).sum()
    # .item() returns that value as a Python float. The previous loss.data[0]
    # breaks when sum() yields a 0-dim tensor, which cannot be indexed.
    print(t, loss.item())

    # Backward pass: fills .grad of every tensor created with
    # requires_grad=True, i.e. w1.grad and w2.grad.
    loss.backward()

    # Update through .data so the step itself is not recorded by autograd.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # backward() accumulates into .grad, so clear it after each update.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 37870032.0
1 34478608.0
2 32695228.0
3 27592806.0
4 19437664.0
5 11586074.0
6 6358191.5
7 3563654.25
8 2198189.25
9 1516179.0
10 1143097.875
11 912395.375
12 752971.0
13 633663.0
14 539798.625
15 463610.125
16 400742.5
17 348235.65625
18 304007.5
19 266465.09375
20 234415.390625
21 206922.6875
22 183230.328125
23 162741.234375
24 144937.5
25 129392.6328125
26 115781.6328125
27 103833.2265625
28 93326.7421875
29 84068.0234375
30 75869.8671875
31 68592.0234375
32 62121.3671875
33 56352.0234375
34 51195.7109375
35 46581.4765625
36 42440.4453125
37 38719.12890625
38 35367.265625
39 32345.12890625
40 29614.98828125
41 27145.8984375
42 24910.3203125
43 22882.93359375
44 21040.947265625
45 19366.619140625
46 17843.998046875
47 16456.990234375
48 15192.2353515625
49 14039.095703125
50 12987.9814453125
51 12026.560546875
52 11146.494140625
53 10339.19140625
54 9597.78515625
55 8916.2001953125
56 8289.037109375
57 7711.7705078125
58 7179.63671875
59 6689.1181640625
60 6235.869140625
61 5817.21

In [68]:
# Display the gradient stored on w2 after training. The training loop above
# calls w2.grad.data.zero_() at the end of every iteration, which is why the
# displayed values are all zero.
w2.grad

Variable containing:
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 100x10]

## New Function

In [70]:
import torch
from torch.autograd import Variable

class MyReLU(torch.autograd.Function):
    """ReLU implemented as a custom autograd function.

    Written in the static-method style: `forward` and `backward` receive a
    context object `ctx` and the function is invoked through `MyReLU.apply`.
    The older style (instance methods plus `MyReLU()(x)`) is rejected by
    current PyTorch releases.
    """

    @staticmethod
    def forward(ctx, input):
        """Return `input` with negative entries replaced by 0.

        The input is stored on `ctx` because `backward` needs its sign.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Map the loss gradient w.r.t. the output to the gradient w.r.t. the input.

        The gradient passes through unchanged, except that it is set to 0
        wherever the forward input was negative.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.FloatTensor

N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass with the custom ReLU between the two matrix products.
    y_pred = MyReLU.apply(x.mm(w1)).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    # loss holds one value; .item() returns it as a Python float.
    print(t, loss.item())
    loss.backward()
    # Update through .data so the step is not tracked by autograd.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    # Gradients accumulate across backward() calls; reset them.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 28923664.0
1 21498286.0
2 18640954.0
3 17113786.0
4 15503434.0
5 13195715.0
6 10424069.0
7 7645163.0
8 5322102.0
9 3596325.25
10 2423709.0
11 1659939.375
12 1172385.25
13 858418.6875
14 652287.375
15 512329.5625
16 413720.1875
17 341474.09375
18 286561.5625
19 243556.640625
20 209006.109375
21 180666.84375
22 157067.765625
23 137187.0625
24 120301.1015625
25 105850.234375
26 93404.109375
27 82638.6953125
28 73295.3984375
29 65162.32421875
30 58063.671875
31 51841.375
32 46374.22265625
33 41566.15625
34 37321.76171875
35 33570.828125
36 30258.173828125
37 27318.052734375
38 24701.7265625
39 22367.705078125
40 20283.693359375
41 18418.294921875
42 16745.623046875
43 15242.7880859375
44 13891.72265625
45 12675.8466796875
46 11579.302734375
47 10588.8564453125
48 9693.3740234375
49 8882.716796875
50 8147.6982421875
51 7480.39501953125
52 6874.16259765625
53 6323.63720703125
54 5822.29150390625
55 5365.216796875
56 4947.8427734375
57 4566.49072265625
58 4217.5107421875
59 3898.07006835937

## TensorFlow版

In [77]:
import tensorflow as tf
import numpy as np

# N: batch size, D_in: input dim, H: hidden dim, D_out: output dim
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders are filled with real data when the graph is executed.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Trainable weights, initialised from a normal distribution.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass. These lines only build graph nodes; nothing is computed
# until the graph is run inside a session below.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Gradient nodes of the loss with respect to both weight matrices.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# The weight update is itself part of the graph: evaluating new_w1 / new_w2
# performs the assignment.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Run the finished graph on concrete data.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)

    # Fetching new_w1 and new_w2 alongside the loss is what applies the update.
    fetches = [loss, new_w1, new_w2]
    feed = {x: x_value, y: y_value}

    for _ in range(500):
        loss_value = sess.run(fetches, feed_dict=feed)[0]
        print(loss_value)

3.12259e+07
2.45127e+07
2.12229e+07
1.85246e+07
1.5593e+07
1.21715e+07
8.81838e+06
6.06698e+06
4.07357e+06
2.75875e+06
1.91932e+06
1.38465e+06
1.03923e+06
810277.0
652668.0
539671.0
455276.0
389974.0
337956.0
295558.0
260221.0
230367.0
204837.0
182813.0
163670.0
146935.0
132244.0
119298.0
107845.0
97687.2
88647.5
80577.3
73363.4
66892.4
61076.4
55837.9
51115.2
46850.1
42996.8
39503.2
36334.0
33452.2
30829.5
28439.1
26259.9
24267.5
22446.4
20777.2
19247.7
17847.8
16561.9
15379.7
14292.3
13290.8
12368.0
11517.2
10731.7
10005.9
9334.87
8713.51
8137.94
7604.45
7109.44
6649.92
6223.16
5826.59
5457.7
5114.57
4795.11
4497.48
4219.99
3961.19
3719.85
3494.45
3283.91
3087.24
2903.42
2731.38
2570.43
2419.7
2278.57
2146.35
2022.38
1906.18
1797.21
1694.92
1598.91
1508.76
1424.05
1344.43
1269.61
1199.28
1133.1
1070.82
1012.21
957.044
905.087
856.149
810.006
766.531
725.558
686.908
650.451
616.076
583.614
552.974
524.041
496.714
470.906
446.53
423.478
401.689
381.113
361.664
343.269
325.868
309.398
2

## nnパッケージ

In [82]:
import torch
from torch.autograd import Variable

# Two-layer network assembled from torch.nn building blocks, with a manual
# gradient-descent step over model.parameters().
# N: batch size, D_in: input dim, H: hidden dim, D_out: output dim
N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
for t in range(500):
    # Forward pass.
    y_pred = model(x)

    # Compute the loss. It holds one value; .item() returns it as a Python
    # float (loss.data[0] fails when the loss is a 0-dim tensor).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients before backward().
    model.zero_grad()

    # Backward pass.
    loss.backward()

    # Update the weights through .data so the step is not tracked by autograd.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 675.65478515625
1 625.3120727539062
2 581.5741577148438
3 543.2457275390625
4 509.56689453125
5 479.6416931152344
6 452.5718994140625
7 427.7896728515625
8 404.82525634765625
9 383.4107971191406
10 363.2756652832031
11 344.36639404296875
12 326.5947265625
13 309.7552795410156
14 293.7693786621094
15 278.57525634765625
16 264.117919921875
17 250.3040008544922
18 237.11444091796875
19 224.51162719726562
20 212.4835205078125
21 200.97837829589844
22 190.0261993408203
23 179.53216552734375
24 169.5443115234375
25 159.99258422851562
26 150.90789794921875
27 142.3028564453125
28 134.12548828125
29 126.3597183227539
30 118.99972534179688
31 112.01719665527344
32 105.41255187988281
33 99.15910339355469
34 93.26506042480469
35 87.70309448242188
36 82.46214294433594
37 77.52316284179688
38 72.86669921875
39 68.46904754638672
40 64.32349395751953
41 60.43075180053711
42 56.78474044799805
43 53.35789489746094
44 50.141658782958984
45 47.12574005126953
46 44.28652572631836
47 41.62451171875
48 39

## Optimizer

In [83]:
import torch
from torch.autograd import Variable

# Same torch.nn model as before, but the parameter update is delegated to a
# torch.optim optimizer (Adam) instead of being written by hand.
# N: batch size, D_in: input dim, H: hidden dim, D_out: output dim
N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

loss_fn = torch.nn.MSELoss(size_average=False)

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass.
    y_pred = model(x)

    # Compute the loss. It holds one value; .item() returns it as a Python
    # float (loss.data[0] fails when the loss is a 0-dim tensor).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients before backward().
    optimizer.zero_grad()

    # Backward pass.
    loss.backward()

    # Let the optimizer update the weights.
    optimizer.step()

0 630.1945190429688
1 613.5613403320312
2 597.468505859375
3 581.8590698242188
4 566.6857299804688
5 551.9625244140625
6 537.625244140625
7 523.65478515625
8 510.0739440917969
9 496.94525146484375
10 484.1810302734375
11 471.8095397949219
12 459.8274230957031
13 448.1696472167969
14 436.78521728515625
15 425.6701354980469
16 414.8320617675781
17 404.3284606933594
18 394.0894470214844
19 384.1022644042969
20 374.3945007324219
21 364.9898986816406
22 355.8160400390625
23 346.88946533203125
24 338.2266845703125
25 329.8339538574219
26 321.614501953125
27 313.6141357421875
28 305.8409423828125
29 298.2760925292969
30 290.8974304199219
31 283.6495666503906
32 276.5545349121094
33 269.619873046875
34 262.8145446777344
35 256.151123046875
36 249.61766052246094
37 243.227783203125
38 236.96031188964844
39 230.82102966308594
40 224.8285369873047
41 218.98483276367188
42 213.2899627685547
43 207.694580078125
44 202.20387268066406
45 196.83082580566406
46 191.58071899414062
47 186.45713806152344


## Custom nn Modules

In [84]:
import torch
from torch.autograd import Variable

# 独自モデルの定義
class TwoLayerNet(torch.nn.Module):
    """Fully connected network: linear -> ReLU -> linear."""

    def __init__(self, D_in, H, D_out):
        """Create the two linear layers (D_in -> H and H -> D_out)."""
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the prediction for input batch `x`."""
        hidden = self.linear1(x)
        # clamp(min=0) acts as the ReLU non-linearity.
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)

# Train the custom TwoLayerNet module with SGD.
# N: batch size, D_in: input dim, H: hidden dim, D_out: output dim
N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

model = TwoLayerNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass.
    y_pred = model(x)

    # Compute the loss. It holds one value; .item() returns it as a Python
    # float (loss.data[0] fails when the loss is a 0-dim tensor).
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then take one optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 628.8768920898438
1 582.905029296875
2 543.0150146484375
3 507.8055114746094
4 476.51544189453125
5 448.16473388671875
6 422.3822326660156
7 398.69195556640625
8 376.92254638671875
9 356.5376281738281
10 337.4390563964844
11 319.57373046875
12 302.7062072753906
13 286.7491760253906
14 271.6932373046875
15 257.373779296875
16 243.7882080078125
17 230.88551330566406
18 218.60121154785156
19 206.88059997558594
20 195.67245483398438
21 184.96205139160156
22 174.74896240234375
23 165.0063934326172
24 155.72891235351562
25 146.8719024658203
26 138.43077087402344
27 130.381103515625
28 122.75699615478516
29 115.50647735595703
30 108.64201354980469
31 102.15310668945312
32 96.0034408569336
33 90.20628356933594
34 84.72351837158203
35 79.5626220703125
36 74.6899185180664
37 70.1153564453125
38 65.8133773803711
39 61.767574310302734
40 57.97158432006836
41 54.41753005981445
42 51.0764274597168
43 47.945804595947266
44 45.01655197143555
45 42.27145004272461
46 39.703224182128906
47 37.304431915

## Dynamic Model

In [85]:
import random
import torch
from torch.autograd import Variable

class DynamicNet(torch.nn.Module):
    """Network whose depth is chosen at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """Create the input, (single, shared) middle and output linear layers."""
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Return the prediction for input batch `x`."""
        h_relu = torch.clamp(self.input_linear(x), min=0)
        # random.randint(0, 3) includes both ends, so the one middle layer is
        # applied 0, 1, 2 or 3 times. The same layer object (and therefore the
        # same weights) is reused for every application.
        n_middle = random.randint(0, 3)
        for _ in range(n_middle):
            h_relu = torch.clamp(self.middle_linear(h_relu), min=0)
        return self.output_linear(h_relu)

# Train DynamicNet with SGD plus momentum.
# N: batch size, D_in: input dim, H: hidden dim, D_out: output dim
N, D_in, H, D_out = 64, 1000, 100, 10

x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

model = DynamicNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass.
    y_pred = model(x)

    # Compute the loss. It holds one value; .item() returns it as a Python
    # float (loss.data[0] fails when the loss is a 0-dim tensor).
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then take one optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 640.6339111328125
1 631.3048706054688
2 659.3577880859375
3 629.0328369140625
4 591.5602416992188
5 625.6828002929688
6 630.2764892578125
7 494.9000549316406
8 450.3318786621094
9 628.6624755859375
10 546.13623046875
11 627.501708984375
12 614.8070068359375
13 611.4189453125
14 515.6226196289062
15 226.6064453125
16 593.8165283203125
17 617.955322265625
18 614.0869140625
19 446.6548156738281
20 425.9581604003906
21 542.76025390625
22 525.5250854492188
23 573.9893798828125
24 480.4125671386719
25 534.5718994140625
26 507.98529052734375
27 408.9234313964844
28 390.77008056640625
29 435.8751525878906
30 421.808349609375
31 355.2283935546875
32 291.4407043457031
33 370.36181640625
34 224.03392028808594
35 202.64659118652344
36 179.03271484375
37 322.5909118652344
38 143.73648071289062
39 202.34881591796875
40 102.24514770507812
41 179.5446319580078
42 88.59590911865234
43 160.90188598632812
44 141.61976623535156
45 94.38774108886719
46 114.52020263671875
47 158.8015594482422
48 79.393714