In [0]:
import torch
import torch.nn as nn
import fastai.datasets as datasets
import gzip
import pickle
import math
import torch.nn.functional as F

In [35]:
MNIST_URL = 'http://deeplearning.net/data/mnist/mnist.pkl'
fpath = datasets.download_data(MNIST_URL, ext='.gz')

with gzip.open(fpath, 'rb') as fp:
  ((x_train, y_train), (x_val, y_val), _) = pickle.load(fp, encoding='latin-1')
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((50000, 784), (50000,), (10000, 784), (10000,))

In [36]:
x_train.mean(), x_train.std(), x_val.mean(), x_val.std()

(0.13044983, 0.3072898, 0.12865187, 0.3049646)

### Normalize

In [0]:
def norm(x, m, s): return (x - m) / s

In [38]:
m, s = x_train.mean(), x_train.std()
x_train, x_val = norm(x_train, m, s), norm(x_val, m, s)
x_val.mean(), x_val.std()

(-0.005850922, 0.99243325)

In [40]:
x_train[0], x_val[0]

(array([-0.424517, -0.424517, -0.424517, -0.424517, ..., -0.424517, -0.424517, -0.424517, -0.424517], dtype=float32),
 array([-0.424517, -0.424517, -0.424517, -0.424517, ..., -0.424517, -0.424517, -0.424517, -0.424517], dtype=float32))

In [0]:
x_train, y_train, x_val, y_val = map(torch.tensor, [x_train, y_train, x_val, y_val])

### Model: $y = ax + b$ : linear -> relu -> linear

In [0]:
def linear(x, w, b):
  z = x @ w + b
  print(z.mean(), z.std())
  return z

def relu(x): return x.clamp_min(0.)

In [0]:
nh = 50

In [0]:
W1 = torch.randn(784, nh)
b1 = torch.zeros(nh)
W2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [80]:
W1.mean(), W1.std()

(tensor(-0.0078), tensor(1.0026))

In [81]:
z1 = linear(x_train, W1, b1)

tensor(-0.8809) tensor(26.9281)


In [0]:
W1 = torch.randn(784, nh) * math.sqrt(1 / 784)
b1 = torch.zeros(nh)
W2 = torch.randn(nh, 1) * math.sqrt(1 / nh)
b2 = torch.zeros(1)

In [83]:
z1 = linear(x_train, W1, b1)

tensor(0.1031) tensor(0.9458)


In [84]:
a1 = relu(z1)
a1.mean(), a1.std()

(tensor(0.4272), tensor(0.5915))

In [0]:
W1 = torch.randn(784, nh) * math.sqrt(2 / 784)
b1 = torch.zeros(nh)
W2 = torch.randn(nh, 1) * math.sqrt(2 / nh)
b2 = torch.zeros(1)

In [86]:
z1 = linear(x_train, W1, b1)

tensor(-0.1405) tensor(1.3252)


In [87]:
a1 = relu(z1)
a1.mean(), a1.std()

(tensor(0.4553), tensor(0.7339))

In [0]:
def relu(x): return x.clamp_min(0.) - 0.5

In [89]:
a1 = relu(z1)
a1.mean(), a1.std()

(tensor(-0.0447), tensor(0.7339))

In [0]:
import torch.nn.init as init

In [0]:
init.kaiming_normal_??

In [92]:
W1.shape

torch.Size([784, 50])

In [94]:
nn.Linear(784, nh).weight.data.shape

torch.Size([50, 784])

In [0]:
??nn.Linear.forward

In [0]:
init._calculate_correct_fan??

In [0]:
init._calculate_fan_in_and_fan_out??

In [99]:
w = torch.zeros(784, nh)
z1 = linear(x_train, w, b1)
a1 = relu(z1)
a1.mean(), a1.std()

tensor(0.) tensor(0.)


(tensor(0.), tensor(0.))

In [108]:
W1 = torch.zeros(784, nh)
b1 = torch.zeros(nh)
W2 = torch.zeros(nh, 1)
b2 = torch.zeros(1)

init.kaiming_normal_(W1, mode='fan_out', nonlinearity='relu')
init.kaiming_normal_(W2, mode='fan_out')
z1 = linear(x_train, W1, b1)
a1 = relu(z1)
print("layer1: ", a1.mean(), a1.std())
z2 = linear(a1, W2, b2)

tensor(0.0053) tensor(1.4042)
layer1:  tensor(0.5583) tensor(0.8157)
tensor(1.1784) tensor(1.3209)


In [132]:
def linear(x, w, b):
  return x @ w + b

def relu(x):
  return x.clamp_min(0.) - 0.5

def model(x):
  x = relu(linear(x, W1, b1))
  print("layer1: ", x.mean(), x.std())
  x = relu(linear(x, W2, b2))
  print("layer2: ", x.mean(), x.std())
  x = linear(x, W3, b3)
  print("layer3: ", x.mean(), x.std())
  return x

nh = [100, 50]
W1 = torch.zeros(784, nh[0])
b1 = torch.zeros(nh[0])
W2 = torch.zeros(nh[0], nh[1])
b2 = torch.zeros(nh[1])
W3 = torch.zeros(nh[1], 1)
b3 = torch.zeros(1)

init.kaiming_normal_(W1, mode='fan_out')
init.kaiming_normal_(W2, mode='fan_out')
init.kaiming_normal_(W3, mode='fan_out')
_ = model(x_train)

layer1:  tensor(0.0134) tensor(0.7912)
layer2:  tensor(0.0073) tensor(0.6957)
layer3:  tensor(-0.3079) tensor(0.9384)


In [0]:
??nn.Linear

In [133]:
class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.lin1 = nn.Linear(784, nh[0])
    self.lin2 = nn.Linear(nh[0], nh[1])
    self.lin3 = nn.Linear(nh[1], 1)
    self.relu = nn.ReLU()
  
  def forward(self, x):
    x = self.relu(self.lin1(x))
    print("layer 1: ", x.mean().item(), x.std().item())
    x = self.lin2(x)
    print("layer 2: ", x.mean().item(), x.std().item())
    x = self.lin3(x)
    print("layer 3: ", x.mean().item(), x.std().item())
    return x

m = Model()
_ = m(x_train)

layer 1:  0.2270725518465042 0.3270741105079651
layer 2:  0.033514849841594696 0.23475737869739532
layer 3:  0.013271240517497063 0.09185370802879333
