In [68]:
import matplotlib.pyplot as plt
from pathlib import Path
import gzip, torch, pickle
from sympy import symbols, diff

In [17]:
%matplotlib inline
torch.manual_seed(42)
torch.set_printoptions(linewidth=140, sci_mode=False)

In [5]:
FOLDER_PATH = Path('data')
FILE_PATH = FOLDER_PATH/'mnist.pkl.gz'

In [35]:
with gzip.open(FILE_PATH, 'rb') as f:
    data = pickle.load(f, encoding='latin')

In [36]:
((x_train, y_train),(x_dev, y_dev), _) = data

In [37]:
(x_train, y_train, x_dev, y_dev) = map(torch.tensor, (x_train, y_train, x_dev, y_dev))

In [38]:
x_train.shape, y_train.shape, x_dev.shape, y_dev.shape

(torch.Size([50000, 784]),
 torch.Size([50000]),
 torch.Size([10000, 784]),
 torch.Size([10000]))

#### model

In [39]:
m,n = x_train.shape
c = y_train.max() + 1
nh = 50 # num of hidden neuron

In [40]:
w1 = torch.randn(n, nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [41]:
def lin(x, w, b):
    return x@w+b

In [299]:
def relu(x):
    return x.clamp_min(0)

In [72]:
def model(x):
    lyrl = lin(x, w1, b1)
    lyr2 = lin(relu(lyrl), w2, b2)
    return lyr2

In [73]:
res = model(x_dev);res.shape

torch.Size([10000, 1])

#### error measure

In [49]:
res.shape, y_dev.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [59]:
err = (res[:,0] - y_dev);err

tensor([ 21.5985, -12.8880,  12.9920,  ...,  -5.2024,   0.8196, -41.5658])

In [60]:
err.shape

torch.Size([10000])

In [61]:
y_train, y_dev = y_train.float(), y_dev.float()

In [63]:
res = model(x_train);res.shape

torch.Size([50000, 1])

In [66]:
def mse(out, trg):
    return (out[:,0]-trg).pow(2).mean()

In [67]:
err = mse(res, y_train);err

tensor(2630.1516)

#### gradient and backward pass

In [351]:
x, y = symbols('x y')

In [354]:
diff(x**2 + y, x)

2*x

In [75]:
err = (res[:,0] - y_dev);err.shape

torch.Size([10000])

In [79]:
err

tensor([ -5.8467, -37.3039, -19.2733,  ..., -31.9318, -22.2322, -29.9598])

In [80]:
err / x_dev.shape[0]

tensor([-0.0006, -0.0037, -0.0019,  ..., -0.0032, -0.0022, -0.0030])

In [82]:
(x-y)**2

(x - y)**2

In [358]:
diff((x-y)**2, x)

2*x - 2*y

In [364]:
o,x,w,b = symbols('o x w b')

In [369]:
diff(x*w+b, 'x'), diff(x*w+b, 'w'), diff(x*w+b, 'b')

(w, x, 1)

In [493]:
def lingrad(x, o, w, b):
    b.g = 1 * o.g.sum(0)
    w.g = x.T @ o.g
    x.g = o.g @ w.T


In [494]:
def forward_backward(inp, tar):
    # forward pass
    l1 = lin(inp, w1, b1)
    l2 = relu(l1)
    out = lin(l2, w2, b2)
    
    # calculate loss (MSE)
    diff = out[:,0] - tar
    loss = diff.pow(2).mean()
    
    
    # backward pass
    out.g = 2 * diff[:,None] / inp.shape[0]
    lingrad(l2, out, w2, b2)
    l1.g = (l1 > 0).float() * l2.g
    lingrad(inp, l1, w1, b1)
     

In [495]:
forward_backward(x_train, y_train)

In [474]:
def get_gard(x):
    return x.g.clone()

In [475]:
chcks = w1,w2,b1,b2,x_train

In [476]:
grads = tuple(map(get_grad, chcks))

In [477]:
def make_grad(x):
    return x.clone().requires_grad_(True)

In [478]:
ptgrads = w1pyt, w2pyt, b1pyt, b2pyt, x_trianpyt = tuple(map(make_grad, chcks))

In [479]:
def pyt_forward(inp, targ):
    l1 = lin(inp, w1pyt, b1pyt)
    l2 = relu(l1)
    out = lin(l2, w2pyt, b2pyt)
    return mse(out, targ)

In [480]:
loss = pyt_forward(x_trianpyt, y_train)

In [481]:
loss.backward()

In [485]:
for a, b in zip(grads, ptgrads):
    print(a.shape, b.shape)
    print(a, b)
    break

torch.Size([784, 50]) torch.Size([784, 50])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) tensor([[ 2.5353,  1.7152,  0.2519,  ...,  1.2621, -1.3704, -1.8541],
        [ 0.8463, -0.7835, -1.1173,  ..., -1.9560,  0.5467, -0.7582],
        [ 0.9242,  0.5263,  1.1134,  ...,  2.2070,  1.6893, -0.6268],
        ...,
        [-0.6378, -0.0758,  0.5120,  ..., -0.6960,  0.0835,  0.5951],
        [-0.2228, -1.0157,  1.3523,  ..., -1.4237, -0.2719, -1.1539],
        [-1.0204,  0.1851, -0.4556,  ..., -1.1737,  1.0057, -0.7915]], requires_grad=True)


#### refactor the code

In [522]:
class Relu():
    def __call__(self, inp):
        self.inp = inp
        self.out = inp.clamp_min(0.)
        return self.out
    
    def backward(self):
        self.inp.g = (self.inp > 0).float() * self.out.g

In [575]:
class Lin():
    def __init__(self, w, b):
        self.w = w
        self.b = b
    
    def __call__(self, inp):
        self.inp = inp
        self.out = inp @ self.w + self.b
        return self.out
    
    def backward(self):
        self.b.g = 1 * self.out.g.sum(0)
        self.w.g = self.inp.t() @ self.out.g
        self.inp.g = self.out.g @ self.w.t()

In [576]:
class Mse():
    def __call__(self, inp, targ):
        self.inp = inp
        self.targ = targ
        self.out = (self.inp[:,0]-self.targ).pow(2).mean()
        return self.out
    
    def backward(self):
        self.inp.g = 2 * (self.inp[:,0]-self.targ)[:,None] / self.inp.shape[0] 

In [577]:
class Model():
    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()
    
    def __call__(self, inp, targ):
        x = inp
        for lyr in self.layers:
            x = lyr(x)
        return self.loss(x, targ)
    
    def backward(self):
        self.loss.backward()
        for lyr in reversed(self.layers):
            lyr.backward()

In [578]:
# initialize params
m,n = x_train.shape
c = y_train.max() + 1
nh = 50 # num of hidden neuron

w1 = torch.randn(n, nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)
b2 = torch.zeros(1)

In [579]:
model = Model(w1, b1, w2, b2)

In [580]:
loss = model(x_train, y_train)

In [581]:
loss

tensor(1419.5629)

In [582]:
model.backward()