# MNIST Model From "Scratch" -- Attempt 2

Goal: To reproduce the second half of Chapter 4 from the Fast.ai book with as little referencing the material as possible. Looking up things in the documentation is allowed.

In [1]:
from fastbook import untar_data, URLs

In [2]:
data = untar_data(URLs.MNIST_SAMPLE)

In [3]:
from pathlib import Path

Path.BASE_PATH = data
data.ls()

(#3) [Path('labels.csv'),Path('valid'),Path('train')]

In [4]:
train3s = (data/'train'/'3').ls().sorted()
train7s = (data/'train'/'7').ls().sorted()
train3s, train7s

((#6131) [Path('train/3/10.png'),Path('train/3/10000.png'),Path('train/3/10011.png'),Path('train/3/10031.png'),Path('train/3/10034.png'),Path('train/3/10042.png'),Path('train/3/10052.png'),Path('train/3/1007.png'),Path('train/3/10074.png'),Path('train/3/10091.png')...],
 (#6265) [Path('train/7/10002.png'),Path('train/7/1001.png'),Path('train/7/10014.png'),Path('train/7/10019.png'),Path('train/7/10039.png'),Path('train/7/10046.png'),Path('train/7/10050.png'),Path('train/7/10063.png'),Path('train/7/10077.png'),Path('train/7/10086.png')...])

In [5]:
import torch
from PIL import Image
from numpy import *

train3_tens = [torch.as_tensor(array(Image.open(im))) for im in train3s]
train7_tens = [torch.as_tensor(array(Image.open(im))) for im in train7s]
len(train3_tens), len(train7_tens)

(6131, 6265)

In [6]:
train3_stacked = (torch.stack(train3_tens).float()) / 255
train7_stacked = (torch.stack(train7_tens).float()) / 255
train3_stacked.shape, train7_stacked.shape

(torch.Size([6131, 28, 28]), torch.Size([6265, 28, 28]))

In [7]:
train3_stacked[0][4][10:15]

tensor([0.0000, 0.1647, 0.4627, 0.8588, 0.6510])

In [8]:
valid3_stacked = (torch.stack([torch.as_tensor(array(Image.open(im))) for im in (data/'train'/'3').ls().sorted()]).float()) / 255
valid7_stacked = (torch.stack([torch.as_tensor(array(Image.open(im))) for im in (data/'train'/'7').ls().sorted()]).float()) / 255
valid3_stacked.shape, valid7_stacked.shape

(torch.Size([6131, 28, 28]), torch.Size([6265, 28, 28]))

In [133]:
# Almost forgot again about the torch randn...

weights = (torch.randn((28*28, 1))).requires_grad_() # I was adding 1.0 to all the weights, for some unknown reason...
bias = (torch.randn((1)) + 1.0).requires_grad_()
weights.shape, bias.shape, weights.mean()

(torch.Size([784, 1]),
 torch.Size([1]),
 tensor(0.0198, grad_fn=<MeanBackward0>))

In [91]:
# I'm not sure if I should be getting something between 0 and 1...not sure if that's right for weights.
# And if it's not, why does that matter?

weights[2], bias[0]

(tensor([-0.4036], grad_fn=<SelectBackward0>),
 tensor(1.9356, grad_fn=<SelectBackward0>))

In [73]:
ys = torch.cat([torch.as_tensor(([1] * len(valid3_stacked))),
                torch.as_tensor(([0] * len(valid7_stacked)))]).unsqueeze(-1)
len(valid3_stacked), len(valid7_stacked), ys.shape

(6131, 6265, torch.Size([12396, 1]))

In [74]:
xs = torch.cat([valid3_stacked, valid7_stacked]).view(-1, 28*28)
xys = list(zip(xs, ys))
len(xys), xs.shape

(12396, torch.Size([12396, 784]))

In [75]:
from fastai.data.load import DataLoader

dset = DataLoader(xys, bs=256, shuffle=True)
dset.one_batch()[0].shape, dset.one_batch()[1].shape

(torch.Size([256, 784]), torch.Size([256, 1]))

In [76]:
def linear1(xb): return xb@weights + bias

In [84]:
def mnist_loss(preds, tars): 
    # print(f"preds: {preds[0]}, tars: {tars[0]}")
    temp = torch.where(tars == 1, 1-preds, preds)
    # print(f"loss before mean: {temp[0]} -- shape: {temp.shape}")
    return temp.mean()

In [64]:
loss_test = mnist_loss(torch.as_tensor([0.1, 0.8, 0.4]), torch.as_tensor([1, 1, 0]))
loss_test

preds: 0.10000000149011612, tars: 1
loss before mean: 0.8999999761581421 -- shape: torch.Size([3])


tensor(0.5000)

In [65]:
def batch_accuracy(preds, valid_ys): return (((preds > 0.5) == valid_ys)).float().mean()

In [66]:
acc_test = batch_accuracy(torch.as_tensor([.4, .9, .55]), torch.as_tensor([1, 0, 1]))
acc_test

tensor(0.3333)

In [67]:
valid_xs = torch.cat([valid3_stacked, valid7_stacked]).view(-1, 28*28)
valid_ys = torch.cat([torch.as_tensor([1] * len(valid3_stacked)), torch.as_tensor([0] * len(valid7_stacked))])
valid_xs.shape, valid_ys.shape

(torch.Size([12396, 784]), torch.Size([12396]))

In [126]:
def calc_batch_accuracy():
    preds = linear1(valid_xs).sigmoid()
    print(batch_accuracy(preds, valid_ys).item())

In [131]:
def one_epoch(lr=0.01, pr=True):
    for xb, yb in dset:
        preds = linear1(xb)
        # print(f"preds: {preds[0]}")
        preds = preds.sigmoid_() # initially forgot the sigmoid here
        # print(f"preds: {preds[0]}")
        loss = mnist_loss(preds, yb)
        # I was printing the loss here, but that doesn't really make sense. I'd want the loss and accuracy over the entire dataset, not from each batch...
        # if pr: print(loss.item(), end=' ')
        # print(f"loss: {loss}")
        loss.backward()
        print(f"weights grad: {weights.grad.mean()}, bias grad: {bias.grad}")
        for param in (weights, bias):
            # print(f"before: {param.data[0]}, grad: {param.grad[0]}")
            param.data -= param.grad * lr
            # print(f"after: {param.data[0]}")
            param.grad = None
    if pr: calc_batch_accuracy()

In [132]:
one_epoch()

weights grad: -2.8590175134013407e-05, bias grad: tensor([-0.0002])
weights grad: -0.00012591762060765177, bias grad: tensor([-0.0013])
weights grad: -0.00013849942479282618, bias grad: tensor([-0.0011])
weights grad: -3.768434180528857e-05, bias grad: tensor([-0.0004])
weights grad: 7.102797098923475e-05, bias grad: tensor([0.0008])
weights grad: -5.5793901992728934e-05, bias grad: tensor([-0.0005])
weights grad: 3.5758985177380964e-05, bias grad: tensor([0.0004])
weights grad: -6.720107921864837e-05, bias grad: tensor([-0.0005])
weights grad: -4.967653876519762e-05, bias grad: tensor([-0.0003])
weights grad: 8.608797361375764e-05, bias grad: tensor([0.0010])
weights grad: 5.8634832384996116e-05, bias grad: tensor([0.0005])
weights grad: 4.104158142581582e-05, bias grad: tensor([0.0005])
weights grad: -6.640130595769733e-05, bias grad: tensor([-0.0004])
weights grad: -0.00010795096022775397, bias grad: tensor([-0.0008])
weights grad: -3.420831490075216e-05, bias grad: tensor([-0.0003]

In [129]:
def n_epochs(n):
    for i in range(n):
        one_epoch()

In [130]:
n_epochs(10)

0.5000837445259094
0.5000837445259094
0.5000837445259094
0.5000837445259094
0.5000837445259094
0.5000837445259094
0.5000837445259094
0.5000837445259094
0.5000837445259094
0.5000837445259094


Okay, I messed up something here...

My gradients are all zero. What did I mess up?

I'm stumped... Going to take a look at the last attempt's code...

So the way I'm doing the loss doesn't seem to be right. The result isn't a tensor, and therefore the gradient is getting lost (or can't be calculated through).

...

The gradient is now being persisted (not sure how to phrase that) up until the where call, but only until the mean call. 

Hmm, now everything is moving, but not by much...
Going to reinit the params, then try again. Might need to bump the lr?

This is a mess, and my brain is fried. I'm calling it...ugh.