In [3]:
# MNIST on PyTorch from "scratch" (no optimizer)

import torch
import requests, gzip, os, hashlib, tempfile
import numpy as np

In [4]:
def fetch(url, timeout=30):
    """Download a gzip-compressed file once and return its decompressed bytes.

    The raw (still compressed) response body is cached in the system temp
    directory under the MD5 hex digest of ``url``; later calls read that file
    instead of hitting the network.

    Args:
        url: Address of a gzip-compressed resource.
        timeout: Timeout in seconds forwarded to ``requests.get``.

    Returns:
        A writable one-dimensional ``np.uint8`` array with the decompressed
        payload.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    cache_path = os.path.join(
        tempfile.gettempdir(), hashlib.md5(url.encode("utf-8")).hexdigest()
    )

    # A zero-length file is what an interrupted download used to leave behind;
    # treat it as a cache miss rather than returning an empty array.
    if os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0:
        with open(cache_path, "rb") as cached:
            payload = gzip.decompress(cached.read())
    else:
        # Download and validate *before* touching the cache so that a network
        # error, an HTTP error page or a non-gzip body never gets cached.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        compressed = response.content
        payload = gzip.decompress(compressed)

        # Write to a sibling file and rename it into place so a crash midway
        # cannot leave a truncated cache entry.
        partial_path = f"{cache_path}.{os.getpid()}.part"
        try:
            with open(partial_path, "wb") as partial:
                partial.write(compressed)
            os.replace(partial_path, cache_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # frombuffer returns a read-only view of the bytes object; copy to make it
    # writable for callers.
    return np.frombuffer(payload, dtype=np.uint8).copy()

In [5]:
import random
import torch.nn.functional as F

class MNIST_MLP():
    """Two-layer perceptron (784 -> 512 -> 10) trained with hand-written SGD.

    Parameters are plain tensors with ``requires_grad=True``; gradients come
    from autograd and the update step is applied manually (no optimizer).
    """

    def __init__(self, device):
        """Allocate the weights and biases on ``device``.

        Args:
            device: Device on which the parameter tensors are created.
        """
        # NOTE(review): weights are drawn from an unscaled standard normal.
        # A fan-in-scaled init would probably keep the logits smaller, but it
        # is left unchanged here so training behaviour stays the same.
        self.w1 = torch.randn(512, 784, requires_grad=True, device=device)
        self.b1 = torch.zeros(512, requires_grad=True, device=device)
        self.w2 = torch.randn(10, 512, requires_grad=True, device=device)
        self.b2 = torch.zeros(10, requires_grad=True, device=device)
        self.relu = torch.nn.ReLU()
        self.ce_loss = torch.nn.CrossEntropyLoss()

    def _parameters(self):
        """Return the trainable tensors in a fixed order."""
        return (self.w1, self.b1, self.w2, self.b2)

    def _prepare(self, x, y):
        """Bring inputs and labels into the form the forward pass expects.

        Labels become a ``torch.long`` tensor. Inputs whose maximum exceeds
        1.0 are assumed to be 0..255 pixel values and are divided by 255.
        Both are moved to the device the parameters live on.
        """
        device = self.w1.device
        y = y.long() if isinstance(y, torch.Tensor) else torch.tensor(y, dtype=torch.long)
        x = x.to(device)
        if x.max() > 1.0:
            x = x / 255.0
        return x, y.to(device)

    def _forward(self, x):
        """Compute the logits for a batch of flattened images."""
        hidden = self.relu(F.linear(x, self.w1, self.b1))
        return F.linear(hidden, self.w2, self.b2)

    def train(self, x, y, batch_size, iterations, learning_rate):
        """Run ``iterations`` SGD steps and print the loss of the last step.

        Each step draws one contiguous window of ``batch_size`` samples at a
        random offset.

        Args:
            x: Input tensor, one flattened image per row.
            y: Class labels (tensor or sequence) aligned with ``x``.
            batch_size: Number of samples per step; must be in ``1..len(x)``.
            iterations: Number of SGD steps. With 0 nothing is trained and
                nothing is printed.
            learning_rate: Step size of the plain gradient-descent update.

        Raises:
            ValueError: If ``batch_size`` is outside ``1..len(x)``.
        """
        x, y = self._prepare(x, y)

        if not 1 <= batch_size <= len(x):
            raise ValueError(
                f"batch_size must be between 1 and {len(x)}, got {batch_size}"
            )

        params = self._parameters()
        last_start = len(x) - batch_size
        loss = None

        for _ in range(iterations):
            start = random.randint(0, last_start)
            x_batch = x[start:start + batch_size]
            y_batch = y[start:start + batch_size]

            # forward + loss
            loss = self.ce_loss(self._forward(x_batch), y_batch)

            # backward: drop stale gradients so they do not accumulate
            for param in params:
                param.grad = None
            loss.backward()

            # The update itself must not be recorded by autograd.
            with torch.no_grad():
                for param in params:
                    param -= learning_rate * param.grad

        # `loss` only exists if at least one step ran.
        if loss is not None:
            print(f"training loss: {loss}")

    def sample(self, x, y):
        """Evaluate on ``(x, y)`` and print accuracy and cross-entropy loss.

        Args:
            x: Input tensor, one flattened image per row.
            y: Class labels (tensor or sequence) aligned with ``x``.
        """
        x, y = self._prepare(x, y)

        # Evaluation needs no gradients; skipping the graph saves memory.
        with torch.no_grad():
            logits = self._forward(x)
            loss = self.ce_loss(logits, y)
            # Softmax is monotonic, so the arg-max of the logits is the
            # predicted class. One vectorised comparison replaces a Python
            # loop over every sample.
            correct = int((logits.argmax(dim=1) == y).sum())

        accuracy = correct / len(y)
        print(f"accuracy on test set: {accuracy}")
        print(f"sample loss: {loss}")

In [6]:
import time

def run_mnist(use_cuda, batch_size=60, iterations=50000, learning_rate=0.01):
    """Download MNIST, train an ``MNIST_MLP`` and report test metrics and timing.

    Args:
        use_cuda: Train on CUDA when it is available; otherwise (or when
            False) fall back to the CPU.
        batch_size: Samples per SGD step.
        iterations: Number of SGD steps.
        learning_rate: Step size of the SGD update.

    Raises:
        ValueError: If an image file and its label file disagree on the
            number of samples.
    """
    base_url = "https://raw.githubusercontent.com/fgnt/mnist/master/"

    def load_images(name):
        # Skip the 16-byte IDX header, then flatten each 28x28 image to a row.
        pixels = fetch(base_url + name)[16:]
        return torch.from_numpy(pixels).float().reshape(-1, 28 * 28)

    def load_labels(name):
        # Skip the 8-byte IDX header; the remaining bytes are the class ids.
        return torch.from_numpy(fetch(base_url + name)[8:]).long()

    x_train = load_images("train-images-idx3-ubyte.gz")
    y_train = load_labels("train-labels-idx1-ubyte.gz")
    x_test = load_images("t10k-images-idx3-ubyte.gz")
    y_test = load_labels("t10k-labels-idx1-ubyte.gz")

    # Sanity check derived from the data instead of hard-coded sample counts.
    for split, images, labels in (("train", x_train, y_train), ("test", x_test, y_test)):
        if len(images) != len(labels):
            raise ValueError(
                f"{split} split has {len(images)} images but {len(labels)} labels"
            )

    device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu")
    print("Using CUDA" if device.type == "cuda" else "Using CPU")

    mnist_mlp = MNIST_MLP(device)
    x_train, y_train = x_train.to(device), y_train.to(device)
    x_test, y_test = x_test.to(device), y_test.to(device)

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    mnist_mlp.train(x_train, y_train, batch_size, iterations, learning_rate)
    mnist_mlp.sample(x_test, y_test)
    print("--- %s seconds ---" % (time.perf_counter() - start))

In [7]:
run_mnist(True)

Using CUDA
training loss: 0.17054659128189087
accuracy on test set: 0.9422
sample loss: 1.6557892560958862
--- 40.78183078765869 seconds ---


In [8]:
run_mnist(False)

Using CPU
training loss: 3.4570373941278376e-07
accuracy on test set: 0.94
sample loss: 1.6168708801269531
--- 490.6525583267212 seconds ---
