In [1]:
import torch
from torch.autograd import Variable
from torch.optim import Adam
from torch import nn
import torch.nn.functional as F
from torchvision.datasets import mnist
from torchvision import transforms
from torch.utils.data import DataLoader

In [2]:
NUM_CLASSES = 10
BATCH_SIZE = 200
LEARING_RATE = 0.001

In [3]:
# train set
dataset = mnist.MNIST('./data/', train=True, download=True, transform=transforms.ToTensor())
loader = DataLoader(dataset, batch_size=BATCH_SIZE)

# validation set
validation_dataset = mnist.MNIST('./data/', train=False, download=True, transform=transforms.ToTensor())
validation_loader = DataLoader(dataset, batch_size=BATCH_SIZE)

In [4]:
# input_size
data, _ = next(iter(loader))
x_size = len(data[0][0][0])
y_size = len(data[0][0])
input_size = x_size * y_size  # flatten 28x28 tensor to 1x784 tensor
print(input_size)

784


In [5]:
HIDDEN_SIZE = 50

class Model2Linear(nn.Module):
    def __init__(self):
        super().__init__()
        self.h1 = nn.Linear(input_size, HIDDEN_SIZE)
        self.h2 = nn.Linear(HIDDEN_SIZE, HIDDEN_SIZE)
        self.h3 = nn.Linear(HIDDEN_SIZE, NUM_CLASSES)
    
    def forward(self, x):
        x = x.data.view(-1, input_size)
        x = self.h1(x)
        x = F.relu(x)
        x = self.h2(x)
        x = F.relu(x)
        x = self.h3(x)
        x = F.softmax(x, dim=1)
        return x

In [6]:
HIDDEN_SIZE = 100

class Model1Linear(nn.Module):
    def __init__(self):
        super().__init__()
        self.h1 = nn.Linear(input_size, HIDDEN_SIZE)
        self.h2 = nn.Linear(HIDDEN_SIZE, NUM_CLASSES)
    
    def forward(self, x):
        x = x.data.view(-1, input_size)
        x = self.h1(x)
        x = F.relu(x)
        x = self.h2(x)
        x = F.softmax(x, dim=1)
        return x

In [13]:
HIDDEN_SIZE = 100

class Model1LinearDropout(nn.Module):
    def __init__(self):
        super().__init__()
        self.h1 = nn.Linear(input_size, HIDDEN_SIZE)
        self.h2 = nn.Linear(HIDDEN_SIZE, NUM_CLASSES)
    
    def forward(self, x):
        x = x.data.view(-1, input_size)
        x = self.h1(x)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.h2(x)
        x = F.softmax(x, dim=1)
        return x

In [8]:
HIDDEN_SIZE = 100

class ModelConv1(nn.Module):
    def __init__(self):
        super().__init__()
        self.h1 = nn.Linear(input_size, HIDDEN_SIZE)
        self.h2 = nn.Linear(HIDDEN_SIZE, NUM_CLASSES)
        
        # (in_channels, out_channels, kernel_size
        
        # in_channels=1 (input image is grayscale => one channel)
        # out_channels=10 ()
        
        # (old_width + 2*padding - kernel_size)/stride + 1
        # (28 + 2*0 - 5)/1 + 1
        
        # stride=1, padding=0, dilation=1, groups=1
        # 28 + 2*0 - 1 * (5 -1) -1 + 1 = 24
        
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    
    def forward(self, x):
        x = x.data.view(-1, input_size)
        x = self.h1(x)
        x = F.relu(x)
        x = self.h2(x)
        x = F.softmax(x, dim=1)
        return x

In [9]:
HIDDEN_SIZE = 100

class ModelConv2(nn.Module):
    def __init__(self):
        super().__init__()
        self.h1 = nn.Linear(input_size, HIDDEN_SIZE)
        self.h2 = nn.Linear(HIDDEN_SIZE, NUM_CLASSES)
    
    def forward(self, x):
        x = x.data.view(-1, input_size)
        x = self.h1(x)
        x = F.relu(x)
        x = self.h2(x)
        x = F.softmax(x, dim=1)
        return x

In [10]:
# 2d dropout
# 2d pool
# layer count/size

In [11]:
def evalulate(model):
    model.eval()
    loss = 0.0
    for data, labels in validation_loader:
        predictions_per_class = model(data.cuda())
        _, highest_prediction_class = predictions_per_class.max(1)
        loss += F.nll_loss(predictions_per_class, labels.cuda())
    return loss/len(validation_loader)

In [14]:
def learn():
    model = Model1LinearDropout().cuda()
    optimizer = Adam(params=model.parameters(), lr=LEARING_RATE)

    for epoch in range(1000):
        model.train()
        for data, labels in loader:
            predictions_per_class = model(data.cuda())
            highest_prediction, highest_prediction_class = predictions_per_class.max(1)

            # how good are we? compare output with the target classes
            loss = F.nll_loss(predictions_per_class, labels.cuda())

            model.zero_grad() # ???
            loss.backward() # backpropagate
            optimizer.step()
        
        validation_loss = evalulate(model)
        print(f'Epoch: {epoch}, Loss: {validation_loss.item()}')
        
    return model
%time model = learn()

Epoch: 0, Loss: -0.8663519620895386
Epoch: 1, Loss: -0.903980016708374
Epoch: 2, Loss: -0.9190024137496948
Epoch: 3, Loss: -0.9279651045799255
Epoch: 4, Loss: -0.9344733953475952
Epoch: 5, Loss: -0.9396764636039734
Epoch: 6, Loss: -0.9440144896507263
Epoch: 7, Loss: -0.9476982951164246
Epoch: 8, Loss: -0.9500924944877625
Epoch: 9, Loss: -0.9528350234031677
Epoch: 10, Loss: -0.953758180141449
Epoch: 11, Loss: -0.9566394090652466
Epoch: 12, Loss: -0.9583799242973328
Epoch: 13, Loss: -0.9598350524902344
Epoch: 14, Loss: -0.9610174894332886
Epoch: 15, Loss: -0.9623236060142517
Epoch: 16, Loss: -0.9629250168800354
Epoch: 17, Loss: -0.9639860987663269
Epoch: 18, Loss: -0.96522057056427
Epoch: 19, Loss: -0.9657012224197388
Epoch: 20, Loss: -0.9664510488510132
Epoch: 21, Loss: -0.966827929019928
Epoch: 22, Loss: -0.9676613807678223
Epoch: 23, Loss: -0.9682779312133789
Epoch: 24, Loss: -0.9685783982276917
Epoch: 25, Loss: -0.9699547290802002
Epoch: 26, Loss: -0.9698930382728577
Epoch: 27, Loss:

Epoch: 220, Loss: -0.9881453514099121
Epoch: 221, Loss: -0.9878230094909668
Epoch: 222, Loss: -0.9882858395576477
Epoch: 223, Loss: -0.9880897998809814
Epoch: 224, Loss: -0.9883807897567749
Epoch: 225, Loss: -0.9881869554519653
Epoch: 226, Loss: -0.9882554411888123
Epoch: 227, Loss: -0.9884114861488342
Epoch: 228, Loss: -0.9883939623832703
Epoch: 229, Loss: -0.9881075024604797
Epoch: 230, Loss: -0.9881932735443115
Epoch: 231, Loss: -0.9882181882858276
Epoch: 232, Loss: -0.9879837036132812
Epoch: 233, Loss: -0.9881662130355835
Epoch: 234, Loss: -0.9882230758666992
Epoch: 235, Loss: -0.9885725975036621
Epoch: 236, Loss: -0.9882437586784363
Epoch: 237, Loss: -0.9884817600250244
Epoch: 238, Loss: -0.9885255098342896
Epoch: 239, Loss: -0.9882961511611938
Epoch: 240, Loss: -0.9885178804397583
Epoch: 241, Loss: -0.9885518550872803
Epoch: 242, Loss: -0.9884817004203796
Epoch: 243, Loss: -0.9885640740394592
Epoch: 244, Loss: -0.9885975122451782
Epoch: 245, Loss: -0.9883281588554382
Epoch: 246, 

KeyboardInterrupt: 

tensor(-0.9610, device='cuda:0', grad_fn=<DivBackward0>)
