In [18]:
import torch
import torchvision
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data.dataset import TensorDataset
from torch.utils.data.dataloader import DataLoader

In [19]:
# Download MNIST into ./data (download=True is passed for both splits).
mnist_dev = torchvision.datasets.MNIST("./data", train=True, download=True)
mnist_test = torchvision.datasets.MNIST("./data", train=False, download=True)

# Extract the development images and labels. Dividing by 255 turns the pixel
# tensor into floats (presumably rescaling 0-255 intensities to [0, 1]).
x_dev, y_dev = mnist_dev.data / 255, mnist_dev.targets

# Hold out everything after the first `n_train` samples for validation.
n_train = 50000
train_set = TensorDataset(x_dev[:n_train], y_dev[:n_train])
val_set = TensorDataset(x_dev[n_train:], y_dev[n_train:])

# Build the loaders consumed by `fit` under the keys "train" and "val".
dataloaders = {
    "train": DataLoader(train_set, batch_size=64, shuffle=True),
    # Validation gains nothing from shuffling, and a fixed order keeps the
    # composition of every batch (including the short final one) identical
    # from epoch to epoch, so reported validation numbers are reproducible.
    "val": DataLoader(val_set, batch_size=64, shuffle=False),
}


In [20]:
class CFunc(nn.Module):
    """Adapter that lets an arbitrary callable be used as an ``nn.Module``.

    The wrapped callable is stored on ``self.func`` and invoked with the
    module's input on every forward pass; its result is returned unchanged.
    """

    def __init__(self, func):
        super(CFunc, self).__init__()
        self.func = func

    def forward(self, input):
        wrapped = self.func
        return wrapped(input)

In [21]:
class CNN(nn.Module):
    """Three-layer convolutional classifier for 28x28 single-channel images.

    Every convolution uses a 3x3 kernel with stride 2 and padding 1, so the
    spatial size shrinks 28 -> 14 -> 7 -> 4. The final 10-channel map is
    average-pooled over its 4x4 extent and flattened to one row per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        # Insert the channel dimension expected by Conv2d.
        features = x.view(-1, 1, 28, 28)
        for conv in (self.conv1, self.conv2, self.conv3):
            features = F.relu(conv(features))
        # Collapse the remaining 4x4 map to a single value per channel.
        pooled = F.avg_pool2d(features, kernel_size=4)
        return pooled.view(pooled.size(0), -1)

# class CNN(nn.Module):
#     def __init__(self, num_channels, num_classes):
#         super().__init__()
#         self.conv1 = nn.Conv2d(1, num_channels, kernel_size=3, stride=2, padding=1)
#         self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, stride=2, padding=1)
#         self.conv3 = nn.Conv2d(num_channels, num_classes, kernel_size=3, stride=2, padding=1)
    
#     def forward(self, xb):
#         xb = xb.view(-1, 1, 28, 28)
#         xb = F.relu(self.conv1(xb))
#         xb = F.relu(self.conv2(xb))
#         xb = F.relu(self.conv3(xb))
#         xb = F.avg_pool2d(xb, 4)
#         return xb.view(-1, xb.size(1)) 

In [22]:
def initialize_weight(module):
    """Re-draw the weights of linear and 2-D convolution layers in place.

    Intended to be passed to ``nn.Module.apply``: modules that are neither
    ``nn.Linear`` nor ``nn.Conv2d`` are left untouched, and biases are not
    modified. Uses Xavier (Glorot) normal initialization.
    """
    if not isinstance(module, (nn.Linear, nn.Conv2d)):
        return
    nn.init.xavier_normal_(module.weight)

In [23]:
# Instantiate the class-based CNN and re-draw its conv weights.
# `apply` visits every submodule and returns the model, which is what the
# cell displays as its output.
model = CNN()
model.apply(initialize_weight)

CNN(
  (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (conv3): Conv2d(16, 10, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
)

In [24]:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# print(device)

In [25]:
# model.to(device)

In [26]:
def fit(model, dataloaders, optimizer, loss_fn, num_epoch):
    """Train ``model`` and print train/validation loss after every epoch.

    Args:
        model: ``nn.Module`` to optimize. Batches are moved to the device of
            its first parameter, so the same function works on CPU and GPU.
        dataloaders: mapping with ``"train"`` and ``"val"`` entries, each
            yielding ``(inputs, targets)`` mini-batches.
        optimizer: optimizer that updates the model's parameters.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar
            tensor. It is assumed to average over the batch (the default of
            ``F.cross_entropy``); the validation average relies on that.
        num_epoch: number of passes over the training loader.

    Returns:
        None. One line per epoch is printed with the loss of the last
        training mini-batch and the mean validation loss per sample.
    """
    first_param = next(model.parameters(), None)
    # A parameter-free model gives no device hint; leave batches where they are.
    model_device = first_param.device if first_param is not None else None

    def to_model_device(tensor):
        return tensor if model_device is None else tensor.to(model_device)

    for epoch in range(num_epoch):
        # Training
        model.train()
        last_loss = None
        for x_mini, y_mini in dataloaders["train"]:
            # Tensor.to returns a new tensor, so the result must be rebound.
            x_mini, y_mini = to_model_device(x_mini), to_model_device(y_mini)
            logits = model(x_mini)
            last_loss = loss_fn(logits, y_mini)
            model.zero_grad()
            last_loss.backward()
            optimizer.step()
        # Only the final mini-batch loss is reported; NaN if the loader was empty.
        loss_train = last_loss.item() if last_loss is not None else float("nan")

        # Evaluation
        model.eval()
        loss_sum, n_seen = 0.0, 0
        with torch.no_grad():
            for x_mini, y_mini in dataloaders["val"]:
                x_mini, y_mini = to_model_device(x_mini), to_model_device(y_mini)
                batch_size = x_mini.size(0)
                # Weight each batch mean by its size so a short final batch
                # does not count as much as a full one.
                loss_sum += loss_fn(model(x_mini), y_mini).item() * batch_size
                n_seen += batch_size
        loss_val = loss_sum / n_seen if n_seen else float("nan")

        print("Epoch {}: loss_train = {}, loss_val = {}".format(epoch, loss_train, loss_val))



In [27]:
# Train the current `model` for 2 epochs using SGD with momentum
# (lr=0.1, momentum=0.9) and cross-entropy as the loss function.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
fit(model, dataloaders, optimizer, F.cross_entropy, 2)

Epoch 0: loss_train = 0.3921363651752472, loss_val = 0.3937610983848572
Epoch 1: loss_train = 0.4083484411239624, loss_val = 0.2568962574005127


In [28]:
# Same architecture as the CNN class, expressed as an nn.Sequential pipeline.
# CFunc wraps the two reshape lambdas so they can sit between layers.
# Shape comments use N for the batch size.
model = nn.Sequential(
    # (N, 28, 28) -> (N, 1, 28, 28): add the channel dimension
    CFunc(lambda x: x.view(-1, 1, 28, 28)),
    # (N, 1, 28, 28) -> (N, 16, 14, 14)
    nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    # (N, 16, 14, 14) -> (N, 16, 7, 7)
    nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    # (N, 16, 7, 7) -> (N, 10, 4, 4)
    nn.Conv2d(in_channels=16, out_channels=10, kernel_size=3, stride=2, padding=1),
    nn.ReLU(), 
    # (N, 10, 4, 4) -> (N, 10, 1, 1): average over whatever spatial size remains
    nn.AdaptiveAvgPool2d(output_size=1),
    # (N, 10, 1, 1) -> (N, 10): flatten to one row of 10 values per sample
    CFunc(lambda x: x.view(x.size(0), -1))
)

In [12]:
# Re-draw the weights of the current `model`, build a fresh SGD optimizer
# (lr=0.1, momentum=0.9) over its parameters, and train for 2 epochs.
model.apply(initialize_weight)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
fit(model, dataloaders, optimizer, F.cross_entropy, 2)

Epoch 0: loss_train = 0.2679460942745209, loss_val = 0.3013773262500763
Epoch 1: loss_train = 0.1895914077758789, loss_val = 0.2513209581375122


In [13]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU,
# and report whether the GPU was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device.type == "cuda")

True


In [14]:
# Move the model onto `device`; the returned module is what the cell displays.
model.to(device)

Sequential(
  (0): CFunc()
  (1): Conv2d(1, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (2): ReLU()
  (3): Conv2d(16, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (4): ReLU()
  (5): Conv2d(16, 10, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (6): ReLU()
  (7): AdaptiveAvgPool2d(output_size=1)
  (8): CFunc()
)

In [15]:
# Rebuild the loaders with pinned host memory requested for both splits.
# Training batches are shuffled; validation batches keep a fixed order.
dataloaders = {
    "train": DataLoader(train_set, batch_size=64, shuffle=True, pin_memory=True),
    "val": DataLoader(val_set, batch_size=64, shuffle=False, pin_memory=True),
}

In [16]:
def fit(model, dataloaders, optimizer, loss_fn, num_epoch):
    """Train ``model`` and print train/validation loss after every epoch.

    Args:
        model: ``nn.Module`` to optimize. Batches are moved to the device of
            its first parameter, so no notebook-level ``device`` variable is
            required and the function follows the model wherever it lives.
        dataloaders: mapping with ``"train"`` and ``"val"`` entries, each
            yielding ``(inputs, targets)`` mini-batches.
        optimizer: optimizer that updates the model's parameters.
        loss_fn: callable ``loss_fn(logits, targets)`` returning a scalar
            tensor. It is assumed to average over the batch (the default of
            ``F.cross_entropy``); the validation average relies on that.
        num_epoch: number of passes over the training loader.

    Returns:
        None. One line per epoch is printed with the loss of the last
        training mini-batch and the mean validation loss per sample.
    """
    first_param = next(model.parameters(), None)
    # A parameter-free model gives no device hint; leave batches where they are.
    model_device = first_param.device if first_param is not None else None

    def to_model_device(tensor):
        return tensor if model_device is None else tensor.to(model_device)

    for epoch in range(num_epoch):
        # Training
        model.train()
        last_loss = None
        for x_mini, y_mini in dataloaders["train"]:
            x_mini, y_mini = to_model_device(x_mini), to_model_device(y_mini)
            logits = model(x_mini)
            last_loss = loss_fn(logits, y_mini)
            model.zero_grad()
            last_loss.backward()
            optimizer.step()
        # Only the final mini-batch loss is reported; NaN if the loader was empty.
        loss_train = last_loss.item() if last_loss is not None else float("nan")

        # Evaluation
        model.eval()
        loss_sum, n_seen = 0.0, 0
        with torch.no_grad():
            for x_mini, y_mini in dataloaders["val"]:
                x_mini, y_mini = to_model_device(x_mini), to_model_device(y_mini)
                batch_size = x_mini.size(0)
                # Weight each batch mean by its size so a short final batch
                # does not count as much as a full one.
                loss_sum += loss_fn(model(x_mini), y_mini).item() * batch_size
                n_seen += batch_size
        loss_val = loss_sum / n_seen if n_seen else float("nan")

        print("Epoch {}: loss_train = {}, loss_val = {}".format(epoch, loss_train, loss_val))



In [17]:
# Re-draw the weights of the current `model`, build a fresh SGD optimizer
# (lr=0.1, momentum=0.9) over its parameters, and train for 2 epochs with
# the device-aware `fit`.
model.apply(initialize_weight)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
fit(model, dataloaders, optimizer, F.cross_entropy, 2)

Epoch 0: loss_train = 0.35736456513404846, loss_val = 0.33639249205589294
Epoch 1: loss_train = 0.0864199697971344, loss_val = 0.2613961696624756
