In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

import torchvision

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Network

In [2]:
class LeNet(nn.Module):
    def __init__(self, in_channels: int = 1, num_classes: int =10):
        super(LeNet, self).__init__()


        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5) 
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5) 
        
        self.fc1 = nn.Linear(16*4*4, 120) # 16*5*5 for CIFAR10 and 16*4*4 for MNIST
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)
        
        
    def forward(self, x):
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        
        return x
        

In [3]:
class LeNet(nn.Module):

    '''
    https://stackoverflow.com/questions/53580088/calculate-the-output-size-in-convolution-layer
    
    Conv layer / MaxPool output shape formula: floor((Width − KernelSize + 2Padding) / Stride) + 1

    Default Stride for nn.Conv2D == 1
    //       //     // nn.MaxPool2D == Kernel Size



    MNIST for Example:
    '''

    def __init__(self, num_classes: int =10, in_channels: int = 1):
        super(LeNet, self).__init__()
    
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=(5,5)), # (28-5 + 2*0 / 1) + 1 == 24 --> 24*24*6
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2,2)), # (24-2 + 2*0 / 2) + 1 == 12 --> 12*12*6
            
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=(5,5)), # (12-5 + 2*0 / 1) + 1 == 9 --> 9*9*16
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2,2)), # (9 - 2 + 2*0 / 2 + 1 == 4 --> 4*4*16)
        )
        
        
        self.fc = nn.Sequential(
        nn.Linear(16*4*4, 120),
        nn.ReLU(inplace=True),
        nn.Linear(120, 84),
        nn.ReLU(inplace=True),
        nn.Linear(84, num_classes),
        )
        
        
    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1) # can't use nn.Flatten(x, 1)
        x = self.fc(x)
        return x

# Data

### MNIST

In [4]:
# Load the MNIST dataset
mnist_train = torchvision.datasets.MNIST(
    root='../data',
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor()
)

# Load the MNIST test dataset
mnist_test = torchvision.datasets.MNIST(
    root='../data',
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor()
)

train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(mnist_test, batch_size=16, shuffle=True)

### CIFAR10

In [None]:
import torchvision

# Load the cifar10 dataset
cifar10_train = torchvision.datasets.CIFAR10(
    root='../data',
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor()
)

# Load the cifar10 test dataset
cifar10_test = torchvision.datasets.CIFAR10(
    root='../data',
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor()
)

train_loader = torch.utils.data.DataLoader(cifar10_train, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(cifar10_test, batch_size=16, shuffle=True)

# Train

In [20]:
model = LeNet(in_channels=1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)
# criterion = nn.MSELoss()
criterion = nn.CrossEntropyLoss()

### Basic

In [21]:
epochs = 10
for epoch in range(epochs):
    epoch_loss = 0
    
    for batch in train_loader:
        X, y = batch[0].to(device), batch[1].to(device)

        optimizer.zero_grad() # ~ model.zero_grad()

        output = model(X)

        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss

    print(f'Epoch: {epoch}, Loss: {epoch_loss}')

Epoch: 0, Loss: 3548.939697265625
Epoch: 1, Loss: 490.649169921875
Epoch: 2, Loss: 334.4135437011719
Epoch: 3, Loss: 260.4508056640625
Epoch: 4, Loss: 218.77500915527344
Epoch: 5, Loss: 190.89358520507812
Epoch: 6, Loss: 170.44808959960938
Epoch: 7, Loss: 152.34683227539062
Epoch: 8, Loss: 136.0486602783203
Epoch: 9, Loss: 124.96707916259766


### Ignite

In [6]:
from ignite.engine import Engine, Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import TensorboardLogger, global_step_from_engine

In [7]:
trainer = create_supervised_trainer(model, optimizer, criterion, device)

val_metrics = {
    "accuracy": Accuracy(),
    "loss": Loss(criterion)
}

train_evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device)
val_evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=device)

In [8]:
def train_step(engine, batch):
    model.train()
    optimizer.zero_grad()
    x, y = batch[0].to(device), batch[1].to(device)
    y_pred = model(x)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()
    return loss.item()

trainer = Engine(train_step)

def validation_step(engine, batch):
    model.eval()
    with torch.no_grad():
        x, y = batch[0].to(device), batch[1].to(device)
        y_pred = model(x)
        return y_pred, y

train_evaluator = Engine(validation_step)
val_evaluator = Engine(validation_step)

# Attach metrics to the evaluators
for name, metric in val_metrics.items():
    metric.attach(train_evaluator, name)

for name, metric in val_metrics.items():
    metric.attach(val_evaluator, name)


In [9]:
# How many batches to wait before logging training status
log_interval = 100

In [10]:
@trainer.on(Events.EPOCH_COMPLETED(every=log_interval))
def log_training_loss(engine):
    print(f"Epoch[{engine.state.epoch}], Iter[{engine.state.iteration}] Loss: {engine.state.output:.2f}")


In [11]:
@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(trainer):
    train_evaluator.run(train_loader)
    metrics = train_evaluator.state.metrics
    print(f"Training Results - Epoch[{trainer.state.epoch}] Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}")


@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(trainer):
    val_evaluator.run(val_loader)
    metrics = val_evaluator.state.metrics
    print(f"Validation Results - Epoch[{trainer.state.epoch}] Avg accuracy: {metrics['accuracy']:.2f} Avg loss: {metrics['loss']:.2f}")


In [14]:
# Score function to return current value of any metric we defined above in val_metrics
def score_function(engine):
    return engine.state.metrics["accuracy"]

# Checkpoint to store n_saved best models wrt score function
model_checkpoint = ModelCheckpoint(
    "checkpoint",
    n_saved=2,
    filename_prefix="best",
    score_function=score_function,
    score_name="accuracy",
    global_step_transform=global_step_from_engine(trainer), # helps fetch the trainer's state
)
  
# Save the model after every epoch of val_evaluator is completed
val_evaluator.add_event_handler(Events.COMPLETED, model_checkpoint, {"model": model})


<ignite.engine.events.RemovableEventHandle at 0x7ff025249ad0>

In [15]:
# Define a Tensorboard logger
tb_logger = TensorboardLogger(log_dir="tb-logger")

# Attach handler to plot trainer's loss every 100 iterations
tb_logger.attach_output_handler(
    trainer,
    event_name=Events.EPOCH_COMPLETED(every=100),
    tag="training",
    output_transform=lambda loss: {"batch_loss": loss},
)

# Attach handler for plotting both evaluators' metrics after every epoch completes
for tag, evaluator in [("training", train_evaluator), ("validation", val_evaluator)]:
    tb_logger.attach_output_handler(
        evaluator,
        event_name=Events.EPOCH_COMPLETED,
        tag=tag,
        metric_names="all",
        global_step_transform=global_step_from_engine(trainer),
    )


In [16]:
trainer.run(train_loader, max_epochs=10)

Training Results - Epoch[1] Avg accuracy: 0.95 Avg loss: 0.18
Validation Results - Epoch[1] Avg accuracy: 0.95 Avg loss: 0.16
Training Results - Epoch[2] Avg accuracy: 0.97 Avg loss: 0.11
Validation Results - Epoch[2] Avg accuracy: 0.97 Avg loss: 0.09
Training Results - Epoch[3] Avg accuracy: 0.98 Avg loss: 0.07
Validation Results - Epoch[3] Avg accuracy: 0.98 Avg loss: 0.06
Training Results - Epoch[4] Avg accuracy: 0.98 Avg loss: 0.07
Validation Results - Epoch[4] Avg accuracy: 0.98 Avg loss: 0.06
Training Results - Epoch[5] Avg accuracy: 0.98 Avg loss: 0.06
Validation Results - Epoch[5] Avg accuracy: 0.98 Avg loss: 0.05
Training Results - Epoch[6] Avg accuracy: 0.99 Avg loss: 0.04
Validation Results - Epoch[6] Avg accuracy: 0.99 Avg loss: 0.04
Training Results - Epoch[7] Avg accuracy: 0.99 Avg loss: 0.04
Validation Results - Epoch[7] Avg accuracy: 0.98 Avg loss: 0.05
Training Results - Epoch[8] Avg accuracy: 0.99 Avg loss: 0.04
Validation Results - Epoch[8] Avg accuracy: 0.98 Avg los

State:
	iteration: 37500
	epoch: 10
	epoch_length: 3750
	max_epochs: 10
	output: 0.0030934270471334457
	batch: <class 'list'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

In [19]:
tb_logger.close()

%load_ext tensorboard

%tensorboard --logdir=.

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 20328), started 0:01:45 ago. (Use '!kill 20328' to kill it.)