In [1]:
import math

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchvision import transforms
from torchvision.datasets import MNIST

import colossalai
from colossalai.core import global_context as gpc
from colossalai.utils import MultiTimer, get_dataloader
from colossalai.logging import get_dist_logger
from colossalai.trainer import Trainer, hooks

Colossalai should be built with cuda extension to use the FP16 optimizer


In [2]:
def get_mnist_data():
    """Build the MNIST training and test dataloaders.

    Both splits are stored under ``./tmp/`` (requested with ``download=True``),
    resized to 32x32 and converted to tensors. The batch size is taken from
    ``gpc.config.BATCH_SIZE``.

    Returns:
        tuple: ``(train_dataloader, test_dataloader)``.
    """
    def _load_split(is_train):
        # every split gets its own, identically configured, transform pipeline
        preprocess = transforms.Compose([
            transforms.Resize((32, 32)),
            transforms.ToTensor(),
        ])
        return MNIST(root='./tmp/', train=is_train, download=True,
                     transform=preprocess)

    train_set = _load_split(True)
    held_out_set = _load_split(False)

    # options shared by both loaders
    loader_options = dict(batch_size=gpc.config.BATCH_SIZE,
                          num_workers=1,
                          pin_memory=True)

    # the training loader shuffles; the test loader is built without a sampler
    train_loader = get_dataloader(dataset=train_set, shuffle=True,
                                  **loader_options)
    test_loader = get_dataloader(dataset=held_out_set, add_sampler=False,
                                 **loader_options)

    return train_loader, test_loader

In [3]:
class LeNet5(nn.Module):
    """LeNet-5 style convolutional classifier for single-channel images.

    The feature extractor stacks three 5x5 convolutions (6, 16 and 120 output
    channels) with Tanh activations and 2x2 average pooling after the first
    two. For a 1x32x32 input the spatial size shrinks 32 -> 28 -> 14 -> 10 ->
    5 -> 1, so the flattened feature vector has exactly the 120 entries the
    classifier's first linear layer expects.

    Args:
        n_classes: number of output classes (width of the final linear layer).
    """

    def __init__(self, n_classes):
        super().__init__()

        self.feature_extractor = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2),
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.Tanh(),
            nn.AvgPool2d(kernel_size=2),
            nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1),
            nn.Tanh()
        )

        self.classifier = nn.Sequential(
            nn.Linear(in_features=120, out_features=84),
            nn.Tanh(),
            nn.Linear(in_features=84, out_features=n_classes),
        )

    def forward(self, x):
        """Return the raw class logits for a batch of images.

        No softmax is applied: the result is un-normalised scores, so callers
        that need probabilities must apply ``softmax`` over ``dim=1``
        themselves.

        Args:
            x: input batch; the layer sizes above imply ``(N, 1, 32, 32)``.

        Returns:
            Tensor of shape ``(N, n_classes)`` holding the logits.
        """
        x = self.feature_extractor(x)
        # keep the batch dimension, collapse channels and spatial dims
        x = torch.flatten(x, 1)
        return self.classifier(x)

In [5]:
def train(train_dataloader, test_dataloader,
          num_epochs, optim_type='SGD',
          lr_start=1, lr_low=1e-5, lr_high=10,
          lr_sch='None', lr_sch_params=None,
          eval=False):
    """Train a fresh LeNet5 on the given dataloaders with a Colossal-AI trainer.

    Args:
        train_dataloader: dataloader used for training.
        test_dataloader: dataloader used for the per-epoch test pass and for
            the optional final evaluation.
        num_epochs: number of epochs to train for. It also sizes the
            'RangeTest' and 'OneCycle' schedules and is part of the
            tensorboard log directory name.
        optim_type: 'SGD' (momentum 0.9, weight decay 5e-4) or 'Adam'.
        lr_start: initial learning rate handed to the optimizer.
        lr_low: lowest learning rate of the 'RangeTest' sweep.
        lr_high: highest learning rate of the 'RangeTest' sweep.
        lr_sch: 'RangeTest', 'MultiStep', 'OneCycle'; any other value
            (including the default 'None') keeps the learning rate constant.
        lr_sch_params: dict of scheduler options. 'MultiStep' reads
            ``'milestones'``; 'OneCycle' reads ``'max_lr'`` and
            ``'final_div_factor'``.
        eval: when true, run the trained model over ``test_dataloader`` once
            more and print the accuracy.

    Raises:
        ValueError: if ``optim_type`` is neither 'SGD' nor 'Adam'.
    """
    ###===== Build Model =====###
    model = LeNet5(n_classes=10)

    ###===== Set Loss =====###
    criterion = torch.nn.CrossEntropyLoss()

    ###===== Set Optimizer =====###
    if optim_type == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=lr_start,
                                    momentum=0.9,
                                    weight_decay=5e-4)
    elif optim_type == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr_start,
                                     betas=(0.9, 0.999),
                                     eps=1e-9)
    else:
        # fail here with a clear message instead of an unbound `optimizer` later
        raise ValueError(
            f"Unsupported optim_type {optim_type!r}; expected 'SGD' or 'Adam'")

    ###===== Learning Rate Setting =====###
    # Schedule lengths are derived from the `num_epochs` argument so that the
    # schedule, the log directory name and the actual training length agree.
    steps_per_epoch = len(train_dataloader)
    total_steps = steps_per_epoch * num_epochs

    def exp_increase_lr(batch):
        ''' exponentially increase learning rate from low to high

        LambdaLR multiplies the optimizer's initial learning rate (lr_start)
        by the returned value, so the target learning rate is divided by
        lr_start to obtain the multiplicative factor.
        '''
        low = math.log2(lr_low)
        high = math.log2(lr_high)

        index = low + (high - low) * batch / total_steps
        target_lr = 2 ** index

        return target_lr / lr_start

    ##=== set lr scheduler ===##
    if lr_sch == 'RangeTest':
        lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                                   lr_lambda=exp_increase_lr)
    elif lr_sch == 'MultiStep':
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                      milestones=lr_sch_params['milestones'])
    elif lr_sch == 'OneCycle':
        lr_scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                                     max_lr=lr_sch_params['max_lr'],
                                                     final_div_factor=lr_sch_params['final_div_factor'],
                                                     steps_per_epoch=steps_per_epoch,
                                                     epochs=num_epochs)
    else:
        # no milestones -> the learning rate never changes
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer,
                                                      milestones=[])

    ###===== Training =====###
    timer = MultiTimer()
    logger = get_dist_logger()

    engine, train_dataloader, test_dataloader, _ = \
        colossalai.initialize(model, optimizer, criterion,
                              train_dataloader, test_dataloader)

    ##=== create a trainer object ===##
    trainer = Trainer(engine=engine, timer=timer,
                      logger=logger)

    ##=== define the hooks to attach to the trainer ===##
    if lr_sch == 'RangeTest':
        log_dir = './tb_logs/' + f'{optim_type}_{lr_low}_{lr_high}_{num_epochs}'
    else:
        log_dir = './tb_logs/' + f'{optim_type}_{lr_sch}_{num_epochs}'
    hook_list = [
        hooks.LossHook(),
        # 'RangeTest' and 'OneCycle' are defined per iteration, so only the
        # epoch-based schedules are stepped once per epoch
        hooks.LRSchedulerHook(lr_scheduler=lr_scheduler,
                              by_epoch=lr_sch in ['MultiStep', 'None']),

        hooks.LogMetricByEpochHook(logger),
        hooks.LogMemoryByEpochHook(logger),
        hooks.LogTimingByEpochHook(timer, logger),

        hooks.TensorboardHook(log_dir=log_dir, ranks=[0]),
        # hooks.SaveCheckpointHook(checkpoint_dir='./ckpt')
    ]

    ##=== start training ===##
    trainer.fit(train_dataloader=train_dataloader,
                epochs=num_epochs,
                test_dataloader=test_dataloader,
                test_interval=1,
                hooks=hook_list,
                display_progress=True)

    ##=== evaluating ===##
    if eval:
        GTs, PRs = [], []
        # inference only: skip building autograd graphs for the test batches
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                logits = trainer.engine._model(inputs.to(torch.device('cuda')))
                PRs.append(logits.argmax(-1).cpu())
                GTs.append(labels)
        GTs = torch.cat(GTs, dim=0)
        PRs = torch.cat(PRs, dim=0)

        accu = PRs.eq(GTs).sum().item() / len(PRs)

        print('Accuracy on Testing Dataset: {:.3f}%'.format(accu * 100))

In [6]:
###===== Set Configuration =====###
# These entries are handed to colossalai.launch and are read back elsewhere
# in this notebook as gpc.config.<KEY>.
batch_size, num_epochs = 128, 30
config = dict(BATCH_SIZE=batch_size, NUM_EPOCHS=num_epochs)

# single-process launch: rank 0 in a world of size 1 on localhost
colossalai.launch(
    config=config,
    rank=0,
    world_size=1,
    host='127.0.0.1',
    port=1234,
)

###===== Load Dataset =====###
train_dataloader, test_dataloader = get_mnist_data()

colossalai - root - 2022-04-01 23:24:33,543 INFO: Added key: store_based_barrier_key:1 to store for rank: 0
colossalai - root - 2022-04-01 23:24:33,545 INFO: Added key: store_based_barrier_key:2 to store for rank: 0
colossalai - root - 2022-04-01 23:24:33,546 INFO: Added key: store_based_barrier_key:3 to store for rank: 0
colossalai - root - 2022-04-01 23:24:33,548 INFO: Added key: store_based_barrier_key:4 to store for rank: 0
colossalai - colossalai - 2022-04-01 23:24:33,555 INFO: process rank 0 is bound to device 0
colossalai - colossalai - 2022-04-01 23:24:33,557 INFO: initialized seed on rank 0, numpy: 1024, python random: 1024, ParallelMode.DATA: 1024, ParallelMode.TENSOR: 1024,the default parallel seed is ParallelMode.DATA.
colossalai - colossalai - 2022-04-01 23:24:33,558 INFO: Distributed environment is initialized, data parallel size: 1, pipeline parallel size: 1, tensor parallel size: 1


---
#### LR Range Test
1. SGD
2. Adam

In [None]:
###===== LR Range Test (SGD) =====###
# Sweep the SGD learning rate exponentially from 1e-6 up to 1.18.
# lr_start stays at 1 so the scheduler's multiplier equals the actual LR.
train(
    train_dataloader,
    test_dataloader,
    num_epochs=10,
    optim_type='SGD',
    lr_start=1,
    lr_low=1e-6,
    lr_high=1.18,
    lr_sch='RangeTest',
)

In [None]:
###===== LR Range Test (Adam) =====###
# Sweep the Adam learning rate exponentially from 1e-8 up to 0.12.
# lr_start stays at 1 so the scheduler's multiplier equals the actual LR.
train(
    train_dataloader,
    test_dataloader,
    num_epochs=10,
    optim_type='Adam',
    lr_start=1,
    lr_low=1e-8,
    lr_high=0.12,
    lr_sch='RangeTest',
)

---
#### Training (SGD)
1. No Scheduling
2. MultiStep
3. OneCycle

In [None]:
###===== No Scheduling =====###
# Baseline: SGD with a constant learning rate of 0.1 ('None' selects the
# scheduler without milestones inside train()).
train(
    train_dataloader,
    test_dataloader,
    num_epochs=30,
    optim_type='SGD',
    lr_start=0.1,
    lr_sch='None',
)

In [8]:
###===== MultiStep LR Scheduler =====###
# SGD starting at 0.1, with the learning rate stepped at epochs 15 and 25;
# eval=True prints the test accuracy once training has finished.
train(
    train_dataloader,
    test_dataloader,
    num_epochs=30,
    optim_type='SGD',
    lr_start=0.1,
    lr_sch='MultiStep',
    lr_sch_params={'milestones': [15, 25]},
    eval=True,
)

colossalai - colossalai - 2022-04-01 23:25:10,500 INFO: 
{'BATCH_SIZE': 128, 'NUM_EPOCHS': 30}

colossalai - colossalai - 2022-04-01 23:25:10,501 INFO: cuDNN benchmark = True, deterministic = False
colossalai - colossalai - 2022-04-01 23:25:14,196 INFO: Using LossHook for training, priority = 0
colossalai - colossalai - 2022-04-01 23:25:14,198 INFO: Using LRSchedulerHook for training, priority = 1
colossalai - colossalai - 2022-04-01 23:25:14,198 INFO: Using LogMetricByEpochHook for training, priority = 10
colossalai - colossalai - 2022-04-01 23:25:14,199 INFO: Using LogMemoryByEpochHook for training, priority = 10
colossalai - colossalai - 2022-04-01 23:25:14,200 INFO: Using LogTimingByEpochHook for training, priority = 10
colossalai - colossalai - 2022-04-01 23:25:14,201 INFO: Using TensorboardHook for training, priority = 10
colossalai - colossalai - 2022-04-01 23:25:14,201 INFO: Lower value means higher priority for calling hook function
colossalai - colossalai - 2022-04-01 23:25:1

Accuracy on Testing Dataset: 99.270%


In [9]:
###===== OneCycle LR Scheduler =====###
# SGD with a one-cycle schedule peaking at 0.1. lr_start is left at its
# default; the schedule is configured through max_lr / final_div_factor.
# eval=True prints the test accuracy once training has finished.
train(
    train_dataloader,
    test_dataloader,
    num_epochs=30,
    optim_type='SGD',
    lr_sch='OneCycle',
    lr_sch_params={
        'max_lr': 0.1,
        'final_div_factor': 4,
    },
    eval=True,
)

colossalai - colossalai - 2022-04-01 23:29:04,915 INFO: 
{'BATCH_SIZE': 128, 'NUM_EPOCHS': 30}

colossalai - colossalai - 2022-04-01 23:29:04,916 INFO: cuDNN benchmark = True, deterministic = False
colossalai - colossalai - 2022-04-01 23:29:04,922 INFO: Using LossHook for training, priority = 0
colossalai - colossalai - 2022-04-01 23:29:04,922 INFO: Using LRSchedulerHook for training, priority = 1
colossalai - colossalai - 2022-04-01 23:29:04,923 INFO: Using LogMetricByEpochHook for training, priority = 10
colossalai - colossalai - 2022-04-01 23:29:04,924 INFO: Using LogMemoryByEpochHook for training, priority = 10
colossalai - colossalai - 2022-04-01 23:29:04,925 INFO: Using LogTimingByEpochHook for training, priority = 10
colossalai - colossalai - 2022-04-01 23:29:04,925 INFO: Using TensorboardHook for training, priority = 10
colossalai - colossalai - 2022-04-01 23:29:04,926 INFO: Lower value means higher priority for calling hook function
colossalai - colossalai - 2022-04-01 23:29:0

Accuracy on Testing Dataset: 99.180%


In [None]:
# Load the TensorBoard notebook extension and open it on the tb_logs
# directory, which is where train() points its TensorboardHook.
%load_ext tensorboard
%tensorboard --logdir tb_logs