In [9]:
import torch
import argparse
import torch.backends.cudnn as cudnn
import torch.multiprocessing as mp
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data.distributed
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms, models
import horovod.torch as hvd
import os
import torch.nn as nn
import math
from tqdm import tqdm

In [10]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration. Later cells read these names as globals.
# ---------------------------------------------------------------------------

# Dataset folders, TensorBoard output directory and checkpoint file pattern
# (the pattern is filled in with a 1-based epoch number).
train_dir = 'Desktop/project-thesis/tiny-imagenet-200/train'
val_dir = 'Desktop/project-thesis/tiny-imagenet-200/val'
log_dir = './logs'
checkpoint_format = './checkpoint-{epoch}.pth.tar'

# Horovod communication options (consumed when the optimizer is wrapped).
fp16_allreduce = False           # True selects hvd.Compression.fp16
use_adasum = False               # True selects hvd.Adasum instead of hvd.Average
batches_per_allreduce = 1        # backward passes accumulated per optimizer step
gradient_predivide_factor = 1.0  # forwarded to hvd.DistributedOptimizer

# Batch sizes and schedule length.
batch_size = 64       # training sub-batch size
val_batch_size = 64   # validation batch size
epochs = 100          # total number of epochs
warmup_epochs = 5     # epochs over which the learning rate is ramped up

# SGD hyper-parameters.
base_lr = 1e-3   # learning rate before worker-count scaling
momentum = 0.9
wd = 5e-5        # weight decay

# Runtime switches.
no_cuda = True   # Set to True to disable CUDA
seed = 42        # seed for reproducibility


In [11]:
def adjust_learning_rate(epoch, batch_idx):
    """Set the learning rate of every optimizer param group for this batch.

    While ``epoch < warmup_epochs`` the multiplier grows linearly with the
    fractional epoch, reaching 1 at the end of warm-up. Afterwards it is 1
    until epoch 30, 0.1 until epoch 60, 0.01 until epoch 80 and 0.001 from
    then on.

    Args:
        epoch: zero-based index of the current epoch.
        batch_idx: zero-based index of the current batch within the epoch.

    Reads the globals ``warmup_epochs``, ``train_loader``, ``optimizer``,
    ``base_lr`` and ``batches_per_allreduce``.
    """
    # NOTE(review): the optimizer is created with lr_scaler == 1 when
    # use_adasum is set, but this function always multiplies by hvd.size();
    # confirm that is intended before enabling Adasum.
    world_size = hvd.size()

    if epoch < warmup_epochs:
        # Fractional epoch, so the ramp advances on every batch.
        progress = epoch + float(batch_idx + 1) / len(train_loader)
        lr_adj = 1. / world_size * (progress * (world_size - 1) / warmup_epochs + 1)
    else:
        # Step decay: first boundary the epoch falls below wins.
        lr_adj = 1e-3
        for boundary, factor in ((30, 1.), (60, 1e-1), (80, 1e-2)):
            if epoch < boundary:
                lr_adj = factor
                break

    new_lr = base_lr * world_size * batches_per_allreduce * lr_adj
    for group in optimizer.param_groups:
        group['lr'] = new_lr

In [12]:
def accuracy(output, target):
    """Return the fraction of samples whose highest-scoring class equals ``target``.

    Args:
        output: tensor of per-class scores with the classes along dim 1.
        target: tensor of class indices, one per sample.

    Returns:
        A 0-dim float tensor on the CPU holding the mean top-1 accuracy.
    """
    # max over dim 1 yields (values, indices); only the indices are needed.
    _, predicted = output.max(1, keepdim=True)
    hits = predicted == target.view_as(predicted)
    return hits.cpu().float().mean()

In [13]:

def save_checkpoint(epoch):
    """Save model and optimizer state for ``epoch``; only rank 0 writes.

    The file path is ``checkpoint_format`` filled in with ``epoch + 1``,
    i.e. checkpoints are numbered from 1.

    Args:
        epoch: zero-based index of the epoch that just finished.
    """
    if hvd.rank() != 0:
        return

    target_path = checkpoint_format.format(epoch=epoch + 1)
    torch.save(
        {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        },
        target_path,
    )


In [14]:
# Horovod: average metrics from distributed training.
class Metric:
    """Running mean of a scalar tensor that is allreduced across workers."""

    def __init__(self, name):
        # `name` doubles as the tensor name passed to hvd.allreduce.
        self.name = name
        self.sum = torch.tensor(0.)
        self.n = torch.tensor(0.)

    def update(self, val):
        """Allreduce ``val`` and add the result to the running sum."""
        local_value = val.detach().cpu()
        reduced = hvd.allreduce(local_value, name=self.name)
        self.sum += reduced
        self.n += 1

    @property
    def avg(self):
        """Accumulated sum divided by the number of ``update`` calls."""
        return self.sum / self.n

In [15]:

def train(epoch):
    """Run one training epoch over ``train_loader`` and log averaged metrics.

    Each loader batch is cut into sub-batches of ``batch_size`` samples whose
    gradients are accumulated before a single ``optimizer.step()``. Loss and
    accuracy are tracked with ``Metric`` and, when ``log_writer`` is set,
    written under ``train/loss`` and ``train/accuracy``.

    Args:
        epoch: zero-based index of the epoch to run.
    """
    model.train()
    # Tell the distributed sampler which epoch this is.
    train_sampler.set_epoch(epoch)
    loss_meter = Metric('train_loss')
    acc_meter = Metric('train_accuracy')

    progress = tqdm(total=len(train_loader),
                    desc='Train Epoch #{}'.format(epoch + 1),
                    disable=not verbose)
    with progress:
        for batch_idx, (data, target) in enumerate(train_loader):
            adjust_learning_rate(epoch, batch_idx)
            optimizer.zero_grad()

            # Walk over the loader batch in sub-batches of size batch_size.
            for start in range(0, len(data), batch_size):
                stop = start + batch_size
                inputs, labels = data[start:stop], target[start:stop]

                logits = model(inputs)
                acc_meter.update(accuracy(logits, labels))
                loss = F.cross_entropy(logits, labels)
                loss_meter.update(loss)

                # Divide by the number of sub-batches so that the accumulated
                # gradient is the average over sub-batches.
                loss.div_(math.ceil(float(len(data)) / batch_size))
                loss.backward()

            # Gradient is applied across all ranks
            optimizer.step()
            progress.set_postfix({'loss': loss_meter.avg.item(),
                                  'accuracy': 100. * acc_meter.avg.item()})
            progress.update(1)

    if log_writer:
        for tag, meter in (('train/loss', loss_meter),
                           ('train/accuracy', acc_meter)):
            log_writer.add_scalar(tag, meter.avg, epoch)


def validate(epoch):
    """Evaluate the model on ``val_loader`` and log averaged metrics.

    Runs with the model in eval mode and gradients disabled. Loss and accuracy
    are tracked with ``Metric`` and, when ``log_writer`` is set, written under
    ``val/loss`` and ``val/accuracy``.

    Args:
        epoch: zero-based index of the epoch being evaluated.
    """
    model.eval()
    loss_meter = Metric('val_loss')
    acc_meter = Metric('val_accuracy')

    progress = tqdm(total=len(val_loader),
                    desc='Validate Epoch  #{}'.format(epoch + 1),
                    disable=not verbose)
    with progress, torch.no_grad():
        for data, target in val_loader:
            logits = model(data)

            loss_meter.update(F.cross_entropy(logits, target))
            acc_meter.update(accuracy(logits, target))
            progress.set_postfix({'loss': loss_meter.avg.item(),
                                  'accuracy': 100. * acc_meter.avg.item()})
            progress.update(1)

    if log_writer:
        for tag, meter in (('val/loss', loss_meter),
                           ('val/accuracy', acc_meter)):
            log_writer.add_scalar(tag, meter.avg, epoch)

In [16]:
# Seed the RNG before the model is built: the layers draw their initial
# weights at construction time, and the only other torch.manual_seed(seed)
# call in this notebook runs in a later cell, after the model already exists.
torch.manual_seed(seed)

device = "cpu"
# Set up a standard ResNet-18. No pretrained weights are requested here, so
# the network is trained from its initial random weights.
model = models.resnet18()
# Adapt the head for the 200-class dataset: adaptive average pooling to a
# 1x1 feature map followed by a 200-way linear classifier.
model.avgpool = nn.AdaptiveAvgPool2d(1)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 200)
model = model.to(device)

In [17]:
# Per-channel mean/std normalisation applied after conversion to a tensor
# (these are the values torchvision documents for its ImageNet models).
channel_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

# Augmenting pipeline: random resized crop to 224, random horizontal flip,
# tensor conversion, then normalisation.
common_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    channel_normalize,
])

In [18]:
# Validation needs a deterministic pipeline: `common_transforms` applies a
# random crop and a random flip, which would make the reported validation
# loss/accuracy vary between runs and measure the model on augmented inputs.
# Use a fixed resize + centre crop with the same tensor conversion and
# normalisation instead.
val_transforms = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]
)

# Create datasets: augmented pipeline for training, deterministic pipeline
# for validation.
# NOTE(review): ImageFolder takes labels from sub-directory names; confirm
# that `val_dir` is organised into the same class folders as `train_dir`.
train_dataset = datasets.ImageFolder(train_dir, transform=common_transforms)
val_dataset = datasets.ImageFolder(val_dir, transform=val_transforms)

In [None]:
# Number of samples the training loader yields per optimizer step.
allreduce_batch_size = batch_size * batches_per_allreduce

hvd.init()
torch.manual_seed(seed)

# Look for the newest checkpoint on disk, trying the highest epoch number
# first. A value of 0 means no checkpoint was found and training starts
# from scratch.
resume_from_epoch = next(
    (candidate for candidate in range(epochs, 0, -1)
     if os.path.exists(checkpoint_format.format(epoch=candidate))),
    0,
)

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_tensor = torch.tensor(resume_from_epoch)
resume_from_epoch = hvd.broadcast(resume_tensor, root_rank=0,
                                  name='resume_from_epoch').item()

is_first_worker = hvd.rank() == 0

# Horovod: print logs on the first worker.
verbose = 1 if is_first_worker else 0

# Horovod: write TensorBoard logs on first worker.
log_writer = SummaryWriter(log_dir) if is_first_worker else None

# Horovod: limit # of CPU threads to be used per worker.
torch.set_num_threads(4)

# Options shared by both DataLoaders; no need for pin_memory on CPU.
kwargs = {'num_workers': 4, 'pin_memory': False}

# When supported, use 'forkserver' to spawn dataloader workers instead of 'fork'
uses_worker_processes = kwargs.get('num_workers', 0) > 0
if (uses_worker_processes
        and getattr(mp, '_supports_context', False)
        and 'forkserver' in mp.get_all_start_methods()):
    kwargs['multiprocessing_context'] = 'forkserver'


def _distributed_sampler(dataset):
    """Build a DistributedSampler for `dataset` using Horovod's size and rank."""
    return torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=hvd.size(), rank=hvd.rank())


train_sampler = _distributed_sampler(train_dataset)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=allreduce_batch_size,
    sampler=train_sampler,
    **kwargs)

val_sampler = _distributed_sampler(val_dataset)
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=val_batch_size,
    sampler=val_sampler,
    **kwargs)


# Learning-rate scale factor: 1 under Adasum, otherwise the number of
# accumulated batches times the number of Horovod workers.
lr_scaler = 1 if use_adasum else batches_per_allreduce * hvd.size()


# Horovod: scale learning rate by the number of workers.
sgd = optim.SGD(
    model.parameters(),
    lr=(base_lr * lr_scaler),
    momentum=momentum,
    weight_decay=wd,
)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none

# Reduction used to combine gradients across workers.
reduce_op = hvd.Adasum if use_adasum else hvd.Average

# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
    sgd,
    named_parameters=model.named_parameters(),
    compression=compression,
    backward_passes_per_step=batches_per_allreduce,
    op=reduce_op,
    gradient_predivide_factor=gradient_predivide_factor,
)

# Restore from a previous checkpoint if one was found (resume_from_epoch > 0).
# Only rank 0 reads the file; the broadcasts below pass the restored state to
# the other ranks.
if resume_from_epoch > 0 and hvd.rank() == 0:
    filepath = checkpoint_format.format(epoch=resume_from_epoch)
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

try:
    # One pass per epoch: train, evaluate, then checkpoint so that an
    # interrupted run can resume from the last completed epoch.
    for epoch in range(resume_from_epoch, epochs):
        train(epoch)
        validate(epoch)
        save_checkpoint(epoch)
finally:
    # Close the TensorBoard writer even if training is interrupted (e.g. by a
    # kernel interrupt) so that buffered scalars are flushed to disk.
    if log_writer:
        log_writer.close()


Train Epoch #4:  37%|███▋      | 584/1563 [1:41:08<143:18:56, 527.00s/it, loss=4.23, accuracy=11.5]