#### Setting up dataloader with persistent_workers set to True to reduce waiting at enumerate(train_loader) line

[source](https://discuss.pytorch.org/t/what-are-the-dis-advantages-of-persistent-workers/102110)

With this option set to False, every time your code hits a line like `for sample in dataloader:`, it will create a brand new set of workers to do the loading and will kill them on exit.
This means that if you have multiple dataloaders, the workers of each one will be killed as soon as you are done iterating over it.

If you make them persist, these workers will stay around (with their state) waiting for another call into that dataloader.

Setting this to True will improve performance when you call into the dataloader multiple times in a row (as creating the workers is expensive). But it also means that the dataloader will keep some persistent state even when it is not used (which can use some RAM depending on your dataset).

In [1]:
import torch
from torchvision import transforms, datasets
from util import TwoCropTransform, AverageMeter

# Configuration shared by the cells below.
batch_size = 512  # samples per batch, passed to the DataLoader and to the model summary
num_workers = 12  # number of DataLoader worker processes
img_size = 32  # side length (pixels) of the square images fed to the model

def set_loader(batch_size, num_workers, img_size):
    """Build the shuffled CIFAR-10 DataLoader used for contrastive training.

    Args:
        batch_size: number of samples per batch.
        num_workers: number of loader worker processes; 0 loads the data in
            the main process.
        img_size: side length (pixels) of the square images produced by the
            augmentation pipeline.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``datasets.CIFAR10`` rooted at
        ``../data`` (downloaded when missing), with shuffling and pinned
        memory enabled. Each image goes through ``TwoCropTransform`` wrapping
        the augmentation pipeline built here.
    """
    # presumably the CIFAR-10 per-channel statistics -- TODO confirm
    mean = (0.4914, 0.4822, 0.4465)
    std = (0.2023, 0.1994, 0.2010)
    normalize = transforms.Normalize(mean=mean, std=std)

    # Blur kernel must be odd; blur is disabled (p=0.0) for images of 32px or less.
    blur_kernel = img_size // 20 * 2 + 1
    blur_prob = 0.5 if img_size > 32 else 0.0

    # data augmentation
    train_transform = transforms.Compose([
        transforms.Resize(size=(img_size, img_size)),
        transforms.RandomResizedCrop(size=img_size, scale=(0.2, 1.)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomApply([
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)
        ], p=0.8),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomApply(
            [transforms.GaussianBlur(kernel_size=blur_kernel, sigma=(0.1, 2.0))],
            p=blur_prob),
        transforms.ToTensor(),
        normalize,
    ])

    train_dataset = datasets.CIFAR10(root='../data',
                                     transform=TwoCropTransform(train_transform),
                                     download=True)

    # DataLoader rejects persistent_workers=True when there are no worker
    # processes, so only keep workers alive when some are actually spawned.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=num_workers, pin_memory=True,
        persistent_workers=num_workers > 0)

    return train_loader

# Build the training loader once; it is reused by the cells below.
train_loader = set_loader(batch_size, num_workers, img_size)

### On SupConResNet architecture
#### ReLU(inplace=True)
inplace=True means that it will modify the input directly, without allocating any additional output. It can sometimes slightly decrease the memory usage, but may not always be a valid operation (because the original input is destroyed). However, if you don’t see an error, it means that your use case is valid. [source](https://discuss.pytorch.org/t/whats-the-difference-between-nn-relu-and-nn-relu-inplace-true/948/2)

In [3]:
from networks.resnet_extended import SupConResNet
from losses_negative_only import SupConLoss
import torch.backends.cudnn as cudnn
from torchinfo import summary

def set_model(temp=0.5):
    """Create the SupCon ResNet-18 model and its loss, on GPU when available.

    Args:
        temp: temperature forwarded to ``SupConLoss``.

    Returns:
        ``(model, criterion)``. When CUDA is available both are moved to the
        GPU, the encoder is wrapped in ``DataParallel`` if more than one
        device is visible, and cuDNN benchmark mode is switched on.
    """
    net = SupConResNet('resnet18')
    loss_fn = SupConLoss(temperature=temp)

    # CPU-only machine: nothing else to configure.
    if not torch.cuda.is_available():
        return net, loss_fn

    multi_gpu = torch.cuda.device_count() > 1
    if multi_gpu:
        net.encoder = torch.nn.DataParallel(net.encoder)

    net = net.cuda()
    loss_fn = loss_fn.cuda()
    cudnn.benchmark = True
    return net, loss_fn

# Instantiate the model and loss with the default temperature, then print a
# per-layer summary for an input batch of batch_size RGB images.
model, criterion = set_model()
summary(model, input_size=(batch_size, 3, img_size, img_size))

Layer (type:depth-idx)                        Output Shape              Param #
SupConResNet                                  [512, 128]                --
├─ResNet: 1-1                                 [512, 512]                --
│    └─Conv2d: 2-1                            [512, 64, 32, 32]         1,728
│    └─BatchNorm2d: 2-2                       [512, 64, 32, 32]         128
│    └─Sequential: 2-3                        [512, 64, 32, 32]         --
│    │    └─BasicBlock: 3-1                   [512, 64, 32, 32]         73,984
│    │    └─BasicBlock: 3-2                   [512, 64, 32, 32]         73,984
│    └─Sequential: 2-4                        [512, 128, 16, 16]        --
│    │    └─BasicBlock: 3-3                   [512, 128, 16, 16]        230,144
│    │    └─BasicBlock: 3-4                   [512, 128, 16, 16]        295,424
│    └─Sequential: 2-5                        [512, 256, 8, 8]          --
│    │    └─BasicBlock: 3-5                   [512, 256, 8, 8]          9

In [None]:
# Per-layer summary of the model for an input batch of 256 RGB images.
summary(model, input_size=(256, 3, img_size, img_size))

In [None]:
import torch.optim as optim

def set_optimizer(model, lr=0.5, momentum=0.9, weight_decay=1e-4):
    """Return an SGD optimizer over all parameters of ``model``.

    Args:
        model: module whose ``parameters()`` are optimized.
        lr: learning rate.
        momentum: SGD momentum factor.
        weight_decay: weight-decay coefficient.
    """
    sgd_options = {
        'lr': lr,
        'momentum': momentum,
        'weight_decay': weight_decay,
    }
    return optim.SGD(model.parameters(), **sgd_options)

# SGD optimizer over the model's parameters, using the default hyperparameters.
optimizer = set_optimizer(model)

In [None]:
import time

# Iterate once over the loader, printing every batch together with the time
# elapsed since the loop started.
time1 = time.time()
for idx, (images, labels) in enumerate(train_loader):
    time2 = time.time()
    # time1 is never reset, so the printed duration is cumulative, not per batch
    print(idx, (images, labels), time2-time1)

In [None]:
def train(train_loader:torch.utils.data.DataLoader, model:SupConResNet, criterion, optimizer, epoch:int):
    """
    one epoch training

    train_loader : yields (images, labels) batches in which images[0] and images[1] are the two augmented views of the same samples.
    model : the model to train (this function calls .train() on it); it is called with return_feat=True and must return (predictions, features).
    criterion : loss callable invoked as criterion(features, labels, target_labels=...); must return a 3-tuple whose first element is the scalar loss.
    optimizer : optimizer holding the model's parameters; stepped once per batch.
    epoch : index of the current epoch; only used in the progress log.

    returns : (average loss over the epoch, the trained model)
    """
    model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # never updated in this function; kept because the progress log prints it
    distill = AverageMeter()

    use_cuda = torch.cuda.is_available()
    # every label in 0..9 is passed as a target label; built once, not per batch
    target_labels = list(range(10))

    end = time.time()
    print("entering batch loop...")
    for idx, (images, labels) in enumerate(train_loader):
        data_time.update(time.time() - end)

        # stack the two augmented views along the batch dimension
        images = torch.cat([images[0], images[1]], dim=0)
        if use_cuda:
            print("copying images and labels to CUDA memory...")
            # rebind in place so the forward pass also works on CPU-only machines
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
        bsz = labels.shape[0] #batch size

        #forward pass
        print("forward pass to model...")
        _, features = model(images, return_feat=True)
        print("shape of images tensor : ", images.shape)

        # AsymSupCon loss: regroup the features so both views of a sample sit side by side
        f1, f2 = torch.split(features, [bsz, bsz], dim=0)
        features = torch.cat([f1.unsqueeze(1), f2.unsqueeze(1)], dim=1)
        loss, _, _ = criterion(features, labels, target_labels=target_labels)
        print("AsymSupCon loss : ",loss)

        # update metric
        losses.update(loss.item(), bsz)

        # SGD
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print info (flush=True replaces the former sys.stdout.flush())
        print('Train: [{0}][{1}/{2}]\t'
              'BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
              'DT {data_time.val:.3f} ({data_time.avg:.3f})\t'
              'loss {loss.val:.3f} ({loss.avg:.3f} {distill.avg:.3f})'.format(
               epoch, idx + 1, len(train_loader), batch_time=batch_time,
               data_time=data_time, loss=losses, distill=distill),
              flush=True)

    return losses.avg, model

# train for one epoch
epoch = 0  # defined explicitly: the log line below reads it as well as train()
time1 = time.time()
loss, model = train(train_loader, model, criterion, optimizer, epoch=epoch)
time2 = time.time()
print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))