In [None]:
#!g1.1
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random
from IPython.display import clear_output

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
from torchvision import transforms
from copy import deepcopy
from torch.utils.data import DataLoader

# random.seed(42)
# np.random.seed(42)
# torch.manual_seed(42)
# if torch.cuda.is_available():
#     torch.cuda.manual_seed_all(42)

Загрузим датасет с большой неразмеченной частью

In [None]:
#!g1.1
from torchvision.datasets import STL10
from torchvision.transforms import ToTensor


trainDataset = STL10(root="data", split="train", download=True, transform=ToTensor())
trainDatasetWithUnlabeled = STL10(
    root="data", split="train+unlabeled", download=True, transform=ToTensor()
)
testDataset = STL10(root="data", split="test", download=True, transform=ToTensor())

Downloading http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz to data/stl10_binary.tar.gz


  0%|          | 0/2640397119 [00:00<?, ?it/s]

Extracting data/stl10_binary.tar.gz to data
Files already downloaded and verified
Files already downloaded and verified


In [None]:
#!g1.1
train = DataLoader(
    trainDataset,
    batch_size=128,
    shuffle=True,
)
trainUnl = DataLoader(
    trainDatasetWithUnlabeled,
    batch_size=128,
    shuffle=True,
)
val = DataLoader(
    testDataset,
    batch_size=128,
)
dataloaders = {
    "train": train,
    "trainUnl": trainUnl,
    "val": val
}
dataset_sizes = {
    "train": len(train.dataset),
    "trainUnl": len(trainUnl.dataset),
    "val": len(val.dataset)
}

Соберем дефолтную аугментацию

In [None]:
#!g1.1
def getDefaultAugmentation(imageSize=(224,224)):
    return nn.Sequential(*[
        transforms.Resize(size=imageSize),
        transforms.RandomApply(nn.ModuleList([
            transforms.ColorJitter(0.8, 0.8, 0.8, 0.2),
        ]), p=0.4),
        transforms.RandomGrayscale(p=0.2),
        transforms.RandomHorizontalFlip(),
        transforms.GaussianBlur((3, 3)),
        transforms.RandomResizedCrop(size=imageSize),
        transforms.Normalize(
            mean=torch.tensor([0.485, 0.456, 0.406]),
            std=torch.tensor([0.229, 0.224, 0.225]),
        ),
    ])

# Реализация BYOL

In [None]:
#!g1.1
class BYOL(nn.Module):
    def __init__(self,
                 reprModel = torchvision.models.resnet18(),
                 reprOutputSize = 1000,
                 projectionHidden = 4096,
                 projectionSize = 256,
                 augmentationModel = getDefaultAugmentation(),
                 predictionModel = nn.Sequential(*[
                        nn.Linear(256, 4096),
                        nn.ReLU(),
                        nn.Linear(4096, 256),
                 ]).cuda(),
                 beta = 0.9
    ):
        super(BYOL, self).__init__()
        self.augmentationModel = augmentationModel
        self.targetModel = deepcopy(reprModel)
        self.onlineModel = deepcopy(reprModel)
        self.projectionTargetModel = nn.Sequential(*[
            nn.Linear(reprOutputSize, projectionHidden),
            nn.ReLU(),
            nn.Linear(projectionHidden, projectionSize),
        ]).cuda()
        self.projectionOnlineModel = nn.Sequential(*[
            nn.Linear(reprOutputSize, projectionHidden),
            nn.ReLU(),
            nn.Linear(projectionHidden, projectionSize),
        ]).cuda()
        self.predictionModel = predictionModel
        self.beta = beta
    
    def forward(self, x, mode="target"):
        x = self.augmentationModel(x)
        x = self.targetModel(x) if mode == "target" else self.onlineModel(x)
        x = self.projectionTargetModel(x) if mode == "target" else self.projectionOnlineModel(x)
        return x
    
    def getPrediction(self, x):
        return self.predictionModel(x)

    # обновляем веса target моделей через online
    def updateTarget(self):
        for x, y in zip(self.targetModel.parameters(), self.onlineModel.parameters()):
            x.data = x * self.beta + y * (1 - self.beta)
        for x, y in zip(self.projectionTargetModel.parameters(), self.projectionOnlineModel.parameters()):
            x.data = x * self.beta + y * (1 - self.beta)

Train supervised

# Code

In [None]:
#!g1.1
t = nn.Sequential(*[
    transforms.Resize(size=(224, 224)),
    transforms.Normalize(
        mean=torch.tensor([0.485, 0.456, 0.406]),
        std=torch.tensor([0.229, 0.224, 0.225]),
    ),
])

def train_model(model, criterion, optimizer, scheduler, transform=False, num_epochs=25):
    train_loss = []
    val_loss = []
    train_acc = []
    val_acc = []
    since = time.time()

    best_model_wts = model.state_dict()
    best_acc = 0.0

    for epoch in range(num_epochs):
        clear_output(True)
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels = data

                inputs = inputs.cuda()
                if transform:
                    inputs = t(inputs)
                labels = labels.cuda()

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels).type(torch.float)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = model.state_dict()
        
        time_elapsed = time.time() - since
        print('Elapsed {:.0f}m {:.0f}s\n'.format(time_elapsed // 60, time_elapsed % 60))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [None]:
#!g1.1
from torchvision import models
import torch.optim as optim
import torchvision

from torchvision import datasets, models, transforms
from torch.optim import lr_scheduler
import time

# Supervised train

Дообучим resnet18 на размеченных данных

In [None]:
#!g1.1
model_ft = models.resnet18(pretrained=True)

for params in model_ft.parameters():
    params.requires_grad = False 

model_ft.fc = nn.Linear(512, 10)
params_to_train = model_ft.parameters()
model_ft = model_ft.cuda()

criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(params_to_train, lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [None]:
#!g1.1
model_ft_all = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, transform=True, num_epochs=25)

Epoch 0/24
----------


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


train Loss: 0.0136 Acc: 0.5020
val Loss: 0.0077 Acc: 0.8369
Elapsed 0m 29s

Epoch 1/24
----------
train Loss: 0.0061 Acc: 0.8738
val Loss: 0.0044 Acc: 0.9044
Elapsed 0m 57s

Epoch 2/24
----------
train Loss: 0.0040 Acc: 0.9122
val Loss: 0.0033 Acc: 0.9208
Elapsed 1m 25s

Epoch 3/24
----------
train Loss: 0.0033 Acc: 0.9190
val Loss: 0.0028 Acc: 0.9309
Elapsed 1m 53s

Epoch 4/24
----------
train Loss: 0.0027 Acc: 0.9294
val Loss: 0.0026 Acc: 0.9311
Elapsed 2m 21s

Epoch 5/24
----------
train Loss: 0.0026 Acc: 0.9316
val Loss: 0.0024 Acc: 0.9349
Elapsed 2m 49s

Epoch 6/24
----------
train Loss: 0.0024 Acc: 0.9404
val Loss: 0.0023 Acc: 0.9358
Elapsed 3m 17s

Epoch 7/24
----------
train Loss: 0.0025 Acc: 0.9358
val Loss: 0.0023 Acc: 0.9375
Elapsed 3m 45s

Epoch 8/24
----------
train Loss: 0.0023 Acc: 0.9394
val Loss: 0.0023 Acc: 0.9359
Elapsed 4m 13s

Epoch 9/24
----------
train Loss: 0.0023 Acc: 0.9402
val Loss: 0.0023 Acc: 0.9353
Elapsed 4m 41s

Epoch 10/24
----------
train Loss: 0.0023 

# BOYL train code

In [None]:
#!g1.1
import numpy as np

def train_BOYLmodel(model, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = model.state_dict()
    best_acc = 0.0

    for epoch in range(num_epochs):
        l = []
        clear_output(True)
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Iterate over data.
        for data in dataloaders["trainUnl"]:
            # get the inputs
            inputs, _ = data
            inputs = inputs.cuda()

            optimizer.zero_grad()

            outputs1 = model(inputs, mode="online")
            outputs2 = model(inputs, mode="target")
            # берем prediction для online модели
            predict1 = model.getPrediction(outputs1)
            diff = torch.norm((F.normalize(predict1, dim=-1) - F.normalize(outputs2, dim=-1)), dim=1) ** 2
            loss = torch.mean(diff)

            loss.backward()
            optimizer.step()
            l.append(loss.item())
        print("loss = ", np.mean(l))
        model.updateTarget()
    return model

# BOYL train process

In [None]:
#!g1.1
import itertools

In [None]:
#!g1.1
model_ft = models.resnet18(pretrained=True).cuda()
for params in model_ft.parameters():
    params.requires_grad = False 

model_ft.fc = nn.Linear(512, 1000)
byol = BYOL(reprModel=model_ft.cuda())

# оптимизируем только онлайн часть
optimizer_ft = optim.SGD(itertools.chain(*[
    byol.onlineModel.parameters(),
    byol.projectionOnlineModel.parameters(),
    byol.predictionModel.parameters()
]), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [14]:
#!g1.1
model_BYOL = train_BOYLmodel(byol, optimizer_ft, exp_lr_scheduler, num_epochs=8)

Epoch 7/7
----------
loss =  0.3200041124649373


# Linear evaluation protocol

На зафиксированных имбедах применяем линейную регрессию

In [15]:
#!g1.1
for params in model_BYOL.targetModel.parameters():
    params.requires_grad = False
for params in model_BYOL.projectionTargetModel.parameters():
    params.requires_grad = False
for params in model_BYOL.onlineModel.parameters():
    params.requires_grad = False
for params in model_BYOL.projectionOnlineModel.parameters():
    params.requires_grad = False
for params in model_BYOL.predictionModel.parameters():
    params.requires_grad = False

In [16]:
#!g1.1
emb_model = nn.Sequential(*[
#     model_BYOL.onlineModel,
    model_BYOL.targetModel,
    model_BYOL.projectionTargetModel,
#     model_BYOL.projectionOnlineModel,
    model_BYOL.predictionModel
])

lr_model = nn.Sequential(*[
    emb_model,
    nn.Linear(256, 10),
#     nn.Linear(1000, 10),
])
lr_model = lr_model.cuda()
params_to_train = lr_model.parameters()


In [17]:
#!g1.1
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.SGD(params_to_train, lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

Сложно получить заметные результаты на такой модели, так как нужны достаточно большие вычилительные ресурсы. Цитата из статьи:

**We use a batch size of 4096 split over 512 Cloud TPU v3 cores.
With this setup, training takes approximately 8 hours for a ResNet-50(×1)**

In [18]:
#!g1.1

lr_model = train_model(lr_model, criterion, optimizer_ft, exp_lr_scheduler, transform=True, num_epochs=25)

Epoch 24/24
----------
train Loss: 0.0183 Acc: 0.2182
val Loss: 0.0181 Acc: 0.2201
Elapsed 11m 53s

Training complete in 11m 53s
Best val Acc: 0.222000
