In [1]:
import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
from torchvision import models

from skorch import NeuralNetClassifier
from skorch.helper import predefined_split


imagenet으로 사전학습된 모델을 cifar100으로 finetuning

In [2]:
from torchvision import transforms, datasets
def get_cifar100_DS(path, input_size=224):
    data_transforms = {
        'train': transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.5071, 0.4867, 0.4408], [0.2675, 0.2565, 0.2761])
        ]),
        'val': transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.5071, 0.4867, 0.4408], [0.2675, 0.2565, 0.2761])
        ]),
    }
    imagenet_data_train = datasets.CIFAR100(path, train=True, download=True, transform=data_transforms['train'])
    imagenet_data_val = datasets.CIFAR100(path, train=False, download=True, transform=data_transforms['val'])

    dataset_dict = {
    'train' : imagenet_data_train,
    'val' : imagenet_data_val
    }
    return dataset_dict

cifar100_dict = get_cifar100_DS(path='./../data/')

Files already downloaded and verified
Files already downloaded and verified


# full-finetuning

In [3]:
resnet = models.resnet18(weights=models.ResNet18_Weights)
resnet.fc = nn.Linear(512, 100)



In [19]:
# 시간이 오래 걸리므로 10 epoch만 돌려보기로 함
import time
start_time = time.time()
net = NeuralNetClassifier(
    resnet,
    max_epochs=10, lr=0.001, device='cuda', optimizer=torch.optim.Adam, batch_size=128, criterion=nn.CrossEntropyLoss, train_split=predefined_split(cifar100_dict['val']))
net.fit(cifar100_dict['train'], y=None)
end_time = time.time()
print('Time taken: ', end_time-start_time, ' seconds')

  epoch    train_loss    valid_acc    valid_loss       dur
-------  ------------  -----------  ------------  --------
      1        [36m2.7961[0m       [32m0.4760[0m        [35m1.9346[0m  116.4469
      2        [36m2.1125[0m       [32m0.5311[0m        [35m1.7282[0m  116.4275
      3        [36m1.8879[0m       [32m0.5876[0m        [35m1.4504[0m  116.3372
      4        [36m1.7301[0m       [32m0.6175[0m        [35m1.3658[0m  116.4313
      5        [36m1.6119[0m       [32m0.6388[0m        [35m1.2496[0m  116.5739
      6        [36m1.5231[0m       [32m0.6484[0m        1.2669  116.7009
      7        [36m1.4475[0m       [32m0.6591[0m        [35m1.2176[0m  116.3685
      8        [36m1.3881[0m       [32m0.6772[0m        [35m1.1492[0m  116.3395
      9        [36m1.3249[0m       0.6507        1.3068  116.3075
     10        [36m1.2708[0m       0.6692        1.1951  117.5275
Time taken:  1165.5719316005707  seconds


# Linear probing

In [23]:
resnet = models.resnet18(weights=models.ResNet18_Weights)
resnet.fc = nn.Linear(512, 100)
# mlp층을 제외하고는 freeze
for param in resnet.parameters():
    param.requires_grad = False
for param in resnet.fc.parameters():
    param.requires_grad = True



In [24]:
import time
start_time = time.time()
net = NeuralNetClassifier(
    resnet,
    max_epochs=10, lr=0.001, device='cuda', optimizer=torch.optim.Adam, batch_size=128, criterion=nn.CrossEntropyLoss, train_split=predefined_split(cifar100_dict['val']))
net.fit(cifar100_dict['train'], y=None)
end_time = time.time()
print('Time taken: ', end_time-start_time, ' seconds')

  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m3.4145[0m       [32m0.4231[0m        [35m2.3412[0m  86.8040
      2        [36m2.7489[0m       [32m0.4628[0m        [35m2.0523[0m  86.8841
      3        [36m2.6000[0m       [32m0.4850[0m        [35m1.9316[0m  87.2739
      4        [36m2.5305[0m       [32m0.4937[0m        [35m1.8848[0m  87.0323
      5        [36m2.5024[0m       [32m0.5028[0m        [35m1.8457[0m  87.1622
      6        [36m2.4649[0m       [32m0.5075[0m        [35m1.8238[0m  87.0487
      7        [36m2.4503[0m       [32m0.5117[0m        [35m1.7934[0m  87.4497
      8        [36m2.4283[0m       [32m0.5159[0m        1.7960  87.1855
      9        [36m2.4144[0m       0.5147        [35m1.7796[0m  87.2174
     10        [36m2.4103[0m       [32m0.5189[0m        [35m1.7693[0m  87.7822
Time taken:  871.9291796684265  seconds


# SSF-ADA

In [19]:
resnet = models.resnet18(weights=models.ResNet18_Weights)
resnet.fc = nn.Linear(512, 100)



In [20]:
# ssf-ada modules
# nn.Module로 상속받아야
class ssf_ada(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(dim))
        self.shift = nn.Parameter(torch.zeros(dim))
        
        nn.init.normal_(self.scale, mean=1, std=0.02)
        nn.init.normal_(self.shift, mean=0, std=0.02)

    def forward(self, x):
        assert self.scale.shape == self.shift.shape
        if x.shape[-1] == self.scale.shape[0]:
            return x * self.scale + self.shift
        elif x.shape[1] == self.scale.shape[0]:
            return x * self.scale.view(1, -1, 1, 1) + self.shift.view(1, -1, 1, 1)
        else:
            raise ValueError('the input tensor shape does not match the shape of the scale factor.')

In [21]:
# ssf-ada와 mlp층을 제외하고는 freeze
for param in resnet.parameters():
    param.requires_grad = False
for param in resnet.fc.parameters():
    param.requires_grad = True
    
# 간단하게 각 convolution layer와 block에만 ssf-ada를 적용
resnet_finetune = nn.Sequential(resnet.conv1, ssf_ada(64),
                                resnet.bn1, resnet.relu, resnet.maxpool, 
                                resnet.layer1, ssf_ada(64),
                                resnet.layer2, ssf_ada(128),
                                resnet.layer3, ssf_ada(256),
                                resnet.layer4, ssf_ada(512),
                                resnet.avgpool, nn.Flatten(), resnet.fc)    


In [22]:
import time
start_time = time.time()
net = NeuralNetClassifier(
    resnet_finetune,
    max_epochs=10, lr=0.001, device='cuda', optimizer=torch.optim.Adam, batch_size=128, criterion=nn.CrossEntropyLoss, train_split=predefined_split(cifar100_dict['val']))
net.fit(cifar100_dict['train'], y=None)
end_time = time.time()
print('Time taken: ', end_time-start_time, ' seconds')

  epoch    train_loss    valid_acc    valid_loss       dur
-------  ------------  -----------  ------------  --------
      1        [36m3.2592[0m       [32m0.4780[0m        [35m1.9853[0m  110.0639
      2        [36m2.4548[0m       [32m0.5469[0m        [35m1.6360[0m  110.2602
      3        [36m2.2584[0m       [32m0.5751[0m        [35m1.5020[0m  109.8013
      4        [36m2.1578[0m       [32m0.5755[0m        1.5063  110.4893
      5        [36m2.1076[0m       [32m0.6093[0m        [35m1.3811[0m  111.5079
      6        [36m2.0797[0m       [32m0.6103[0m        [35m1.3678[0m  111.4631
      7        [36m2.0411[0m       0.5636        1.5740  111.4723
      8        [36m2.0273[0m       0.5683        1.5622  111.2941
      9        [36m1.9988[0m       [32m0.6114[0m        1.3827  111.5658
     10        [36m1.9799[0m       [32m0.6247[0m        [35m1.3355[0m  110.0025
Time taken:  1108.0351128578186  seconds


In [27]:
# activated parameters
for name, param in resnet_finetune.named_parameters():
    if param.requires_grad:
        print(name)

1.scale
1.shift
6.scale
6.shift
8.scale
8.shift
10.scale
10.shift
12.scale
12.shift
15.weight
15.bias


- 시간 자체는 큰 차이가 나지 않지만 훈련시키는 파라미터의 수는 압도적으로 적음
- 좀 더 정확한 실험을 위해서는 epoch을 늘려볼 필요가 있음