In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.optim as optim

In [None]:
torch.cuda.is_available() # Check whether a CUDA-capable GPU can be used

True

In [None]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device  # last expression, shown as the cell output

device(type='cuda')

이미지 데이터에 대한 데이터 전처리

In [None]:
# Per-channel statistics of the CIFAR-10 training images (RGB order).
# The previous constants (0.507, 0.487, 0.441) / (0.267, 0.256, 0.276) are the
# commonly quoted CIFAR-100 statistics, while this notebook loads CIFAR-10.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2470, 0.2435, 0.2616)

# Training-time preprocessing: random flip and padded random crop for
# augmentation, then tensor conversion and per-channel normalisation.
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

Dataset 불러오기

In [None]:
# Training split: uses the augmenting `transform` pipeline defined above.
train_dataset = datasets.CIFAR10(root='/data', train=True, download=True, transform=transform)

# Evaluation must be deterministic and must see un-augmented images, so the
# test split reuses the training pipeline with its random augmentations removed.
# Deriving it from `transform` keeps ToTensor/Normalize identical for both splits.
_RANDOM_AUGMENTATIONS = (transforms.RandomHorizontalFlip, transforms.RandomCrop)
test_transform = transforms.Compose(
    [t for t in transform.transforms if not isinstance(t, _RANDOM_AUGMENTATIONS)]
)
test_dataset = datasets.CIFAR10(root='/data', train=False, download=True, transform=test_transform)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 42985285.12it/s]


Extracting /data/cifar-10-python.tar.gz to /data
Files already downloaded and verified


Dataloader 생성


1.   batchsize = 128
2.   Trainset은 shuffle, Testset은 no-shuffle


In [None]:
# Mini-batch loaders (batch size 128, two worker processes each).
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=2,
)

하이퍼파라미터 설정

In [None]:
# Optimisation settings used by the SGD optimizers and training loops below.
epochs = 150          # number of passes over the training loader
learning_rate = 1e-2  # SGD step size
momentum = 0.9        # SGD momentum
weight_decay = 1e-4   # L2 penalty passed to SGD

Teacher model 생성 및 학습

In [None]:
class Teacher(nn.Module):
    """Ten-convolution CNN used as the distillation teacher.

    The feature extractor has four stages of 3x3 Conv -> BatchNorm -> ReLU
    layers (2, 2, 2 and 4 convolutions with 32, 64, 128 and 256 channels),
    each stage closed by a 2x2 max-pool. The classifier is two stacked linear
    layers (1024 -> 128 -> num_classes) with no activation in between.

    The first linear layer expects 1024 features, i.e. 256 channels x 2 x 2,
    which is what four poolings produce from a 32x32 input.

    Args:
        num_classes: Size of the output logit vector. Previously accepted but
            ignored (the output size was hard-coded to 10); it is now honoured.
            The default of 10 keeps existing checkpoints loadable.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # (in_channels, out_channels) of every convolution, grouped by stage.
        stages = [
            [(3, 32), (32, 32)],
            [(32, 64), (64, 64)],
            [(64, 128), (128, 128)],
            [(128, 256), (256, 256), (256, 256), (256, 256)],
        ]

        # Layers are appended in exactly the original order so that the
        # nn.Sequential indices (and therefore the state_dict keys) are unchanged.
        layers = []
        for stage in stages:
            for in_channels, out_channels in stage:
                layers.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
                layers.append(nn.BatchNorm2d(out_channels))
                layers.append(nn.ReLU())
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.features = nn.Sequential(*layers)

        self.fc_layers = nn.Sequential(
            nn.Linear(1024, 128),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return the class logits of shape (batch, num_classes) for images `x`."""
        x = self.features(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch dimension
        x = self.fc_layers(x)
        return x


# Build the teacher network on the selected device, together with the
# cross-entropy loss and the SGD optimizer that train it.
teacher = Teacher()
teacher = teacher.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    teacher.parameters(),
    lr=learning_rate,
    momentum=momentum,
    weight_decay=weight_decay,
)

# Train the teacher with plain cross-entropy for `epochs` passes over train_loader.
for epoch in range(epochs):
    teacher.train()
    # Running totals for the current epoch, kept as plain Python numbers.
    train_loss = 0
    correct = 0
    total = 0

    for idx, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        output = teacher(images)
        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # argmax over the class dimension; avoids the deprecated `.data` access.
        predicted = output.argmax(dim=1)
        total += labels.size(0)
        # `.item()` keeps `correct` an int, matching the evaluation loops.
        correct += (predicted == labels).sum().item()

        # Report the running mean loss and accuracy every 100 batches.
        if idx % 100 == 0:
            print('Epoch: {:3d} | Batch_idx: {:3d} |  Loss: {:.4f} | Acc: {:3.2f}%'.format(
                epoch, idx, train_loss / (idx + 1), 100. * correct / total))

print("============Training finished=============")


# Measure top-1 accuracy of the teacher on the test loader.
teacher.eval()  # evaluation mode

correct = 0
val_acc = 0
total = 0

with torch.no_grad():  # no gradients are needed for evaluation
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = teacher(images)
        predicted = torch.max(outputs, 1)[1]  # index of the largest logit per sample
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    val_acc = 100 * correct / total
    print('Accuracy on the test set: {}'.format(val_acc))


# Save a checkpoint: last epoch index, model weights, optimizer state and the
# loss of the last training batch that was executed.
teacher_checkpoint = {
    'epoch': epoch,
    'model_state_dict': teacher.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss.item(),
}
torch.save(teacher_checkpoint, '/content/teacher_model.pth')



Epoch:   0 | Batch_idx:   0 |  Loss: 2.3939 | Acc: 7.03%
Epoch:   0 | Batch_idx: 100 |  Loss: 1.7468 | Acc: 35.03%
Epoch:   0 | Batch_idx: 200 |  Loss: 1.5765 | Acc: 41.30%
Epoch:   0 | Batch_idx: 300 |  Loss: 1.4640 | Acc: 45.96%
Epoch:   1 | Batch_idx:   0 |  Loss: 1.1040 | Acc: 60.16%
Epoch:   1 | Batch_idx: 100 |  Loss: 0.9855 | Acc: 65.18%
Epoch:   1 | Batch_idx: 200 |  Loss: 0.9580 | Acc: 65.93%
Epoch:   1 | Batch_idx: 300 |  Loss: 0.9284 | Acc: 67.17%
Epoch:   2 | Batch_idx:   0 |  Loss: 0.7818 | Acc: 73.44%
Epoch:   2 | Batch_idx: 100 |  Loss: 0.7721 | Acc: 73.11%
Epoch:   2 | Batch_idx: 200 |  Loss: 0.7595 | Acc: 73.34%
Epoch:   2 | Batch_idx: 300 |  Loss: 0.7437 | Acc: 73.93%
Epoch:   3 | Batch_idx:   0 |  Loss: 0.6621 | Acc: 75.00%
Epoch:   3 | Batch_idx: 100 |  Loss: 0.6420 | Acc: 77.38%
Epoch:   3 | Batch_idx: 200 |  Loss: 0.6411 | Acc: 77.53%
Epoch:   3 | Batch_idx: 300 |  Loss: 0.6411 | Acc: 77.62%
Epoch:   4 | Batch_idx:   0 |  Loss: 0.5501 | Acc: 82.81%
Epoch:   4 | Ba

Student model 생성 및 학습 (Knowledge Distillation 미적용 baseline)

In [None]:
class Student(nn.Module):
    """Small two-convolution CNN used as the distillation student.

    Two blocks of 3x3 Conv(16 channels) -> BatchNorm -> ReLU -> 2x2 max-pool,
    followed by a single linear classifier. The classifier expects 1024
    features, i.e. 16 channels x 8 x 8, which is what two poolings produce
    from a 32x32 input.

    Args:
        num_classes: Size of the output logit vector. Previously accepted but
            ignored (the output size was hard-coded to 10); it is now honoured.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Layer order is unchanged so that state_dict keys stay the same.
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.fc_layers = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return the class logits of shape (batch, num_classes) for images `x`."""
        x = self.features(x)
        x = x.view(x.size(0), -1)  # flatten everything except the batch dimension
        x = self.fc_layers(x)
        return x


# Build the student network on the selected device, together with the
# cross-entropy loss and the SGD optimizer used for its baseline training.
student = Student()
student = student.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    student.parameters(),
    lr=learning_rate,
    momentum=momentum,
    weight_decay=weight_decay,
)

# Baseline: train the student with plain cross-entropy (no distillation).
for epoch in range(epochs):
    student.train()
    # Running totals for the current epoch, kept as plain Python numbers.
    train_loss = 0
    correct = 0
    total = 0

    for idx, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        output = student(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # argmax over the class dimension; avoids the deprecated `.data` access.
        predicted = output.argmax(dim=1)
        total += labels.size(0)
        # `.item()` keeps `correct` an int, matching the evaluation loops.
        correct += (predicted == labels).sum().item()

        # Report the running mean loss and accuracy every 100 batches.
        if idx % 100 == 0:
            print('Epoch: {:3d} | Batch_idx: {:3d} |  Loss: {:.4f} | Acc: {:3.2f}%'.format(
                epoch, idx, train_loss / (idx + 1), 100. * correct / total))

print("============Training finished=============")

# Measure top-1 accuracy of the baseline student on the test loader.
student.eval()  # evaluation mode

correct = 0
val_acc = 0
total = 0

with torch.no_grad():  # no gradients are needed for evaluation
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = student(images)
        predicted = torch.max(outputs, 1)[1]  # index of the largest logit per sample
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    val_acc = 100 * correct / total
    print('Accuracy on the test set: {}'.format(val_acc))

# Save a checkpoint: last epoch index, model weights, optimizer state and the
# loss of the last training batch that was executed.
student_checkpoint = {
    'epoch': epoch,
    'model_state_dict': student.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss.item(),
}
torch.save(student_checkpoint, '/content/student_model.pth')

NameError: ignored

Knowledge Distillation Train

*  Conv 10 layers 모델(Teacher) → Conv 2 layers 모델(Student)로 지식 증류





In [None]:
# Re-create the teacher and restore its trained weights from the checkpoint.
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
trained_teacher = Teacher().to(device)
model_ckp = torch.load('/content/teacher_model.pth', map_location=device)
trained_teacher.load_state_dict(model_ckp['model_state_dict'])

# Distillation is meant to start from an UNTRAINED student. The `student` and
# `optimizer` left over from the baseline cell are already trained / bound to
# that trained model, so both are re-created here.
student = Student().to(device)
optimizer = optim.SGD(student.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

# Knowledge-distillation hyperparameters.
# lambda_ weights the distillation term against the cross-entropy term,
# T is the softmax temperature applied to both teacher and student logits.
# NOTE(review): with lambda_ = 0.0001 the distillation term contributes very
# little to the total loss -- confirm this value is intentional.
lambda_ = 0.0001
T = 4.5

# 'batchmean' sums over classes and averages over the batch, which is the
# mathematically correct KL divergence; the default 'mean' also divides by the
# number of classes.
kl_div_loss = nn.KLDivLoss(reduction='batchmean')


# The teacher only provides targets: keep it in eval mode so its BatchNorm
# running statistics are not updated by the forward passes below.
trained_teacher.eval()

for epoch in range(epochs):
    # NOTE(review): `student` is whichever instance is currently bound to that
    # name. If the baseline cell has already trained it, distillation continues
    # from those weights unless the student is re-created first.
    student.train()
    # Running totals for the current epoch, kept as plain Python numbers.
    train_loss = 0
    correct = 0
    total = 0

    for idx, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)

        s_output = student(images)
        # Teacher logits are fixed targets; no autograd graph is needed for them.
        with torch.no_grad():
            t_output = trained_teacher(images)

        loss_SL = criterion(s_output, labels)  # standard supervised loss
        # KL divergence between the temperature-softened student and teacher distributions.
        loss_KD = kl_div_loss(F.log_softmax(s_output / T, dim=1),
                              F.softmax(t_output / T, dim=1))
        # total_loss = (1 - lambda) * loss_SL + lambda * T^2 * loss_KD
        loss = (1 - lambda_) * loss_SL + lambda_ * T * T * loss_KD

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # argmax over the class dimension; avoids the deprecated `.data` access.
        predicted = s_output.argmax(dim=1)
        total += labels.size(0)
        # `.item()` keeps `correct` an int, matching the evaluation loops.
        correct += (predicted == labels).sum().item()

        # Report the running mean loss and accuracy every 100 batches.
        if idx % 100 == 0:
            print('Epoch: {:3d} | Batch_idx: {:3d} |  Loss: {:.4f} | Acc: {:3.2f}%'.format(
                epoch, idx, train_loss / (idx + 1), 100. * correct / total))

print("============Training finished=============")


# Measure top-1 accuracy of the distilled student on the test loader.
student.eval()  # evaluation mode
with torch.no_grad():  # no gradients are needed for evaluation
    correct = 0
    val_acc = 0
    total = 0

    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = student(images)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Computed and printed once, after all batches. Previously these two lines
    # sat inside the loop and printed a running accuracy for every batch.
    val_acc = 100 * correct / total
    print('Accuracy on the test set: {}'.format(val_acc))


# Save a checkpoint of the distilled student: last epoch index, model weights,
# optimizer state and the loss of the last training batch that was executed.
kd_checkpoint = {
    'epoch': epoch,
    'model_state_dict': student.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss.item(),
}
torch.save(kd_checkpoint, '/content/KD_model.pth')


NameError: ignored