### **Import and Model**

In [None]:
import os
import torch
import torch.optim as optim
from torch.utils.data import Dataset
import torchvision
from torchvision import datasets
from torchvision import transforms
import torch.nn as nn
from torchvision.transforms import ToTensor
from model import classification_pvt

### **CIFAR100**

In [None]:
# Number of samples per mini-batch, shared by both loaders below.
batch_size = 128

# Preprocessing applied to every image: resize to 224x224, convert to a
# tensor, then normalize each of the three channels with mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# CIFAR-100 training split (downloaded into ./data on first use).
trainset = torchvision.datasets.CIFAR100(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
# Shuffled every epoch; the trailing incomplete batch is discarded.
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    drop_last=True,
)

# CIFAR-100 test split, same preprocessing as training.
testset = torchvision.datasets.CIFAR100(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
# Fixed order. drop_last=True also discards the trailing incomplete test
# batch, so evaluation does not see every test image.
# NOTE(review): presumably required because the model is built with a fixed
# batch size -- confirm against the model definition.
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    drop_last=True,
)

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Build the classifier and move its parameters to the selected device.
# NOTE(review): the positional arguments look like (channels, height, width,
# batch size, number of classes) -- confirm against the model module.
net = classification_pvt(3, 224, 224, batch_size, 100)
net.to(device)

# Multi-class classification objective.
loss_fn = nn.CrossEntropyLoss()

# AdamW with a small learning rate and near-zero weight decay.
# For reference, the hyperparameters specified in the paper are
# lr=1e-3, betas=[0.9, 0.999], weight_decay=5e-2.
optimizer = optim.AdamW(
    net.parameters(),
    lr=5e-5,
    betas=[0.9, 0.999],
    weight_decay=1e-8,
)

# Cosine learning-rate annealing with a period of 100 scheduler steps.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Train for 100 epochs. After every epoch, evaluate on the test loader and
# print the mean training loss and test accuracy; every 5th epoch, write a
# checkpoint of the model weights into `save_path`.
save_path = '../ckpt_cifar100/'
# torch.save does not create missing directories, so make sure it exists
# before the first checkpoint is written.
os.makedirs(save_path, exist_ok=True)

# Debugging aid: enabled once, before any forward pass, so that every backward
# pass has the forward traces it needs. Anomaly detection slows training
# noticeably; switch it off for full-length runs.
torch.autograd.set_detect_anomaly(True)

for epoch in range(100):
    # ---- training phase ----
    net.train()  # enable training-mode behaviour of layers such as dropout
    loss_train = 0.0
    for images, labels in trainloader:
        # Use the device selected earlier instead of hard-coding CUDA, so the
        # script also runs on CPU-only machines.
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        pred = net(images)
        loss = loss_fn(pred, labels)
        loss.backward()
        optimizer.step()

        loss_train += loss.item()

    # One scheduler step per epoch.
    scheduler.step()

    if (epoch + 1) % 5 == 0:
        torch.save(net.state_dict(), os.path.join(save_path, f'{epoch+1}.pth'))

    # ---- evaluation phase ----
    net.eval()  # evaluation-mode behaviour; no effect if the model has no such layers
    correct_pred = 0
    total_pred = 0
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(device), labels.to(device)
            pred = net(images)
            predictions = pred.argmax(dim=1)
            correct_pred += (predictions == labels).sum().item()
            # Count the samples actually evaluated rather than assuming the
            # dataset size and batch layout.
            total_pred += labels.size(0)

    # Mean loss over the batches actually iterated this epoch.
    print(f'Epoch {epoch + 1} -- loss: {loss_train / len(trainloader)}')
    print(f'---------- testing accuracy: {correct_pred / total_pred}')
    print('#################################################')

print('Training completed')