# Imports

In [3]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from os import listdir
from os.path import join
from torch.optim import SGD
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, models
from torchinfo import summary

# Seed

In [4]:
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    print(torch.cuda.get_device_name())

NVIDIA GeForce RTX 3060 Laptop GPU


# Constants

In [5]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATASET_PATH = 'data'

IMG_SIZE = (224, 224)
GLOBAL_MEAN, GLOBAL_STD = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
BATCH_SIZE = 128
VAL_SIZE = .1
EPOCHS = 2

# Dataset

In [6]:
class SportsDataset(Dataset):
    def __init__(self, data_path, label2id, is_train=True, transform=None):
        self.data_path = data_path
        self.is_train = is_train
        self.folder = join(data_path, 'train' if is_train else 'test')
        self.images = listdir(self.folder)
        self.dataframe = pd.read_csv(f'{self.folder}.csv')
        self.label_encoding = encoder
        self.transform = transform

    def __getitem__(self, idx):
        image_name = self.images[idx]
        with Image.open(join(self.folder, image_name)).convert('RGB') as image:
            image.load()

        if self.transform:
            image = self.transform(image)
        if self.is_train:
            y = self.dataframe[self.dataframe.image_id == image_name].label.item()
            return image, torch.from_numpy(self.label_encoding.transform([y])).type(torch.long)
        return image, image_name
        
    def __len__(self):
        return len(self.images)

In [7]:
unique_labels = sorted(set(pd.read_csv(join(DATASET_PATH, 'train.csv')).label))
NUM_CLASSES = len(unique_labels)

encoder = LabelEncoder()
encoder.fit(unique_labels)

In [8]:
transform = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.AutoAugment(),
    transforms.RandAugment(),
    transforms.ToTensor(),
    transforms.Normalize(GLOBAL_MEAN, GLOBAL_STD)
])

In [9]:
data = SportsDataset(DATASET_PATH, encoder, is_train=True, transform=transform)
train_data, val_data = random_split(data, [1 - VAL_SIZE, VAL_SIZE])

In [10]:
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

# Training

In [11]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.model = models.vit_l_32(weights='DEFAULT')
        
        for param in self.model.parameters():
            param.requires_grad = False

        self.model.heads = nn.Sequential(nn.Linear(1024, NUM_CLASSES))

    def forward(self, x):
        return self.model(x)

In [12]:
model = Model()
model.to(DEVICE)
summary(model)

Layer (type:depth-idx)                                                 Param #
Model                                                                  --
├─VisionTransformer: 1-1                                               1,024
│    └─Conv2d: 2-1                                                     (3,146,752)
│    └─Encoder: 2-2                                                    51,200
│    │    └─Dropout: 3-1                                               --
│    │    └─Sequential: 3-2                                            (302,309,376)
│    │    └─LayerNorm: 3-3                                             (2,048)
│    └─Sequential: 2-3                                                 --
│    │    └─Linear: 3-4                                                30,750
Total params: 305,541,150
Trainable params: 30,750
Non-trainable params: 305,510,400

In [148]:
from torchmetrics.classification import F1Score
micro_f1_score = F1Score(task="multiclass", average="micro", num_classes=NUM_CLASSES)

In [13]:
def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=EPOCHS):
    train_losses, val_losses = [], []
    for epoch in tqdm(range(num_epochs)):
        model.train()
        train_loss = val_loss = .0
        true_labels, pred_labels = [], []
        for X, y in tqdm(train_loader, leave=False):
            X = X.to(DEVICE)
            y = y.to(DEVICE).squeeze()
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, y)
            loss.backward()

            optimizer.step()

            train_loss += loss.item()
            
            true_labels.extend(y.cpu())
            pred_labels.extend(outputs.cpu().argmax(1))
            print(loss.item(), f1_score(true_labels, pred_labels, average='micro'))

        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)
        f1_train = f1_score(true_labels, pred_labels, average='micro')
        true_labels, pred_labels = [], []
        
        model.eval()
        with torch.no_grad():
            for X, y in tqdm(val_loader, leave=False):
                X = X.to(DEVICE)
                y = y.to(DEVICE).squeeze()
                outputs = model(X)
                loss = criterion(outputs, y)
                val_loss += loss.item()
                true_labels.extend(y.cpu())
                pred_labels.extend(outputs.cpu().argmax(1))

        val_loss /= len(val_loader.dataset)
        val_losses.append(val_loss)
        f1_val = f1_score(true_labels, pred_labels, average='micro')

        print(f"Epoch: {epoch + 1}, train loss: {train_loss:.4f},  val. loss: {val_loss:.4f},  f1_train: {f1_train},  f1_val: {f1_val}")

    return train_losses, val_losses

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9)

In [None]:
train_losses, val_losses = train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=EPOCHS)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/320 [00:00<?, ?it/s]

3.4289536476135254 0.0390625
3.4503109455108643 0.04296875
3.361067056655884 0.041666666666666664
3.4169652462005615 0.041015625
3.4375741481781006 0.040625
3.411011219024658 0.041666666666666664
3.3752267360687256 0.04241071428571429
3.383244514465332 0.046875
3.4282023906707764 0.046875
3.39577317237854 0.04609375
3.424251079559326 0.04474431818181819
3.369077444076538 0.046223958333333336
3.4201738834381104 0.04807692307692308
3.4292380809783936 0.04575892857142857
3.3706068992614746 0.04791666666666667
3.2952165603637695 0.05126953125
3.3427724838256836 0.05193014705882354
3.3242039680480957 0.055121527777777776
3.3037900924682617 0.05879934210526316
3.341883420944214 0.05859375
3.307598829269409 0.06063988095238095
3.3030197620391846 0.06321022727272728
3.299844741821289 0.06657608695652174
3.231071949005127 0.07063802083333333
3.25761079788208 0.0703125
3.2247300148010254 0.07271634615384616
3.2371840476989746 0.07523148148148148
3.2059414386749268 0.07784598214285714
3.229862213

  0%|          | 0/36 [00:00<?, ?it/s]

Epoch: 1, train loss: 0.0181,  val. loss: 0.0131,  f1_train: 0.4813399941228328,  f1_val: 0.6193519947101609


  0%|          | 0/320 [00:00<?, ?it/s]

1.6674236059188843 0.640625
1.6508842706680298 0.6328125
1.616401195526123 0.6380208333333334
1.7097362279891968 0.623046875
1.5990649461746216 0.625
1.7255960702896118 0.625
1.680155634880066 0.6272321428571429
1.5241961479187012 0.634765625
1.5540003776550293 0.6362847222222222
1.7205877304077148 0.6296875
1.5005379915237427 0.6342329545454546
1.5266265869140625 0.634765625
1.6411280632019043 0.6298076923076923
1.5499167442321777 0.6294642857142857
1.5203136205673218 0.6296875
1.7049176692962646 0.6259765625
1.7164580821990967 0.6259191176470589
1.5920592546463013 0.6267361111111112
1.6074334383010864 0.6254111842105263
1.5453574657440186 0.62734375
1.5795789957046509 0.6272321428571429
1.6345516443252563 0.6274857954545454
1.5452998876571655 0.6311141304347826
1.6627984046936035 0.6295572916666666
1.58347749710083 0.6296875
1.7906080484390259 0.6277043269230769
1.5473134517669678 0.6304976851851852
1.5600810050964355 0.6303013392857143
1.4904361963272095 0.6306573275862069
1.6125009

In [None]:
plt.figure(figsize=(12, 9))
plt.plot(train_losses, label = 'train')
plt.plot(val_losses, label = 'val')
plt.xlabel('epoches')
plt.ylabel('loss')
plt.legend()
plt.show()

# Prediction

In [None]:
test_data = SportsDataset(DATASET_PATH, encoder, is_train=False, transform=transform)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, pin_memory=True)

In [None]:
def get_prediction(model, test_loader, encoder, filename):
    image_ids, labels = [], []
    for X, y in tqdm(test_loader):
        X = X.to(DEVICE)
        image_ids.extend(y)
        preds = model(X).cpu().argmax(1)
        labels.extend(encoder.inverse_transform(preds))

    predictions = pd.DataFrame({"image_id": image_ids, "label": labels})
    predictions.to_csv(filename, index=False)

    return predictions

In [None]:
data = get_prediction(model, test_loader, encoder, 'submission.csv')