In [1]:
# https://www.kaggle.com/competitions/colombian-ai-olympiad-pr-3-five-artists
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
from datasets import load_dataset

train = load_dataset("eleon360/five-artists-dataset")['train']
test = load_dataset("eleon360/five-artists-test-dataset")['train']

subm = pd.read_csv("/kaggle/input/colombian-ai-olympiad-pr-3-five-artists/sample_submission.csv")

README.md:   0%|          | 0.00/496 [00:00<?, ?B/s]

data/train-00000-of-00001-006c2e045df70c(…):   0%|          | 0.00/113M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/496 [00:00<?, ?B/s]

data/train-00000-of-00001-2bd814921b49c8(…):   0%|          | 0.00/22.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [3]:
train = train.train_test_split(test_size=0.1, seed=42)
valid = train['test']
train = train['train']

In [13]:
import torch
from torch import nn
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

BATCH_SIZE = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ArtistDataset(Dataset):
    def __init__(self, ds, transform=None):
        super().__init__()
        self.ds = ds
        self.transform = transform
    def __len__(self):
        return len(self.ds)
    def __getitem__(self, idx):
        img, label = self.ds[idx]['image'], None
        if 'artist_id' in self.ds.column_names:
            label = self.ds[idx]['artist_id']
        if self.transform is not None:
            img = self.transform(img)
        if label is not None:
            return img, label
        else:
            return img

transform = transforms.Compose([
    transforms.ToTensor()
])

train_ds = ArtistDataset(train, transform=transform)
valid_ds = ArtistDataset(valid, transform=transform)
test_ds = ArtistDataset(test, transform=transform)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

device

device(type='cuda')

In [10]:
class ArtistNetwork(nn.Module):
    def __init__(self, in_channels=3):
        super().__init__()
        self.cnn1 = self.conv_block(in_channels, 16)
        self.cnn2 = self.conv_block(16, 64)
        self.fc1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 64*64, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 5)
        )
    def conv_block(self, in_channels, out_channels):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),

            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),

            nn.Dropout(0.1),
            nn.MaxPool2d(2),
        )
    def forward(self, x):
        x = self.cnn2(self.cnn1(x))
        return self.fc1(x)

def n_correct(y_true, y_pred):
    return torch.eq(y_true, y_pred).sum().item()

model = ArtistNetwork().to(device)

epochs = 6
log_rate = 1

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5, weight_decay=1e-3)

In [11]:
for epoch in tqdm(range(epochs), desc='Training'):
    model.train()
    train_loss = 0
    for X, y in (pbar := tqdm(train_loader, desc='Train DataLoader', leave=False)):
        X, y = X.to(device), y.to(device)
        logits = model(X)
        loss = loss_fn(logits, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        pbar.set_postfix({'loss': f"{loss.item():.5f}"})
        
    train_loss /= len(train_loader)
    
    if (epoch+1)%log_rate==0:
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.5f}")

    model.eval()
    valid_loss, correct, total = 0, 0, 0
    for X, y in (pbar := tqdm(valid_loader, desc='Valid DataLoader', leave=False)):
        X, y = X.to(device), y.to(device)
        with torch.no_grad():
            logits = model(X)
            loss = loss_fn(logits, y)

        probs = torch.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)

        total += len(preds)
        correct += n_correct(y, preds)
        
        valid_loss += loss.item()
        pbar.set_postfix({'loss': f"{loss.item():.5f}"})
        
    valid_loss /= len(valid_loader)
    acc = correct / total
    
    if (epoch+1)%log_rate==0:
        print(f"Valid Loss: {valid_loss:.5f} | Valid Acc: {acc:.5f}")

Training:   0%|          | 0/6 [00:00<?, ?it/s]

Train DataLoader:   0%|          | 0/141 [00:00<?, ?it/s]

Epoch 1/6 | Train Loss: 1.25753


Valid DataLoader:   0%|          | 0/16 [00:00<?, ?it/s]

Valid Loss: 0.64058 | Valid Acc: 0.78800


Train DataLoader:   0%|          | 0/141 [00:00<?, ?it/s]

Epoch 2/6 | Train Loss: 0.45113


Valid DataLoader:   0%|          | 0/16 [00:00<?, ?it/s]

Valid Loss: 0.46220 | Valid Acc: 0.83200


Train DataLoader:   0%|          | 0/141 [00:00<?, ?it/s]

Epoch 3/6 | Train Loss: 0.26246


Valid DataLoader:   0%|          | 0/16 [00:00<?, ?it/s]

Valid Loss: 0.42123 | Valid Acc: 0.84900


Train DataLoader:   0%|          | 0/141 [00:00<?, ?it/s]

Epoch 4/6 | Train Loss: 0.14471


Valid DataLoader:   0%|          | 0/16 [00:00<?, ?it/s]

Valid Loss: 0.35736 | Valid Acc: 0.85300


Train DataLoader:   0%|          | 0/141 [00:00<?, ?it/s]

Epoch 5/6 | Train Loss: 0.10586


Valid DataLoader:   0%|          | 0/16 [00:00<?, ?it/s]

Valid Loss: 0.28778 | Valid Acc: 0.89600


Train DataLoader:   0%|          | 0/141 [00:00<?, ?it/s]

Epoch 6/6 | Train Loss: 0.05880


Valid DataLoader:   0%|          | 0/16 [00:00<?, ?it/s]

Valid Loss: 0.27393 | Valid Acc: 0.89800


In [14]:
all_preds = []

for X in (pbar := tqdm(test_loader, desc='Test DataLoader', leave=True)):
        X = X.to(device)
        with torch.no_grad():
            logits = model(X)

        probs = torch.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)

        all_preds.extend(preds.squeeze().tolist())

subm['artist_id'] = all_preds

subm.to_csv("submission.csv", index=False)

subm

Test DataLoader:   0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,image_id,artist_id
0,10000,4
1,10001,3
2,10002,4
3,10003,4
4,10004,1
...,...,...
1995,11995,3
1996,11996,4
1997,11997,3
1998,11998,3


In [16]:
subm['artist_id'].value_counts()

artist_id
3    494
4    467
2    421
1    387
0    231
Name: count, dtype: int64