In [None]:
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
import wandb

import torch
from torch import nn
from torchvision import models
import torch.optim as optim

from dataset import get_dataloaders, get_datasets
from utils import seed_everything
from trainer import Trainer

# Params
# Raise Pillow's decompression-bomb pixel limit so that very large images can be opened.
Image.MAX_IMAGE_PIXELS = 1e11
CFG = {
    'seed': 42,                # used for seed_everything and the CV split
    'base_model': 'resnet18',  # name passed to torchvision's models.get_model
    'img_size': 512,
    'batch_size': 32,
    'freeze_epochs': 1,        # epochs trained before trainer.unfreeze()
    'epochs': 10,              # epochs trained after trainer.unfreeze()
    'base_lr': 1e-3,           # SGD learning rate
    'affine_degrees': 0,
    'affine_translate': None,
    'affine_scale': None,
    'cv_fold': 5,              # number of StratifiedKFold splits
}

# Wandb
# The API key is deliberately NOT stored in the notebook. wandb.login() picks it up
# from the WANDB_API_KEY environment variable (or previously cached credentials)
# and otherwise prompts for it interactively.
# NOTE(review): any key that was ever committed in this notebook's history should be
# treated as leaked and rotated in the wandb account settings.
wandb.login()
run = wandb.init(project='kaggle-ubc-ocean', config=CFG, tags=['torch', 'baseline'])

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Paths
# The project root can be overridden with the UBC_OCEAN_ROOT environment variable so the
# notebook runs on other machines; the original location remains the default.
root = os.environ.get('UBC_OCEAN_ROOT', '/media/latlab/MR/projects/kaggle-ubc-ocean')
data_dir = os.path.join(root, 'data')
results_dir = os.path.join(root, 'results')
train_csv = 'train.csv'
train_image_dir = os.path.join(data_dir, 'train_images')
train_thumbnail_dir = os.path.join(data_dir, 'train_thumbnails')

# Seed
seed_everything(CFG['seed'])

# Load data
df = pd.read_csv(os.path.join(data_dir, train_csv))

# # Train/validation split
# df_train, df_validation = train_test_split(df, test_size=0.2, stratify=df['label'], shuffle=True, random_state=CFG['seed'])

# Label encoder/decoder
# Class indices follow the order in which labels first appear in train.csv,
# so the mapping is only stable as long as the CSV row order is unchanged.
encode = {v: k for k, v in enumerate(df.label.unique())}
decode = {v: k for k, v in encode.items()}

In [None]:
def train_model(CFG, train_image_dir, train_thumbnail_dir, df_train, df_validation, encode):
    """Fine-tune a pretrained torchvision classifier on one train/validation split.

    The architecture named by ``CFG['base_model']`` is loaded with its default
    pretrained weights and every existing parameter is frozen. Its ``fc`` layer
    is then replaced by a new linear head with one output per entry in
    ``encode``, so the head is the only part that starts out trainable.
    Training runs for ``CFG['freeze_epochs']`` epochs, ``trainer.unfreeze()``
    is called, and training continues for ``CFG['epochs']`` more epochs.

    Relies on the notebook-level ``device`` global for model placement.

    Args:
        CFG: Configuration dict; this function reads ``base_model``,
            ``base_lr``, ``freeze_epochs`` and ``epochs`` and forwards the
            whole dict to ``get_datasets`` / ``get_dataloaders``.
        train_image_dir: Forwarded to ``get_datasets``.
        train_thumbnail_dir: Forwarded to ``get_datasets``.
        df_train: Training rows, forwarded to ``get_datasets``.
        df_validation: Validation rows, forwarded to ``get_datasets``.
        encode: Label -> class-index mapping; its length sets the head size.

    Returns:
        The ``(model, best_balanced_acc)`` pair produced by the second
        ``trainer.train`` call.
    """
    # Data pipeline
    split_datasets = get_datasets(
        CFG, train_image_dir, train_thumbnail_dir, df_train, df_validation, encode
    )
    loaders = get_dataloaders(CFG, split_datasets)

    # Pretrained backbone with all existing weights frozen
    net = models.get_model(CFG['base_model'], weights='DEFAULT').to(device)
    net.requires_grad_(False)

    # Fresh classification head (new parameters require grad by default).
    # Assumes the architecture exposes its head as `.fc`, as ResNet does.
    n_classes = len(encode)
    net.fc = nn.Linear(net.fc.in_features, n_classes).to(device)

    # Loss, optimiser and step-wise LR decay
    criterion = nn.CrossEntropyLoss()
    sgd = optim.SGD(net.parameters(), lr=CFG['base_lr'], momentum=0.9)
    step_scheduler = optim.lr_scheduler.StepLR(sgd, step_size=7, gamma=0.1)

    # Stage 1: train with the backbone frozen; stage 2: train after unfreezing.
    trainer = Trainer(net, loaders, criterion, sgd, step_scheduler, device)
    net, _ = trainer.train(num_epochs=CFG['freeze_epochs'])
    trainer.unfreeze()  # presumably re-enables gradients on the backbone; see trainer module
    net, best_balanced_acc = trainer.train(num_epochs=CFG['epochs'])
    return net, best_balanced_acc

In [None]:
# Stratified K-fold cross-validation: one model is trained, saved and logged per fold.
skf = StratifiedKFold(n_splits=CFG['cv_fold'], random_state=CFG['seed'], shuffle=True)
balanced_acc_list = []
lb = df['label']

# Make sure the checkpoint directory exists before the first torch.save call.
models_dir = os.path.join(results_dir, 'models')
os.makedirs(models_dir, exist_ok=True)

# Only the labels matter for stratification, so a dummy feature array is passed as X.
for cv, (train_index, valid_index) in enumerate(skf.split(np.zeros(len(lb)), lb)):
    # Folds are numbered from 1 in both the checkpoint name and the logged metric key.
    fold = cv + 1
    df_train = df.iloc[train_index]
    df_validation = df.iloc[valid_index]
    run_name = f'{run.name}-cv{fold}'
    model, best_balanced_acc = train_model(CFG, train_image_dir, train_thumbnail_dir, df_train, df_validation, encode)
    balanced_acc_list.append(best_balanced_acc)
    torch.save(model.state_dict(), os.path.join(models_dir, f'ubc-ocean_{run_name}.pt'))
    wandb.log({f'best_balanced_acc_cv{fold}': best_balanced_acc})

# The mean over all folds gets a fold-independent key (it previously picked up the
# last loop index by accident).
wandb.log({'mean_best_balanced_acc': np.mean(balanced_acc_list)})
wandb.finish()

In [None]:
# # Save model
# torch.save(model.state_dict(), os.path.join(results_dir, 'models', f'ubc-ocean_{run.name}.pt'))

In [None]:
# # Plot samples
# fig = plt.figure(figsize=(16, 16))
# for i, sample in enumerate(dataset):
#     ax = plt.subplot(10, 10, i + 1)
#     plt.imshow(sample['image'])
#     ax.axis('off')
#     plt.title(f"#{i} {sample['label']}")

#     if i==99:
#         plt.show()
#         break