# Imports

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import numpy as np
from sklearn.model_selection import train_test_split
import os
import natsort
from PIL import Image
import copy
import pandas as pd
from sklearn.metrics import f1_score
# Seed the torch and numpy random number generators with a fixed value.
torch.manual_seed(42)
np.random.seed(42)

# Configure the GPU

In [None]:
# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("Using GPU" if device == "cuda:0" else "Using CPU")

In [None]:
# Stop the notebook here unless the CUDA device was selected; the message
# makes the reason for the failure visible in the traceback.
assert device == "cuda:0", "expected device 'cuda:0' but got " + repr(device)

In [None]:
# https://www.youtube.com/watch?v=Z9G1Mf6TZRs
# Faster, at the cost of losing a little reproducibility.
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

# Data

### transforms

In [None]:
def imshow(x):
    """Draw tensor ``x`` with matplotlib after moving its first axis to the end.

    Handy for looking at images once the transforms have been applied.
    """
    return plt.imshow(x.permute(1, 2, 0))

In [None]:
# Per-channel mean and std passed to Normalize (imagenet normalization).
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Deterministic pipeline: used for the validation split and the test images.
eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(imagenet_mean, imagenet_std),
])

# Random augmentation pipeline: used for the training images.
# NOTE(review): integer `interpolation` codes and the `resample` argument look
# deprecated in newer torchvision releases — confirm against the installed version.
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(224, scale=(0.8, 1), ratio=(0.75, 1.3333333333333333), interpolation=2),
    transforms.RandomPerspective(distortion_scale=0.3, p=0.9, interpolation=3, fill=0),
    transforms.RandomAffine(degrees=30, shear=20, resample=False),
    transforms.ToTensor(),
    transforms.Normalize(imagenet_mean, imagenet_std),
])

transforms_dict = {'eval': eval_transform, 'train': train_transform}

### load datasets

##### train data

In [None]:
# Common root of the dataset folders.
data_root = './big_data_folder'
# Training images: every sub-folder is named after the class it contains.
train_path = data_root + '/train/'
# Images whose classes we have to predict.
test_path = data_root + '/test/'

In [None]:
# Two views of the same training folder: `data` applies the deterministic
# 'eval' transforms, `data_augmented` applies the random 'train' augmentations.
data = datasets.ImageFolder(root=train_path, transform=transforms_dict['eval'])
data_augmented = datasets.ImageFolder(root=train_path, transform=transforms_dict['train'])

n_classes = len(data.classes)

# Inverse of class_to_idx: integer label -> class (folder) name.
label_to_name = {v: k for k, v in data.class_to_idx.items()}

# Indices for a stratified 90/10 train/validation split.
train_indices, val_indices = train_test_split(
    np.arange(len(data)),
    test_size=0.1,
    stratify=data.targets,
    random_state=42,
)

# Subset keeps the order of the indices it is given, so the training loader
# is asked to reshuffle every epoch (as final_train_dataloader already does);
# the validation loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(
    torch.utils.data.Subset(data_augmented, train_indices),
    batch_size=100,
    shuffle=True,
)
val_dataloader = torch.utils.data.DataLoader(
    torch.utils.data.Subset(data, val_indices),
    batch_size=100,
)
# Loader over the whole augmented training folder, with no validation hold-out.
final_train_dataloader = torch.utils.data.DataLoader(data_augmented, batch_size=100, shuffle=True)

##### test data

In [None]:
class TestImageFolder(torch.utils.data.Dataset):
    """Dataset over a flat folder of unlabeled images (ImageFolder for a test set).

    Items are served in natural filename order (img2 before img10), and
    ``files`` lists the filenames in that same order.

    Args:
        root: Directory that directly contains the image files.
        transform: Callable applied to each RGB PIL image; its result is what
            ``__getitem__`` returns.
        ext: Suffix a filename must end with to be included.
    """

    def __init__(self, root, transform, ext='.jpg'):
        self.root = root
        self.transform = transform
        # Match on the suffix rather than on a substring, so names such as
        # "photo.jpg.bak" are not picked up by accident.
        matching = [name for name in os.listdir(root) if name.endswith(ext)]
        # Natural sort: goes from img0 to img9001 instead of lexicographical order.
        self.files = natsort.natsorted(matching)

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        path = os.path.join(self.root, self.files[idx])
        # Open through a context manager so the file handle is released as
        # soon as the pixels have been converted to RGB.
        with Image.open(path) as img:
            image = img.convert("RGB")
        return self.transform(image)

In [None]:
# Test images go through the deterministic 'eval' transforms; the loader is
# created without shuffling and serves them in batches of 100.
test_data = TestImageFolder(root=test_path, transform=transforms_dict['eval'], ext='.jpg')
test_dataloader = torch.utils.data.DataLoader(dataset=test_data, batch_size=100)

# Train loop

In [None]:
# Module-level checkpoint state that train() updates: the best validation F1
# seen so far and a deep copy of the model that reached it.
best_f1, best_model = 0, None

In [None]:
def _run_train_epoch(model, dataloader, loss_function, optimizer, device):
    """Run one optimization pass over ``dataloader`` and return the summed loss."""
    model.train()
    running_loss = 0.0
    for batch_idx, (inputs, labels) in enumerate(dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = loss_function(model(inputs), labels)
        loss.backward()
        optimizer.step()

        # Losses are averaged over the batch, so multiply by the number of
        # items in the batch (assumes loss_function returns a batch mean).
        running_loss += loss.item() * inputs.size(0)

        # Progress heartbeat every 30 batches, starting with the first one.
        if batch_idx % 30 == 0:
            print('Training batch:', str(batch_idx) + ',', 'Loss:', loss.item())
    return running_loss


def _run_validation(model, dataloader, loss_function, device):
    """Evaluate ``model`` on ``dataloader``; return (summed loss, macro F1)."""
    model.eval()
    running_loss = 0.0
    predicted, expected = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            running_loss += loss_function(outputs, labels).item() * inputs.size(0)
            predicted.extend(outputs.argmax(dim=1).tolist())
            expected.extend(labels.tolist())
    return running_loss, f1_score(expected, predicted, average='macro')


def train(model, train_dataloader, val_dataloader, num_epoch, loss_function, optimizer, scheduler, device):
    """Train ``model`` for ``num_epoch`` epochs, optionally validating each epoch.

    Args:
        model: Module to optimize; it is updated in place.
        train_dataloader: Loader yielding ``(inputs, labels)`` training batches.
        val_dataloader: Loader yielding ``(inputs, labels)`` validation
            batches, or ``None`` to skip validation and checkpointing.
        num_epoch: Number of passes over ``train_dataloader``.
        loss_function: Callable ``(outputs, labels) -> loss``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per epoch, or ``None``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A dict with per-epoch lists under ``'train_losses'``, ``'val_losses'``
        and ``'val_f1'``; the validation lists stay empty when
        ``val_dataloader`` is ``None``.

    Side effects:
        When validation runs and the macro F1 beats the module-level
        ``best_f1``, that global is updated and ``best_model`` is set to a
        deep copy of ``model``.
    """
    # Checkpoint state lives at module level.
    global best_f1
    global best_model

    start_time = time.time()

    # Loss and metrics per epoch.
    train_losses = []
    val_losses = []
    val_f1 = []

    # Dataset sizes, used to turn summed losses into per-sample means.
    train_size = float(len(train_dataloader.dataset))
    val_size = float(len(val_dataloader.dataset)) if val_dataloader is not None else None

    for epoch in range(num_epoch):
        print('epoch', epoch)

        train_total = _run_train_epoch(model, train_dataloader, loss_function, optimizer, device)
        train_losses.append(train_total / train_size)

        if val_dataloader is not None:
            val_total, epoch_f1 = _run_validation(model, val_dataloader, loss_function, device)
            val_losses.append(val_total / val_size)
            val_f1.append(epoch_f1)

            print('Validation loss:', val_losses[-1])
            print('Validation f1:', val_f1[-1])

            # F1 checkpoint: keep a copy of the best model seen so far.
            if epoch_f1 > best_f1:
                best_f1 = epoch_f1
                best_model = copy.deepcopy(model)

        # Learning-rate step.
        if scheduler is not None:
            scheduler.step()

    elapsed = time.time() - start_time

    print('total time:', elapsed)
    # With zero epochs there is no per-epoch average to report.
    if num_epoch > 0:
        print('average time per epoch:', elapsed / num_epoch)

    return {'train_losses': train_losses,
            'val_losses': val_losses,
            'val_f1': val_f1,
           }

# Model

In [None]:
# Load ResNet-18 with pretrained weights.
model = models.resnet18(pretrained=True)
# Remember the input width of the last fc layer so it can be replaced with our own.
num_ftrs = model.fc.in_features

In [None]:
# Freeze every parameter, then re-enable gradients for layer4 only.
model.requires_grad_(False)
model.layer4.requires_grad_(True)

# Replace the classifier with a fresh layer sized for our classes;
# requires_grad is True by default for a new layer.
model.fc = torch.nn.Linear(num_ftrs, n_classes)

model = model.to(device)

In [None]:
# Classification loss passed to train().
loss_function = torch.nn.CrossEntropyLoss()
# The optimizer is handed every model parameter, including the ones that are
# currently frozen (requires_grad=False).
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Multiply the learning rate by 0.3 after every 5 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.3)

In [None]:
# First stage: 5 epochs on the full augmented training set while most of the
# network is frozen. val_dataloader is None, so no validation runs and
# best_f1 / best_model are left untouched.
result_dict_frozen = train(model, final_train_dataloader, None, 5, loss_function, optimizer, scheduler, device)

In [None]:
# Unfreeze all: every parameter of the model receives gradients again.
model.requires_grad_(True)

In [None]:
# Second stage: 50 more epochs with every parameter trainable, again without
# validation.
# NOTE(review): the optimizer and scheduler objects from the first stage are
# reused, so the StepLR decay (x0.3 every 5 steps) keeps accumulating across
# both stages — confirm the learning rate late in this run is still useful.
result_dict_unfrozen = train(model, final_train_dataloader, None, 50, loss_function, optimizer, scheduler, device)

In [None]:
torch.save(model,'mymodel.pt') # keep a copy on disk just in case

# Prediction

In [None]:
# Run the model over the test loader and collect one predicted class index
# per image, in the order the loader serves them.
model = model.to(device)
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = batch.to(device)
        # extend() keeps the list flat as we go, so no quadratic
        # sum(list_of_lists, []) flattening is needed afterwards.
        test_preds.extend(model(batch).argmax(dim=1).tolist())
# decode labels
test_preds = [label_to_name[label] for label in test_preds]
# One row per test file: filename under 'Id', predicted class name under 'Expected'.
submission_df = pd.DataFrame({'Id': test_data.files, 'Expected': test_preds})

In [None]:
# Preview the first three rows of the submission table.
submission_df.head(3)

In [None]:
# Write the submission to CSV without the index column.
submission_df.to_csv('mysubmission.csv',index=False)