# Required Libraries and Data

In [1]:
# CAUTION: run this cell only once. Re-running it after the working directory has changed creates an unnecessary nested clone.
# !git clone https://github.com/bird0401/Instance_level_recognition.git

In [2]:
# %cd /content/Instance_level_recognition/app/ml
# %ls

In [3]:
# pip install -r requirements.txt

In [27]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# # Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

from pathlib import Path
import traceback    
import yaml

In [10]:
from Instance_level_recognition.app.ml.src.util import *
from Instance_level_recognition.app.ml.src.data import *
from Instance_level_recognition.app.ml.src.model import *
from Instance_level_recognition.app.ml.src.train import *

## Import image data from GCS

In [11]:
# from google.colab import auth
# auth.authenticate_user()

In [12]:
# ! echo "deb http://packages.cloud.google.com/apt gcsfuse-`lsb_release -c -s` main" |sudo tee /etc/apt/sources.list.d/gcsfuse.list
# ! curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
# ! apt-get -y -q update
# ! apt-get -y -q install gcsfuse

In [13]:
# ! mkdir -p /content/Instance_level_recognition/app/ml/data
# ! gcsfuse --implicit-dirs --limit-bytes-per-sec -1 --limit-ops-per-sec -1 entity_dogs /content/Instance_level_recognition/app/ml/data

# wandb

In [14]:
%env "WANDB_NOTEBOOK_NAME" "pre_processing"
import wandb
wandb.login()

env: "WANDB_NOTEBOOK_NAME"="pre_processing"


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbird0401[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Configuration and Seed

In [16]:
# Read the run settings from the project's YAML config file.
with open('/content/Instance_level_recognition/app/ml/config.yml', 'r') as config_file:
    CONFIG = yaml.safe_load(config_file)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    CONFIG["device"] = torch.device("cuda:0")
else:
    CONFIG["device"] = torch.device("cpu")
CONFIG

{'seed': 2022,
 'epochs': 6,
 'img_size': 32,
 'model_name': 'efficientnet_b0',
 'embedding_size': 512,
 'train_batch_size': 32,
 'valid_batch_size': 64,
 'learning_rate': 0.0001,
 'scheduler': 'CosineAnnealingLR',
 'min_lr': 1e-06,
 'T_max': 500,
 'T_0': 500,
 'weight_decay': 1e-06,
 'n_fold': 5,
 'n_accumulate': 1,
 's': 30.0,
 'm': 0.5,
 'ls_eps': 0.0,
 'easy_margin': False,
 'device': device(type='cuda', index=0)}

In [17]:
# set_seed is not defined in this notebook; it comes from the project star-imports.
# It is called with the configured seed before the model and data loaders are built.
set_seed(CONFIG['seed'])

# Dataset

In [18]:
# Training table. Later cells read its `label` column (to count classes)
# and its `kfold` column (to split training and validation rows).
df_train = pd.read_csv("/content/Instance_level_recognition/app/ml/data/train.csv")

In [19]:
# Number of distinct labels in the training table; `out_features` is what the
# model cell passes to EntityLinkingModel.
CONFIG["num_classes"] = CONFIG["out_features"] = len(df_train['label'].unique())

# Model

In [20]:
# Instantiate the model from the configured model name and class count,
# then move it to the selected device (the trailing ';' hides the cell's output).
model = EntityLinkingModel(CONFIG['model_name'], CONFIG['out_features'])
model.to(CONFIG['device']);


# Training

In [21]:
# Transforms built for the configured image size; prepare_loaders indexes the
# result with "train" and "valid".
data_transforms = GetTransforms(CONFIG['img_size'])

In [22]:
def criterion(outputs, labels):
    """Return the cross-entropy loss of `outputs` against `labels`.

    A fresh `nn.CrossEntropyLoss` with default settings is created on each call.
    """
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(outputs, labels)

def collate_fn(batch):
    """Collate a batch after discarding every sample that is `None`.

    The remaining samples are handed to PyTorch's default collate function.
    """
    kept = [sample for sample in batch if sample is not None]
    return torch.utils.data.dataloader.default_collate(kept)

def prepare_loaders(df, fold):
    """Build the training and validation DataLoaders for one fold.

    Rows whose `kfold` value differs from `fold` form the training set and rows
    whose `kfold` equals `fold` form the validation set. Batch sizes come from
    the notebook-level `CONFIG`, transforms from the notebook-level
    `data_transforms`.

    Returns:
        A `(train_loader, valid_loader)` tuple.
    """
    in_train = df.kfold != fold
    in_valid = df.kfold == fold
    train_rows = df[in_train].reset_index(drop=True)
    valid_rows = df[in_valid].reset_index(drop=True)

    train_dataset = EntityLinkingDataset(train_rows, transforms=data_transforms["train"])
    valid_dataset = EntityLinkingDataset(valid_rows, transforms=data_transforms["valid"])

    # Options shared by both loaders.
    shared_options = dict(num_workers=2, collate_fn=collate_fn, pin_memory=True)

    # Only the training loader shuffles and drops the final incomplete batch.
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['train_batch_size'],
        shuffle=True,
        drop_last=True,
        **shared_options,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CONFIG['valid_batch_size'],
        shuffle=False,
        **shared_options,
    )

    return train_loader, valid_loader

def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train `model` for `num_epochs` epochs and return it with the best weights loaded.

    Each epoch calls `train_one_epoch` and then `valid_one_epoch`, logs both
    losses to Weights & Biases, and, whenever the validation loss is at or
    below the best value seen so far, snapshots the weights in memory and
    writes them to ``Loss{loss}_epoch{epoch}.bin`` in the current working
    directory.

    Besides its parameters, this function reads several notebook-level names:
    `train_loader`, `valid_loader`, `run` (the W&B run whose summary is
    updated), `CONFIG['n_accumulate']`, `criterion`, and the `b_` / `sr_`
    strings interpolated into the progress messages (not defined in this
    notebook; presumably colour codes from the project imports).

    Args:
        model: Model to train; its `state_dict` is snapshotted and restored.
        optimizer: Optimizer passed to the per-epoch helpers.
        scheduler: Learning-rate scheduler passed to `train_one_epoch`.
        device: Device passed to the per-epoch training and validation helpers.
        num_epochs: Number of epochs to run (epochs are numbered from 1).

    Returns:
        `(model, history)`, where `history` maps ``'Train Loss'`` and
        ``'Valid Loss'`` to lists with one entry per epoch.
    """
    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Honour the `device` argument instead of reaching back into CONFIG.
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch,
                                           n_accumulate=CONFIG['n_accumulate'],
                                           criterion=criterion)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch, criterion=criterion,
                                         optimizer=optimizer)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Log both metrics in one call so they are recorded at the same W&B step;
        # separate calls would each advance the step counter.
        wandb.log({"Train Loss": train_epoch_loss, "Valid Loss": val_epoch_loss})

        # Keep a deep copy of the weights whenever validation loss does not get worse.
        if val_epoch_loss <= best_epoch_loss:
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "Loss{:.4f}_epoch{:.0f}.bin".format(best_epoch_loss, epoch)
            # Checkpoint is written relative to the current working directory.
            torch.save(model.state_dict(), PATH)
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    hours, remainder = divmod(time_elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(hours, minutes, seconds))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [28]:
train_loader, valid_loader = prepare_loaders(df_train, fold=0)

optimizer = optim.Adam(model.parameters(), lr=CONFIG['learning_rate'], 
                       weight_decay=CONFIG['weight_decay'])
scheduler = fetch_scheduler(optimizer, CONFIG['scheduler'], CONFIG['T_max'], CONFIG['T_0'], CONFIG['min_lr'])

run = wandb.init(project='EntityLinking', 
                 config=CONFIG,
                #  job_type='Train',
                #  tags=['arcface', 'gem-pooling', 'effnet-b0-ns', '448'],
                #  anonymous='must'
                 )

# Always close the W&B run, even when training raises or is interrupted,
# so an unfinished run is not left open for the next execution of this cell.
try:
    model, history = run_training(model, optimizer, scheduler,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['epochs'])
finally:
    run.finish()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[INFO] Using GPU: Tesla T4



  0%|          | 0/459 [00:05<?, ?it/s]


KeyboardInterrupt: ignored