# Required Libraries

In [4]:
pip install albumentations

Collecting albumentations
  Downloading albumentations-1.3.0-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-image>=0.16.1
  Downloading scikit_image-0.19.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting qudida>=0.0.4
  Downloading qudida-0.0.4-py3-none-any.whl (3.5 kB)
Collecting opencv-python-headless>=4.1.1
  Downloading opencv_python_headless-4.6.0.66-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (48.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.3/48.3 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tifffile>=2019.7.26
  Downloading tifffile-2021.11.2-py3-none-any.whl (178 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [128]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

# For Image Models
import timm

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


import torch
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
import torchvision


from google.cloud import bigquery
import pandas


from pathlib import Path

# Configuration and Seed

In [131]:
# Central hyperparameter / runtime configuration used throughout the notebook.
CONFIG = dict(
    seed=2022,
    epochs=4,
    img_size=448,                        # images are resized to img_size x img_size
    model_name="tf_efficientnet_b0_ns",  # passed to timm.create_model
    num_classes=15587,
    embedding_size=512,
    train_batch_size=32,
    valid_batch_size=64,
    learning_rate=1e-4,
    scheduler='CosineAnnealingLR',
    min_lr=1e-6,
    T_max=500,
    weight_decay=1e-6,
    n_fold=5,
    n_accumulate=1,                      # batches per optimizer step
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # ArcFace Hyperparameters
    s=30.0,
    m=0.50,
    ls_eps=0.0,
    easy_margin=False,
)

# Filesystem locations
ROOT_DIR = './data/imgs'
TRAIN_DIR = './imgs/data/train'
TEST_DIR = './imgs/data/test'
DATA_DIR = Path("data")


def set_seed(seed=42):
    '''Seed every random number generator the notebook uses, for REPRODUCIBILITY.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    '''
    import random  # local import keeps this helper self-contained

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Covers every visible GPU, not only the current one
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed; this is read at interpreter start-up,
    # so it only takes effect in processes launched after this point
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Apply the configured seed before any stochastic step runs
set_seed(CONFIG['seed'])

In [132]:
# BigQuery client and location shared by the query helper in this cell.
# NOTE(review): project id and location are hard-coded; consider moving them to configuration.

location = "asia-northeast1"
client = bigquery.Client(location=location, project="root-beanbag-354111")
print("Client creating using default project: {}".format(client.project))

def query2df(query):
    """Run a SQL string on BigQuery and return the result as a DataFrame.

    Uses the module-level `client` and `location`.

    Args:
        query: SQL text to execute.

    Returns:
        The result of the query job's `to_dataframe()`.
    """
    # Location must match that of the dataset(s) referenced in the query.
    job = client.query(query, location=location)  # API request - starts the query
    return job.to_dataframe()

Client creating using default project: root-beanbag-354111


# Import DataFrame

In [133]:
# Fetch the table used in this project.
# The query joins image paths to their Wikidata id and name, and keeps only
# ids that have at least 3 images (HAVING COUNT(...) >= 3, despite the CTE
# name `has_morethan3imgs`).

query = """
    WITH 
        full_table AS (
            SELECT ip.img_id, ip.path, iw.wikidata_id, n.name
            FROM `root-beanbag-354111.scraping_dog_breeds_by_name.img_path` AS ip
            INNER JOIN `root-beanbag-354111.scraping_dog_breeds_by_name.img_wikidata_id` AS iw
            USING(img_id)
            INNER JOIN `root-beanbag-354111.scraping_dog_breeds_by_name.names` AS n
            USING(wikidata_id)),
        has_morethan3imgs AS (
            SELECT wikidata_id, COUNT(wikidata_id)
            FROM full_table
            GROUP BY wikidata_id
            HAVING COUNT(wikidata_id) >= 3)
    SELECT img_id, path, wikidata_id, name
    FROM full_table
    INNER JOIN has_morethan3imgs USING(wikidata_id)
"""
df = query2df(query)
df.head()

Unnamed: 0,img_id,path,wikidata_id,name
0,1,./imgs/Q7254/image_0000.jpg,Q7254,Affenpinscher
1,2,./imgs/Q7254/image_0001.jpg,Q7254,Affenpinscher
2,3,./imgs/Q7254/image_0002.jpg,Q7254,Affenpinscher
3,4,./imgs/Q7254/image_0003.jpg,Q7254,Affenpinscher
4,5,./imgs/Q7254/image_0004.jpg,Q7254,Affenpinscher


In [159]:
def get_train_file_path(path):
    """Return `path` joined onto DATA_DIR, as a string."""
    return str(DATA_DIR / Path(path))

In [160]:
# Re-root every stored image path under this project's DATA_DIR
df['file_path'] = df['path'].map(get_train_file_path)
df.head()

Unnamed: 0,img_id,path,wikidata_id,name,file_path,label,is_train_val
0,1,./imgs/Q7254/image_0000.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0000.jpg,475,1.0
1,2,./imgs/Q7254/image_0001.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0001.jpg,475,1.0
2,3,./imgs/Q7254/image_0002.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0002.jpg,475,0.0
3,4,./imgs/Q7254/image_0003.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0003.jpg,475,1.0
4,5,./imgs/Q7254/image_0004.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0004.jpg,475,1.0


In [136]:
# Encode each wikidata_id as an integer class label
encoder = LabelEncoder().fit(df['wikidata_id'])
df['label'] = encoder.transform(df['wikidata_id'])
df.head()

Unnamed: 0,img_id,path,wikidata_id,name,file_path,label
0,1,./imgs/Q7254/image_0000.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0000.jpg,475
1,2,./imgs/Q7254/image_0001.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0001.jpg,475
2,3,./imgs/Q7254/image_0002.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0002.jpg,475
3,4,./imgs/Q7254/image_0003.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0003.jpg,475
4,5,./imgs/Q7254/image_0004.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0004.jpg,475


In [161]:
# Persist the fitted label encoder next to the notebook
with open("le.pkl", "wb") as encoder_file:
    joblib.dump(encoder, encoder_file)
len(df["label"])

22984

In [162]:
# Hold out 20% of the rows as a test set, stratified by label.
# An explicit random_state makes the split reproducible no matter which cells
# ran before this one (otherwise it depends on the global NumPy RNG state).
# NOTE(review): the positions are used as .loc labels, which assumes df has
# the default 0..n-1 index — confirm.
train_val_indices, test_indices = train_test_split(
    list(range(len(df.label))),
    test_size=0.2,
    stratify=df.label,
    random_state=CONFIG['seed'],
)
df.loc[train_val_indices, "is_train_val"] = 1
df.loc[test_indices, "is_train_val"] = 0
df_train, df_test = df[df["is_train_val"] == 1].reset_index(), df[df["is_train_val"] == 0].reset_index()
len(df_train), len(df_test)

(18387, 4597)

In [163]:
# Tag every training row with the fold in which it serves as validation data
skf = StratifiedKFold(n_splits=CONFIG['n_fold'])
fold_splits = skf.split(X=df_train, y=df_train.label)
for fold, (_, valid_idx) in enumerate(fold_splits):
    df_train.loc[valid_idx, "kfold"] = fold
df_train.head(10)

Unnamed: 0,index,img_id,path,wikidata_id,name,file_path,label,is_train_val,kfold
0,0,1,./imgs/Q7254/image_0000.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0000.jpg,475,1.0,0.0
1,1,2,./imgs/Q7254/image_0001.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0001.jpg,475,1.0,0.0
2,2,3,./imgs/Q7254/image_0002.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0002.jpg,475,1.0,0.0
3,3,4,./imgs/Q7254/image_0003.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0003.jpg,475,1.0,1.0
4,4,5,./imgs/Q7254/image_0004.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0004.jpg,475,1.0,1.0
5,6,7,./imgs/Q7254/image_0006.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0006.jpg,475,1.0,2.0
6,7,8,./imgs/Q7254/image_0007.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0007.jpg,475,1.0,2.0
7,9,10,./imgs/Q7254/image_0009.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0009.jpg,475,1.0,3.0
8,10,11,./imgs/Q7254/image_0010.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0010.jpg,475,1.0,3.0
9,11,12,./imgs/Q7254/image_0011.jpg,Q7254,Affenpinscher,data/imgs/Q7254/image_0011.jpg,475,1.0,4.0


# Dataset

In [164]:
# Size the classifier from the encoder rather than from the training subset:
# label ids were assigned by `encoder` over the full frame, so the output
# layer must cover every encoded class even if one is absent from df_train.
CONFIG["out_features"] = len(encoder.classes_)

In [165]:
class HappyWhaleDataset(Dataset):
    """Image classification dataset backed by a DataFrame.

    The frame must provide a 'file_path' column (image location on disk) and a
    'label' column (integer class id).

    Args:
        df: DataFrame with 'file_path' and 'label' columns.
        transforms: optional callable invoked as `transforms(image=img)` that
            returns a mapping with an "image" entry (albumentations style).
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.file_names = df['file_path'].values
        self.labels = df['label'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            dict with 'image' (RGB image, transformed when transforms are set)
            and 'label' (torch.long tensor).

        Raises:
            FileNotFoundError: if the image cannot be read from disk.
        """
        img_path = self.file_names[index]
        img = cv2.imread(img_path)
        # cv2.imread signals a missing/corrupt file by returning None instead
        # of raising; fail here with the offending path rather than with an
        # opaque assertion inside cvtColor.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = self.labels[index]

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long)
        }
    
# Augmentations applied to training images only.
# NOTE(review): the HueSaturationValue limits of 0.2 look like fractions;
# albumentations appears to express these limits in integer units, so this
# step may be close to a no-op — confirm against the library documentation.
_train_augmentations = [
    A.ShiftScaleRotate(shift_limit=0.1,
                       scale_limit=0.15,
                       rotate_limit=60,
                       p=0.5),
    A.HueSaturationValue(hue_shift_limit=0.2,
                         sat_shift_limit=0.2,
                         val_shift_limit=0.2,
                         p=0.5),
    A.RandomBrightnessContrast(brightness_limit=(-0.1, 0.1),
                               contrast_limit=(-0.1, 0.1),
                               p=0.5),
]


def _build_pipeline(augmentations=()):
    """Compose: resize -> optional augmentations -> normalize -> tensor."""
    steps = [A.Resize(CONFIG['img_size'], CONFIG['img_size'])]
    steps.extend(augmentations)
    steps.append(A.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225],
                             max_pixel_value=255.0,
                             p=1.0))
    steps.append(ToTensorV2())
    return A.Compose(steps, p=1.)


# "train" adds random augmentation; "valid" only resizes and normalizes.
data_transforms = {
    "train": _build_pipeline(_train_augmentations),
    "valid": _build_pipeline(),
}

# Model

In [166]:
class HappyWhaleModel(nn.Module):
    """timm backbone with its classifier replaced by a plain linear head.

    Args:
        model_name: identifier passed to `timm.create_model`.
        out_features: number of output classes (width of the new head).
        pretrained: passed through to `timm.create_model`.
    """

    def __init__(self, model_name, out_features, pretrained=True):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        # Swap the backbone's classifier for one sized to this task
        in_features = self.model.classifier.in_features
        self.model.classifier = nn.Linear(in_features, out_features)

    def forward(self, images, labels=None):
        """Return class logits for a batch of images.

        Args:
            images: batch of image tensors fed to the backbone.
            labels: unused; accepted (and now optional) so existing callers
                that pass labels keep working and inference code can omit it.
        """
        return self.model(images)

    
# Build the network and place it on the configured device
model = HappyWhaleModel(
    CONFIG['model_name'],
    CONFIG['out_features'],
).to(CONFIG['device'])

# Training

In [167]:
def criterion(outputs, labels):
    """Mean cross-entropy between the logits `outputs` and integer `labels`."""
    return F.cross_entropy(outputs, labels)

In [168]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train `model` for one pass over `dataloader`.

    Gradients are accumulated over CONFIG['n_accumulate'] batches before each
    optimizer step; the scheduler (if any) is stepped once per optimizer step.

    Args:
        model: network called as `model(images, labels)`.
        optimizer: optimizer updating the model parameters.
        scheduler: LR scheduler or None.
        dataloader: yields dicts with 'image' and 'label' tensors.
        device: device the batch tensors are moved to.
        epoch: epoch number, shown in the progress bar only.

    Returns:
        Sample-weighted mean training loss for the epoch (NaN if the
        dataloader yields no batches).
    """
    model.train()
    n_accumulate = CONFIG['n_accumulate']

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = float("nan")  # stays NaN only when there are no batches

    # Start from clean gradients so a partial accumulation left over from an
    # earlier call cannot leak into this epoch's first update
    optimizer.zero_grad()

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)

        batch_size = images.size(0)

        outputs = model(images, labels)
        loss = criterion(outputs, labels)

        # Scale only the gradient contribution so accumulated gradients average
        # over n_accumulate batches; the reported loss stays unscaled
        (loss / n_accumulate).backward()

        if (step + 1) % n_accumulate == 0:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()

    return epoch_loss

In [169]:
@torch.inference_mode()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate `model` on `dataloader` without tracking gradients.

    Args:
        model: network called as `model(images, labels)`.
        dataloader: yields dicts with 'image' and 'label' tensors.
        device: device the batch tensors are moved to.
        epoch: epoch number, shown in the progress bar only.
        optimizer: optional; when given, its current learning rate is shown in
            the progress bar. The function no longer reads a global
            `optimizer`, so it also works when none is defined.

    Returns:
        Sample-weighted mean validation loss (NaN if the dataloader yields no
        batches).
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.0
    epoch_loss = float("nan")  # stays NaN only when there are no batches

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)

        batch_size = images.size(0)

        outputs = model(images, labels)
        loss = criterion(outputs, labels)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = {"Epoch": epoch, "Valid_Loss": epoch_loss}
        if optimizer is not None:
            postfix["LR"] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    gc.collect()

    return epoch_loss

In [170]:
def run_training(model, optimizer, scheduler, device, num_epochs):
    """Train for `num_epochs`, checkpointing whenever validation loss improves.

    Reads the module-level `train_loader` and `valid_loader`, so those must be
    created before this is called.

    Args:
        model: network to train.
        optimizer: optimizer updating the model parameters.
        scheduler: LR scheduler or None.
        device: device used for both training and validation batches.
        num_epochs: number of epochs to run.

    Returns:
        (model, history): the model with the best-validation weights loaded,
        and a dict with 'Train Loss' and 'Valid Loss' lists, one entry per epoch.
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    # Experiment tracking is optional: use a module-level `run` object only if
    # one exists, instead of failing with NameError when it does not.
    tracking_run = globals().get("run")

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss = valid_one_epoch(model, valid_loader, device=device,
                                         epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # deep copy the model
        if val_epoch_loss <= best_epoch_loss:
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            if tracking_run is not None:
                tracking_run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = "Loss{:.4f}_epoch{:.0f}.bin".format(best_epoch_loss, epoch)
            # Save the improved weights into the current directory
            torch.save(model.state_dict(), PATH)
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

In [171]:
def fetch_scheduler(optimizer):
    """Build the LR scheduler selected by CONFIG['scheduler'].

    Supported values:
        'CosineAnnealingLR'           - uses CONFIG['T_max'] and CONFIG['min_lr']
        'CosineAnnealingWarmRestarts' - uses CONFIG['T_0'] and CONFIG['min_lr']
        None                          - no scheduler

    Returns:
        The scheduler instance, or None when scheduling is disabled.

    Raises:
        ValueError: if CONFIG['scheduler'] is not one of the supported values
            (previously this surfaced as an UnboundLocalError).
    """
    name = CONFIG['scheduler']
    if name is None:
        return None
    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])
    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])
    raise ValueError(f"Unsupported scheduler: {name!r}")

In [172]:
def prepare_loaders(df, fold, num_workers=2):
    """Build the train/validation DataLoaders for one cross-validation fold.

    Rows whose `kfold` equals `fold` form the validation set; all other rows
    form the training set.

    Args:
        df: DataFrame with 'kfold', 'file_path' and 'label' columns.
        fold: fold id to hold out for validation.
        num_workers: worker processes per DataLoader (default 2, the previous
            hard-coded value). Pass 0 to load data in the main process, e.g.
            when the environment has too little shared memory for workers.

    Returns:
        (train_loader, valid_loader)
    """
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = HappyWhaleDataset(df_train, transforms=data_transforms["train"])
    valid_dataset = HappyWhaleDataset(df_valid, transforms=data_transforms["valid"])

    # Training batches are shuffled and the last partial batch is dropped;
    # validation keeps every sample in order.
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'],
                              num_workers=num_workers, shuffle=True, pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'],
                              num_workers=num_workers, shuffle=False, pin_memory=True)

    return train_loader, valid_loader

In [174]:
# Train on fold 0: its rows are the validation set, the rest are training data
train_loader, valid_loader = prepare_loaders(df_train, fold=0)

optimizer = optim.Adam(
    model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
)
scheduler = fetch_scheduler(optimizer)

# Optional experiment tracking (disabled):
# run = wandb.init(project='HappyWhale',
#                  config=CONFIG,
#                  job_type='Train',
#                  tags=['arcface', 'gem-pooling', 'effnet-b0-ns', '448'],
#                  anonymous='must')

model, history = run_training(
    model,
    optimizer,
    scheduler,
    device=CONFIG['device'],
    num_epochs=CONFIG['epochs'],
)

# run.finish()

  0% 0/459 [00:00<?, ?it/s]ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
  0% 0/459 [00:13<?, ?it/s]


RuntimeError: DataLoader worker (pid(s) 2796) exited unexpectedly

ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
 