In [1]:
%load_ext autoreload
%autoreload 2

# Import and Setup

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import gc
import json
import pprint
import numpy as np
import pandas as pd
from tqdm import tqdm
from functools import partial
from argparse import Namespace
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *


import wandb
from wandb.keras import WandbCallback

from model import SimpleSupervisedModel, ArcFaceSupervisedModel, get_feature_extractor
from config import get_train_config
from data import GetDataloader
from utils import ShowBatch, id_generator, get_stratified_k_fold
from callbacks import GetCallbacks

# Shared pretty-printer instance (1-space indent) for inspecting nested objects.
pp = pprint.PrettyPrinter(indent=1)

In [3]:
# Load the training configuration.
args = get_train_config()

# Experiment id: it names the W&B group/run and the per-experiment model and
# embedding output folders used by the training cell.
# Set RESUME_EXP_ID to an existing id to keep writing into that experiment
# (e.g. to train its remaining folds); set it to None to start a new experiment
# under a freshly generated 8-character id.
RESUME_EXP_ID = '23005S4V'

random_id = id_generator(size=8)
args.exp_id = RESUME_EXP_ID if RESUME_EXP_ID is not None else random_id

In [4]:
# Switch for the quick sanity-check path: when True, the cells below preview a
# batch, print the model summary and override epochs/num_folds before training.
DEBUG = False

# Prepare Dataset

In [158]:
# Input files: the training table (it carries a `fold` column used for the
# splits) and the JSON label mapping.
TRAIN_CSV = '../cleaned_train_5_fold_individual_ids.csv'
LABEL_MAP_JSON = '../label2ids_individual_ids.json'

df = pd.read_csv(TRAIN_CSV)

with open(LABEL_MAP_JSON) as fh:
    label2ids = json.load(fh)

# Record the label mapping and the number of labels on the config.
args.labels = label2ids
args.num_labels = len(label2ids)

df.head()

Unnamed: 0,image,species,individual_id,img_path,fold,target
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,../128x128/train_images-128-128/train_images-1...,0,0
1,000562241d384d.jpg,humpback_whale,1a71fbb72250,../128x128/train_images-128-128/train_images-1...,2,1
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b,../128x128/train_images-128-128/train_images-1...,4,2
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063,../128x128/train_images-128-128/train_images-1...,4,3
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392,../128x128/train_images-128-128/train_images-1...,1,4


# Dataloader

In [6]:
# Debug-only preview: build the fold-0 split and display one training batch.
if DEBUG:
    # Fold 0 is held out for validation; every other fold is used for training.
    train_df = df.loc[df['fold'] != 0]
    valid_df = df.loc[df['fold'] == 0]

    # Loaders for both splits.
    dataset = GetDataloader(args)
    trainloader = dataset.dataloader(train_df, data_type='train')
    validloader = dataset.dataloader(valid_df, data_type='valid')

    # Pull a single batch. In the ArcFace setup the first element of the batch
    # is a dict holding both the images and the labels.
    first_batch = next(iter(trainloader))
    if args.use_arcface:
        sample_inputs, _unused_targets = first_batch
        sample_imgs = sample_inputs['img_input']
        sample_labels = sample_inputs['label_input']
    else:
        sample_imgs, sample_labels = first_batch

    ShowBatch(args).show_batch(sample_imgs, sample_labels)

# Model

In [7]:
# Debug-only: build the configured architecture once and print its summary.
if DEBUG:
    # Reset Keras' global state before building a new model.
    tf.keras.backend.clear_session()

    # Pick the model builder that matches the configured head.
    builder_cls = ArcFaceSupervisedModel if args.use_arcface else SimpleSupervisedModel
    get_model = builder_cls(args)

    model = get_model.get_efficientnet()
    model.summary()

# Callbacks

In [8]:
# Callback factory built from the config; the training loop asks it for the
# individual Keras callbacks it needs (e.g. the reduce-LR-on-plateau one).
callbacks = GetCallbacks(args)

# Train

In [9]:
# Cross-validation training. For every selected fold: train a model, save it,
# record out-of-fold (OOF) predictions in `df['preds']` and dump the validation
# embeddings. Finally the OOF table is written to ../oof.csv.

# First fold to train in a regular run.
# NOTE(review): presumably 4 because folds 0-3 of this experiment were trained
# in an earlier session - confirm, and set to 0 for a full run.
start_fold = 4

if DEBUG:
    # Short single-fold smoke run. The start fold has to drop to 0 as well,
    # otherwise range(4, 1) is empty and nothing is trained at all.
    args.epochs = 10
    args.num_folds = 1
    start_fold = 0

for fold in range(start_fold, args.num_folds):
    print('Num fold: ', fold)
    # Get dataloaders: the current fold is held out, the rest is training data
    train_df = df[df.fold != fold]
    valid_df = df[df.fold == fold]

    dataset = GetDataloader(args)
    trainloader = dataset.dataloader(train_df)
    validloader = dataset.dataloader(valid_df, data_type='valid')

    # Initialize model (reset Keras' global state left by the previous fold first)
    tf.keras.backend.clear_session()
    if args.use_arcface:
        get_model = ArcFaceSupervisedModel(args)
    else:
        get_model = SimpleSupervisedModel(args)

    model = get_model.get_efficientnet()

    # Compile model
    # NOTE(review): the ArcFace branch uses binary_crossentropy while the
    # metrics below are categorical top-k accuracies - confirm this loss matches
    # the output of the ArcFace head.
    optimizer = 'adam'
    if args.use_arcface:
        loss = 'binary_crossentropy'
    else:
        loss = 'categorical_crossentropy'

    model.compile(optimizer,
                  loss=loss,
                  metrics=['acc',
                           tf.keras.metrics.TopKCategoricalAccuracy(1, name='top@1_acc'),
                           tf.keras.metrics.TopKCategoricalAccuracy(5, name='top@5_acc')])

    # Initialize W&B run
    run = wandb.init(project='happywhale',
                     config=vars(args),
                     group=f'effnetb0-{args.exp_id}',
                     job_type='train',
                     name=f'{args.exp_id}_{fold}_train')

    try:
        # Train
        model.fit(trainloader,
                  epochs=args.epochs,
                  validation_data=validloader,
                  callbacks=[WandbCallback(save_model=False),
                             callbacks.get_reduce_lr_on_plateau()])

        # Save the model
        os.makedirs(f'{args.model_save_path}/{args.exp_id}', exist_ok=True)
        model.save(f'{args.model_save_path}/{args.exp_id}/model_{fold}')

        # Load the model back from disk, so the predictions below come from
        # the saved artifact
        model = tf.keras.models.load_model(f'{args.model_save_path}/{args.exp_id}/model_{fold}')

        # Evaluate and prepare oof
        # NOTE(review): assumes the 'valid' loader yields rows in valid_df
        # order (no shuffling, no dropped remainder) - confirm in GetDataloader.
        preds = model.predict(validloader)
        df.loc[valid_df.index, 'preds'] = np.argmax(preds, axis=1)

        # Get Embedding and save it as npz files along with validation index
        feature_extractor = get_feature_extractor(model)
        embedding = feature_extractor.predict(validloader)

        os.makedirs(f'{args.embedding_save_path}/{args.exp_id}', exist_ok=True)
        np.savez(f'{args.embedding_save_path}/{args.exp_id}/embedding_{fold}.npz',
                 embedding=embedding,
                 index=np.array(valid_df.index))

        # Release this fold's large objects before the next fold is built
        del trainloader, validloader, model, feature_extractor, embedding
        _ = gc.collect()
    except BaseException:
        # Close the W&B run as failed (also on KeyboardInterrupt) so it does
        # not linger as "running", then let the error propagate.
        run.finish(exit_code=1)
        raise
    else:
        # Close W&B run
        run.finish()

# Write the OOF table only if at least one fold produced predictions; without
# the guard an empty fold range fails with a KeyError on the 'preds' column.
# NOTE(review): folds that were not trained in this session have NaN in
# 'preds', and this write replaces any existing ../oof.csv - confirm that is
# acceptable when resuming from a later fold.
if 'preds' in df.columns:
    df[['image', 'individual_id', 'target', 'preds']].to_csv('../oof.csv', index=False)
else:
    print('No fold was trained; skipping ../oof.csv')

Num fold:  4


[34m[1mwandb[0m: Currently logged in as: [33mayut[0m (use `wandb login --relogin` to force relogin)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30




VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
acc,▁▁▂▂▂▃▄▆▆▇▇▇▇█████████████████
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▇▆▅▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
top@1_acc,▁▁▂▂▂▃▄▆▆▇▇▇▇█████████████████
top@5_acc,▁▁▂▃▃▄▅▆▇▇▇███████████████████
val_acc,▁▂▃▄▄▅▅▇▇▇▇███████████████████
val_loss,█▃▁▁▂▄▇▅▅▅▄▄▃▃▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
val_top@1_acc,▁▂▃▄▄▅▅▇▇▇▇███████████████████
val_top@5_acc,▁▂▄▄▅▅▆▇▇▇████████████████████

0,1
acc,0.46648
best_epoch,3.0
best_val_loss,7.00836
epoch,29.0
loss,2.46656
top@1_acc,0.46648
top@5_acc,0.71125
val_acc,0.1213
val_loss,7.23763
val_top@1_acc,0.1213


In [20]:
# NOTE(review): disabled scratch code for scoring the out-of-fold predictions.
# As written it would pass integer class ids (`preds`) as y_pred to
# SparseCategoricalCrossentropy, which expects per-class probabilities/logits,
# so the printed "CV Score" would not be a meaningful number. Plain accuracy
# (target == preds, as on the last line) is presumably what was intended -
# confirm before re-enabling, or delete this cell.

# oof_df = pd.read_csv('../oof.csv')
# oof_df_copy = oof_df.copy()

# def correct_preds(row):
#     return int(row.preds)

# oof_df_copy['preds'] = oof_df_copy.apply(lambda row: correct_preds(row), axis=1)

# metric = tf.keras.metrics.SparseCategoricalCrossentropy()
# metric.update_state(oof_df_copy.target.values.reshape(-1,1), oof_df_copy.preds.values.reshape(-1,1))
# print(f'CV Score: {metric.result().numpy()}')

# oof_df_copy[oof_df_copy.target == oof_df_copy.preds]