In [1]:
%load_ext autoreload
%autoreload 2

# Import and Setup

In [12]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import gc
import string
import random
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

from tqdm import tqdm
from functools import partial
from argparse import Namespace
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *

import wandb
from wandb.keras import WandbCallback

from model import SimpleSupervisedtModel, ArcFaceSupervisedModel, get_feature_extractor
from config import get_test_config
from data import GetDataloader
from utils import ShowBatch
from callbacks import GetCallbacks

In [13]:
import pprint
pp = pprint.PrettyPrinter(indent=1)

In [27]:
args = get_test_config()
args.exp_id = '46EZHKMK'

DEBUG = True
MODEL_PATH = f'{args.model_save_path}/{args.exp_id}'
TRAIN_EMBED_PATH = f'{args.embedding_save_path}/{args.exp_id}'

pp.pprint(vars(args))

{'batch_size': 256,
 'embedding_save_path': '../embeddings',
 'exp_id': '46EZHKMK',
 'image_height': 128,
 'image_width': 128,
 'labels': {'beluga': 4,
            'blue_whale': 7,
            'bottlenose_dolphin': 3,
            'brydes_whale': 19,
            'commersons_dolphin': 20,
            'common_dolphin': 10,
            'cuviers_beaked_whale': 17,
            'dusky_dolphin': 13,
            'false_killer_whale': 2,
            'fin_whale': 6,
            'frasiers_dolphin': 25,
            'gray_whale': 8,
            'humpback_whale': 1,
            'killer_whale': 11,
            'long_finned_pilot_whale': 14,
            'melon_headed_whale': 0,
            'minke_whale': 5,
            'pantropic_spotted_dolphin': 23,
            'pygmy_killer_whale': 24,
            'rough_toothed_dolphin': 22,
            'sei_whale': 15,
            'short_finned_pilot_whale': 12,
            'southern_right_whale': 9,
            'spinner_dolphin': 16,
            'spotted_dolphin'

# Load Dataframes

In [15]:
train_df = pd.read_csv('../cleaned_5_fold_train.csv')
train_df.head()

Unnamed: 0,image,species,individual_id,img_path,target,fold
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9,../128x128/train_images-128-128/train_images-1...,0,2.0
1,000562241d384d.jpg,humpback_whale,1a71fbb72250,../128x128/train_images-128-128/train_images-1...,1,3.0
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b,../128x128/train_images-128-128/train_images-1...,2,2.0
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063,../128x128/train_images-128-128/train_images-1...,3,2.0
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392,../128x128/train_images-128-128/train_images-1...,1,4.0


In [16]:
def source_path(row):
    return f'{args.test_img_path}/{row.image}'

test_df = pd.read_csv('../sample_submission.csv')
test_df['img_path'] = test_df.apply(lambda row: source_path(row), axis=1)

test_df.head()

Unnamed: 0,image,predictions,img_path
0,000110707af0ba.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../128x128/test_images-128-128/test_images-128...
1,0006287ec424cb.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../128x128/test_images-128-128/test_images-128...
2,000809ecb2ccad.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../128x128/test_images-128-128/test_images-128...
3,00098d1376dab2.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../128x128/test_images-128-128/test_images-128...
4,000b8d89c738bd.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,../128x128/test_images-128-128/test_images-128...


# Get Train Embeddings

In [17]:
train_embeddings = []
train_index = []

for fold in tqdm(range(args.num_folds)):
    data = np.load(f'{TRAIN_EMBED_PATH}/embedding_{fold}.npz', allow_pickle=True)
    train_embeddings.extend(data['embedding'])
    train_index.extend(data['index'])
    
train_embeddings = np.array(train_embeddings)
train_index = np.array(train_index)

print(train_embeddings.shape, train_index.shape)

100% 5/5 [00:00<00:00, 12.84it/s]

(51033, 1280) (51033,)





In [13]:
MODEL_PATH = f'{args.model_save_path}/{args.exp_id}'
train_embeddings = []

for fold in tqdm(range(args.num_folds)):
    valid_df = train_df[train_df.fold == fold]
    dataset = GetDataloader(args)
    validloader = dataset.dataloader(valid_df, data_type='valid')
    
    model = tf.keras.models.load_model(f'{MODEL_PATH}/model_{fold}.h5')
    feature_extractor = get_feature_extractor(model)

    embeddings = feature_extractor.predict(validloader)
    train_embeddings.extend(embeddings)
    
train_embeddings = np.array(train_embeddings)

100% 5/5 [00:34<00:00,  6.95s/it]


# Learn Nearest Neighbors - Unsupervised

In [18]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=50, metric='cosine')
neigh.fit(train_embeddings)

NearestNeighbors(metric='cosine', n_neighbors=50)

# Get Test Embeddings

In [19]:
dataset = GetDataloader(args)
testloader = dataset.dataloader(test_df, data_type='test')

In [21]:
test_embeddings = []

for fold in tqdm(range(args.num_folds)):
    model = tf.keras.models.load_model(f'{MODEL_PATH}/model_{fold}')
    feature_extractor = get_feature_extractor(model)

    embeddings = feature_extractor.predict(testloader)

    test_embeddings.append(embeddings)
    
test_embeddings = np.array(test_embeddings)
test_embeddings = np.mean(test_embeddings, axis=0)
print(test_embeddings.shape)

100% 5/5 [02:20<00:00, 28.17s/it]


(27956, 1280)


# Find the Nearest Ids

In [22]:
sub_df = test_df[['image', 'predictions']]

In [23]:
dist, idxs = neigh.kneighbors(test_embeddings, n_neighbors=50)

In [24]:
for i in tqdm(range(len(sub_df))):
    individual_ids = train_df.loc[train_index[idxs[i]].flatten().tolist()[:5]].individual_id.values
    individual_ids = ' '.join(individual_ids)
    sub_df.loc[i, 'predictions'] = individual_ids

100% 27956/27956 [00:16<00:00, 1728.34it/s]


In [25]:
SUBMISSION_DIR = f'../submissions/{args.exp_id}'
os.makedirs(SUBMISSION_DIR, exist_ok=True)
sub_df.to_csv(f'{SUBMISSION_DIR}/submission.csv', index=False)

# Log LB Score

In [29]:
run = wandb.init(project='happywhale',
                 config=vars(args),
                 group=f'effnetb0',
                 job_type='inference',
                 name=f'{args.exp_id}-infer')

wandb.log({'LB Score': 0.038})
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
LB Score,▁

0,1
LB Score,0.038
