In [1]:
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
import pandas as pd
 
from PIL import Image
from utils import *

In [2]:
import fastai
fastai.__version__

'1.0.39'

## A look at the data

In [3]:
df = pd.read_csv('data/train.csv')
df.head()

FileNotFoundError: File b'data/train.csv' does not exist

In [None]:
df[df.Image=='1d454afa1.jpg']

In [None]:
df[df.Id=='w_a9304b9'].Image.values

In [None]:
def show_pics(df, id):
    """Open the first training image that belongs to whale `id`.

    Args:
        df: DataFrame with an `Id` column (whale identifier) and an `Image`
            column (file name relative to 'data/train/').
        id: Whale identifier to look up in `df.Id`.

    Returns:
        The object returned by `Image.open` for the first matching file, or
        None when no row of `df` has this `id`. Returning the image (instead
        of discarding it) lets a notebook cell that ends with
        `show_pics(...)` actually render the picture.
    """
    image_arr = df[df.Id == id].Image.values
    if len(image_arr) == 0:
        return None
    # Only the first picture is shown, as before.
    im_path = 'data/train/' + image_arr[0]
    print(im_path)
    return Image.open(im_path)
#Image.open('data/train/00fee3975.jpg')
show_pics(df, 'w_a9304b9')

In [None]:
df.Id.value_counts()

In [None]:
(df.Id == 'new_whale').mean()

In [None]:
(df.Id.value_counts() == 1).mean()

41% of all whales have only a single image associated with them.

38% of all images contain a new whale - a whale that has not been identified as one of the known whales.

There is a superb writeup on what a solution to this problem might look like [here](https://www.kaggle.com/martinpiotte/whale-recognition-model-with-score-0-78563/notebook). In general, the conversation in the Kaggle [forum](https://www.kaggle.com/c/humpback-whale-identification/discussion) also seems to have some very informative threads.

Either way, starting with a simple model that can be hacked together in a couple of lines of code is a recommended approach. It is good to have a baseline to build on - going for a complex model from the start is a recipe for dying a thousand deaths by subtle bugs.

In [None]:
# Only the last expression of a cell is rendered, so the row count is printed
# explicitly instead of being silently discarded.
print(len(df))
df.head()

In [None]:
# Separate the images labelled 'new_whale' from those with a known whale Id.
df_new_whale = df[df.Id == 'new_whale']
df1 = df[df.Id != 'new_whale']
# Only the last expression of a cell is rendered, so the count is printed
# explicitly instead of being silently discarded.
print(len(df1))
df1.index

In [None]:
df_new_whale = df[df.Id=='new_whale']
df_new_whale.index

In [None]:
df_known = df[df.Id!='new_whale']
type(df_known.Id.value_counts())

In [None]:
df.iloc[:1].index.values

In [None]:
def split_whale_set(df, train_portion=0.8, seed=1):
    """Split the images of known whales into train and validation file lists.

    Rows whose `Id` is 'new_whale' are ignored. For every remaining whale:

    * with more than one image, each image is assigned to train with
      probability `train_portion`; the draw is repeated until both sides
      receive at least one image of that whale;
    * with exactly one image, that image is added to BOTH lists.
      NOTE(review): this makes the validation list overlap the training
      list for single-image whales — confirm that this is intended.

    Args:
        df: DataFrame with `Id` and `Image` columns.
        train_portion: Per-image probability of landing in the train list.
            Must lie strictly between 0 and 1 whenever some whale has more
            than one image.
        seed: Seed passed to `np.random.seed`; note that this reseeds the
            global NumPy random state.

    Returns:
        A `(train_idxes, val_idxes)` tuple of lists. Despite the names, the
        entries are values of the `Image` column (file names), not indices.

    Raises:
        ValueError: if a whale with several images has to be split while
            `train_portion` is not strictly between 0 and 1. The original
            loop would never terminate in that situation.
    """
    np.random.seed(seed)
    df_known = df[df.Id != 'new_whale']
    train_idxes = []
    val_idxes = []
    for _, group in df_known.groupby('Id'):
        images = group.Image.values
        if len(images) > 1:
            if not 0 < train_portion < 1:
                raise ValueError(
                    'train_portion must be strictly between 0 and 1 to split '
                    'a whale with several images, got %r' % (train_portion,)
                )
            # Redraw until the whale is represented on both sides of the split.
            while True:
                mask = np.random.rand(len(images)) < train_portion
                if 0 < mask.sum() < len(images):
                    break
            train_idxes.extend(images[mask])
            val_idxes.extend(images[~mask])
        else:
            # A single image cannot be split: it goes to both lists.
            train_idxes.extend(images)
            val_idxes.extend(images)
    return train_idxes, val_idxes


In [None]:
train_idxes, val_idxes = split_whale_set(df, 0.8, 1)
len(train_idxes), len(val_idxes)

In [None]:
ItemList(train_idxes, 'data/train')

In [None]:
ImageItemList.from_df(df[df.Id != 'new_whale'], 'data/train')

In [None]:
ImageItemList.from_folder('data/train')

In [None]:
#ImageItemList.from_csv('data/train', 'train.csv')

In [None]:
df.Id.nunique()

In [None]:
df[df.Id=='w_00656c0']

In [None]:
# Map each image file name to its whale Id. zip over the two columns builds
# the same mapping as iterating `df.iterrows()` without creating a Series per row.
fn2label = dict(zip(df.Image, df.Id))

In [None]:
fn2label['001c1ac5f.jpg']

In [None]:
SZ = 224
BS = 64
NUM_WORKERS = 12
SEED=1

In [None]:
# Wrap the file-name lists produced by split_whale_set so they can be passed
# to split_by_list below.
train_item_list = ItemList(train_idxes)
val_item_list = ItemList(val_idxes)
# Assemble the DataBunch: collect the images, split them, label them, attach
# the test set, add the transforms and batch everything.
data = (
    ImageItemList
        #.from_df(df, 'data/train')
        .from_folder('data/train')
        # NOTE(review): train_idxes / val_idxes hold bare file names wrapped in a
        # plain ItemList, while from_folder presumably yields path items under
        # 'data/train' — confirm that split_by_list accepts these lists and that
        # the resulting items are still loaded as images.
        .split_by_list(train_item_list, val_item_list)
        #.random_split_by_pct(seed=SEED)
        # Labels are looked up in fn2label, so `name` must be exactly one of its
        # keys (a bare file name such as '001c1ac5f.jpg').
        .label_from_func(lambda name: fn2label[name])
        .add_test(ImageItemList.from_folder('data/test'))
        # Augmentation settings: flipping disabled, max_zoom=1, max_warp=0,
        # max_rotate=2; images are resized to SZ with the SQUISH method.
        .transform(get_transforms(do_flip=False, max_zoom=1, max_warp=0, max_rotate=2), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='data')
)

In [None]:
data.train.__getitem__(0)

In [None]:
data.show_batch(rows=3)

## Train

In [None]:
name = f'res50-{SZ}'

In [None]:
learn = create_cnn(data, models.resnet50, pretrained=False, metrics=[accuracy, map5])

In [None]:
learn.fit_one_cycle(2)

In [None]:
learn.recorder.plot_losses()

In [None]:
learn.save(f'{name}-stage-1')

In [None]:
learn.unfreeze()

In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
max_lr = 1e-4
lrs = [max_lr/100, max_lr/10, max_lr]

In [None]:
learn.fit_one_cycle(5, lrs)

In [None]:
learn.save(f'{name}-stage-2')

In [None]:
learn.recorder.plot_losses()

This is not a loss plot you would normally expect to see. Why does it look like this? Let's consider what images appear in the validation set:
 * images of whales that do not appear in the train set (whales where all their images were randomly assigned to the validation set) - there is nothing our model can learn about these!
 * images of whales with multiple images in the dataset where some subset of those got assigned to the validation set
 * `new_whale` images
 
Intuitively, a model such as the one above does not seem to frame the problem in a way that would be easy for a neural network to solve. Nonetheless, it is interesting to think about how we could improve the construction of the validation set. What tweaks could be made to the model to improve its performance?

## Predict

In [None]:
preds, _ = learn.get_preds(DatasetType.Test)

In [None]:
mkdir -p subs

In [None]:
create_submission(preds, learn.data, name)

In [None]:
pd.read_csv(f'subs/{name}.csv.gz').head()

In [None]:
!kaggle competitions submit -c humpback-whale-identification -f subs/{name}.csv.gz -m "{name}"