In [1]:
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
import pandas as pd
 
from PIL import Image
from utils import *

In [2]:
import fastai
fastai.__version__

'1.0.39'

## A look at the data

In [3]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [4]:
df[df.Image=='1d454afa1.jpg']

Unnamed: 0,Image,Id
2856,1d454afa1.jpg,w_b7411e8


In [5]:
df[df.Id=='w_a9304b9'].Image.values

array(['00fee3975.jpg', '042a3d0b3.jpg', '0b25273c7.jpg', '180a18d6e.jpg', '192c8a374.jpg', '1964299d3.jpg',
       '22fc9b132.jpg', '2dd2388d6.jpg', '37e677fe3.jpg', '3833dd9ab.jpg', '392032607.jpg', '419d217ac.jpg',
       '501d4b884.jpg', '519d17b05.jpg', '527cd75e6.jpg', '619d16c96.jpg', '709817b91.jpg', '7c3ed439b.jpg',
       '88285e65a.jpg', '9559af634.jpg', '95a9f8b28.jpg', '9a3ea3da2.jpg', 'a610046fd.jpg', 'a6e02dfd4.jpg',
       'b37f0ed5a.jpg', 'b4cb30afd.jpg', 'b69238dcd.jpg', 'b6ad10581.jpg', 'b75fc18b7.jpg', 'cd718a734.jpg',
       'ceefe609f.jpg', 'd8f13c4a3.jpg', 'ddb537ff2.jpg', 'e021d7ca2.jpg', 'e6010e078.jpg', 'eab9d8d27.jpg',
       'efae7b997.jpg'], dtype=object)

In [6]:
def show_pics(df, id):
    image_arr = df[df.Id==id].Image.values
    for im in image_arr:
        im_path = 'data/train/'+im
        print(im_path)
        Image.open(im_path)
        break
#Image.open('data/train/00fee3975.jpg')
show_pics(df, 'w_a9304b9')

data/train/00fee3975.jpg


In [7]:
df.Id.value_counts()

new_whale    9664
w_23a388d      73
w_9b5109b      65
w_9c506f6      62
w_0369a5c      61
w_700ebb4      57
w_3de579a      54
w_564a34b      51
w_fd3e556      50
w_88e4537      49
w_2b069ba      48
w_d405854      47
w_f0fe284      45
w_789c969      45
w_778e474      40
w_5e8e218      40
w_343f088      40
w_a9304b9      37
w_5a2634c      37
w_60ce6fc      37
w_6822dbc      36
w_af367c3      35
w_f765256      34
w_1ca9ab1      34
w_17b0d3a      33
w_d72771c      32
w_08630fd      31
w_6cda039      31
w_8c25681      31
w_04003e9      30
             ... 
w_268addf       1
w_f62bcba       1
w_438dd0b       1
w_9888ce0       1
w_ff6065f       1
w_2e5d992       1
w_25b2b3d       1
w_3bc2641       1
w_6db266a       1
w_bf39602       1
w_214e081       1
w_e3056e9       1
w_90f0cd0       1
w_92eaff4       1
w_871decc       1
w_982f0b3       1
w_d64c2ec       1
w_4e01dd9       1
w_7f4e251       1
w_b7ec7f8       1
w_f7ec202       1
w_948268e       1
w_4d96d06       1
w_e31480f       1
w_175d6fa 

In [8]:
(df.Id == 'new_whale').mean()

0.3810575292772367

In [9]:
(df.Id.value_counts() == 1).mean()

0.4141858141858142

41% of all whales have only a single image associated with them.

38% of all images contain a new whale - a whale that has not been identified as one of the known whales.

There is a superb writeup on what a solution to this problem might look like [here](https://www.kaggle.com/martinpiotte/whale-recognition-model-with-score-0-78563/notebook). In general, the conversation in the Kaggle [forum](https://www.kaggle.com/c/humpback-whale-identification/discussion) also seems to have some very informative threads.

Either way, starting with a simple model that can be hacked together in a couple of lines of code is a recommended approach. It is good to have a baseline to build on - going for a complex model from start is a way for dying a thousand deaths by subtle bugs.

In [10]:
len(df)
df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [11]:
df_new_whale = df[df.Id=='new_whale']
df1 = df[df.Id != 'new_whale']
len(df1)
df1.index

Int64Index([    0,     1,     2,     6,     8,     9,    10,    16,    17,
               21,
            ...
            25343, 25346, 25347, 25352, 25354, 25355, 25356, 25357, 25358,
            25359],
           dtype='int64', length=15697)

In [12]:
df_new_whale = df[df.Id=='new_whale']
df_new_whale.index

Int64Index([    3,     4,     5,     7,    11,    12,    13,    14,    15,
               18,
            ...
            25341, 25342, 25344, 25345, 25348, 25349, 25350, 25351, 25353,
            25360],
           dtype='int64', length=9664)

In [13]:
df_known = df[df.Id!='new_whale']
type(df_known.Id.value_counts())

pandas.core.series.Series

In [14]:
df.iloc[:1].index.values

array([0])

In [88]:
def split_whale_set(df, train_portion=0.8, seed=1):
    np.random.seed(seed)
    df_known = df[df.Id!='new_whale']
    #list(df_known.groupby('Id'))
    train_idxes = []
    val_idxes = []
    for name, group in df_known.groupby('Id'):
        #print(name, len(group), group.index, type(group))
        if len(group) > 1:
            while True:
                mask = np.random.rand(len(group)) < train_portion
                if 0 < mask.sum() < len(group):
                    break
            #print(mask, group[mask].index.values)
            train_idxes.extend(group[mask].Image.values)
            val_idxes.extend(group[~mask].Image.values)
        else:
            #print(group.index.values)
            train_idxes.extend(group.Image.values)
            val_idxes.extend(group.Image.values)
    return train_idxes, val_idxes


In [89]:
train_idxes, val_idxes = split_whale_set(df, 0.8, 1)
len(train_idxes), len(val_idxes)

(11500, 6270)

In [74]:
ItemList(train_idxes, 'data/train')

ItemList (12992 items)
['833675975.jpg', '2fe2cc5c0.jpg', '2f31725c6.jpg', '30eac8c9f.jpg', '3c4235ad2.jpg']...
Path: data/train

In [14]:
ImageItemList.from_df(df[df.Id != 'new_whale'], 'data/train')

ImageItemList (15697 items)
[Image (3, 700, 1050), Image (3, 325, 758), Image (3, 497, 1050), Image (3, 458, 1050), Image (3, 450, 1050)]...
Path: data/train

In [15]:
ImageItemList.from_folder('data/train')

ImageItemList (25361 items)
[Image (3, 385, 1050), Image (3, 450, 1050), Image (3, 443, 1050), Image (3, 700, 1050), Image (3, 525, 1050)]...
Path: data/train

In [16]:
#ImageItemList.from_csv('data/train', 'train.csv')

In [17]:
df.Id.nunique()

5005

In [85]:
df[df.Id=='w_00656c0']

Unnamed: 0,Image,Id
1643,108f230d8.jpg,w_00656c0
14411,910e6297a.jpg,w_00656c0


In [19]:
fn2label = {row[1].Image: row[1].Id for row in df.iterrows()}

In [20]:
fn2label['001c1ac5f.jpg']

'w_a6f9d33'

In [21]:
SZ = 224
BS = 64
NUM_WORKERS = 12
SEED=1

In [100]:
train_item_list = ItemList(train_idxes)
val_item_list = ItemList(val_idxes)
data = (
    ImageItemList
        #.from_df(df, 'data/train')
        .from_folder('data/train')
        .split_by_list(train_item_list, val_item_list)
        #.random_split_by_pct(seed=SEED)
        .label_from_func(lambda name: fn2label[name])
        .add_test(ImageItemList.from_folder('data/test'))
        .transform(get_transforms(do_flip=False, max_zoom=1, max_warp=0, max_rotate=2), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='data')
)



  warn(f"There seems to be something wrong with your dataset, can't access self.train_ds[i] for all i in {idx}")


In [61]:
data.train.__getitem__(0)

AttributeError: 'Image' object has no attribute '__getitem__'

In [94]:
data.show_batch(rows=3)

AttributeError: Traceback (most recent call last):
  File "/home/wb/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 138, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/wb/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 138, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/wb/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/fastai/data_block.py", line 547, in __getitem__
    x = x.apply_tfms(self.tfms, **self.tfmargs)
AttributeError: 'str' object has no attribute 'apply_tfms'


## Train

In [95]:
name = f'res50-{SZ}'

In [98]:
learn = create_cnn(data, models.resnet50, pretrained=False, metrics=[accuracy, map5])

In [99]:
learn.fit_one_cycle(2)

epoch,train_loss,valid_loss,accuracy,map5


AttributeError: Traceback (most recent call last):
  File "/home/wb/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 138, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/wb/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 138, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/wb/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/fastai/data_block.py", line 547, in __getitem__
    x = x.apply_tfms(self.tfms, **self.tfmargs)
AttributeError: 'str' object has no attribute 'apply_tfms'


In [None]:
learn.recorder.plot_losses()

In [None]:
learn.save(f'{name}-stage-1')

In [None]:
learn.unfreeze()

In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
max_lr = 1e-4
lrs = [max_lr/100, max_lr/10, max_lr]

In [None]:
learn.fit_one_cycle(5, lrs)

In [None]:
learn.save(f'{name}-stage-2')

In [None]:
learn.recorder.plot_losses()

This is not a loss plot you would normally expect to see. Why does it look like this? Let's consider what images appear in the validation set:
 * images of whales that do not appear in the train set (whales where all their images were randomly assigned to the validation set) - there is nothing our model can learn about these!
 * images of whales with multiple images in the dataset where some subset of those got assigned to the validation set
 * `new_whale` images
 
Intuitively, a model such as the above does not seem to frame the problem in a way that would be easy for a neural network to solve. Nonetheless, it is interesting to think how we could improve on the construction of the validation set? What tweaks could be made to the model to improve its performance?

## Predict

In [None]:
preds, _ = learn.get_preds(DatasetType.Test)

In [None]:
mkdir -p subs

In [None]:
create_submission(preds, learn.data, name)

In [None]:
pd.read_csv(f'subs/{name}.csv.gz').head()

In [None]:
!kaggle competitions submit -c humpback-whale-identification -f subs/{name}.csv.gz -m "{name}"