In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage
import pandas as pd
from torch import optim
import re

from utils import *

I take a curriculum approach to training here. I first expose the model to as many different images of whales as quickly as possible (no oversampling) and train on images resized to 224x224.

I would like the conv layers to start picking up on features useful for identifying whales. For that, I want to show the model as rich a dataset as possible.

I then train on images resized to 448x448.

Finally, I train on oversampled data. Here, the model will see some images more often than others, but I hope this will help alleviate the class imbalance in the training data.

In [2]:
# Progress reporting setup: set fastprogress's NO_BAR flag and swap the
# master_bar / progress_bar objects that fastai.basic_train uses for the ones
# returned by force_console_behavior(). Presumably this is what makes the
# training logs below appear as plain-text tables — TODO confirm.
import fastai
from fastprogress import force_console_behavior
import fastprogress
fastprogress.fastprogress.NO_BAR = True
master_bar, progress_bar = force_console_behavior()
# Rebind the names inside fastai.basic_train so training loops pick up the console versions.
fastai.basic_train.master_bar, fastai.basic_train.progress_bar = master_bar, progress_bar

In [3]:
# Training labels. Later cells read the `Image` (file name) and `Id` (label) columns.
df = pd.read_csv('../data/train.csv')
# File names that are routed to the validation split. Only a single image is
# listed, so at most one picture is held out and the reported valid_loss is not
# a meaningful generalisation estimate (this is a "full train" run).
val_fns = {'69823499d.jpg'}

In [4]:
# Lookup table: image file name -> whale Id.
# Zipping the two columns yields the same mapping as walking the rows one by
# one (later duplicates still win) without building a Series per row.
fn2label = dict(zip(df.Image, df.Id))


def path2fn(path):
    """Return the trailing ``<word characters>.jpg`` file name contained in ``path``.

    ``path`` must be a string ending in ``.jpg``; otherwise ``re.search``
    returns ``None`` and an ``AttributeError`` is raised, exactly as before.
    """
    # Raw string literal: '\w' and '\.' are regex escapes, not str escapes, and
    # a non-raw literal triggers an invalid-escape warning on current Pythons.
    return re.search(r'\w*\.jpg$', path).group(0)

In [5]:
# Prefix used for every checkpoint (`{name}-stage-N`) and for the submission file.
# An earlier ResNet-50 run used 'res50-full-train'; that assignment was dead code
# here because it was overwritten on the very next line.
name = 'den121-full-train'

In [6]:
# Hyper-parameters for the first (low-resolution) training stage.
SZ, BS = 224, 64      # image side in pixels / batch size
NUM_WORKERS = 12      # forwarded as num_workers when the DataBunch is built
SEED = 0              # NOTE(review): no visible cell actually uses this value to seed anything

In [7]:
# Stage-1 DataBunch: known-whale training images at SZ x SZ plus the test folder.
data = (
    ImageItemList
        # Rows labelled 'new_whale' are dropped, so that label is not a training class.
        .from_df(df[df.Id != 'new_whale'], '../data/train', cols=['Image'])
        # An item goes to the validation split only if its file name is in val_fns.
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        # Label = fn2label entry for the item's file name (KeyError if the name is unknown).
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(ImageItemList.from_folder('../data/test'))
        # Default augmentations with flipping disabled; images resized to SZ using the SQUISH method.
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [8]:
%%time

# Create the learner for the current `data` from models.densenet121, passing lin_ftrs=[2048].
# No metrics are passed, which is why the training logs only show train/valid loss columns.
learn = create_cnn(data, models.densenet121, lin_ftrs=[2048])
# Turn on gradient clipping with clip_grad()'s default argument.
learn.clip_grad();

CPU times: user 2.3 s, sys: 698 ms, total: 3 s
Wall time: 3.01 s


In [9]:
# Phase A — 14 one-cycle epochs at a peak LR of 1e-2 on the learner as created
# (presumably with the pretrained body still frozen, given the unfreeze() below).
learn.fit_one_cycle(14, 1e-2)
learn.save(f'{name}-stage-1')

# Phase B — unfreeze and continue for 24 epochs with three learning rates,
# each a factor of ten apart and ending at max_lr.
learn.unfreeze()
max_lr = 1e-3
lrs = [max_lr / divisor for divisor in (100, 10, 1)]
learn.fit_one_cycle(24, lrs)
learn.save(f'{name}-stage-2')

epoch     train_loss  valid_loss
1         7.552574    4.138718    
2         6.715404    0.950479    
3         6.050995    0.002508    
4         5.139301    1.015872    
5         4.226835    0.023248    
6         3.324532    0.034236    
7         2.560887    0.000233    
8         1.738997    0.000066    
9         1.112600    0.000001    
10        0.671867    0.000003    
11        0.382803    0.000005    
12        0.203173    0.000002    
13        0.144431    0.000000    
14        0.100741    0.000000    
Total time: 20:17
epoch     train_loss  valid_loss
1         0.093720    0.000000    
2         0.098311    0.000000    
3         0.098892    0.000001    
4         0.110860    0.000000    
5         0.123511    0.000000    
6         0.143915    0.000000    
7         0.126430    0.000003    
8         0.154158    0.000000    
9         0.133794    0.000000    
10        0.135681    0.000000    
11        0.118621    0.000000    
12        0.106953    0.000001    
13    

In [10]:
# Hyper-parameters for the high-resolution stage: twice the stage-1 image side
# and a quarter of the stage-1 batch size.
SZ = 2 * 224               # 448
BS = 64 // 4               # 16
NUM_WORKERS, SEED = 12, 0  # unchanged from the first stage

In [11]:
# Rebuild the DataBunch with the current SZ / BS (448 px, batch 16 after the config cell above).
# The pipeline is otherwise the same as the one used for the 224 px stage.
data = (
    ImageItemList
        # 'new_whale' rows are excluded from training.
        .from_df(df[df.Id != 'new_whale'], '../data/train', cols=['Image'])
        # Validation split = files whose name is in val_fns.
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        # Labels are looked up by file name in fn2label.
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(ImageItemList.from_folder('../data/test'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [12]:
%%time
# New learner on the current (higher-resolution) data, warm-started from the
# stage-2 checkpoint. To resume after the first phase below instead, load
# f'{name}-stage-3' here.
learn = create_cnn(data, models.densenet121, lin_ftrs=[2048])
learn.clip_grad()
learn.load(f'{name}-stage-2')

# Stage 3: freeze_to(-1), then 12 one-cycle epochs at a quarter of the stage-1 peak LR.
learn.freeze_to(-1)
learn.fit_one_cycle(12, 1e-2 / 4)
learn.save(f'{name}-stage-3')

# Stage 4: unfreeze and train 22 epochs with three learning rates spaced by
# factors of ten, topping out at max_lr.
learn.unfreeze()
max_lr = 1e-3 / 4
lrs = [max_lr / divisor for divisor in (100, 10, 1)]
learn.fit_one_cycle(22, lrs)
learn.save(f'{name}-stage-4')

epoch     train_loss  valid_loss
1         1.472018    0.000000    
2         0.928063    0.000000    
3         1.286815    0.000244    
4         1.834080    0.000030    
5         1.918766    0.000001    
6         1.849328    0.000719    
7         1.658498    0.000119    
8         1.443799    0.000006    
9         1.134555    0.000010    
10        0.891679    0.000004    
11        0.735060    0.000006    
12        0.574668    0.000005    
Total time: 1:04:08
epoch     train_loss  valid_loss
1         0.571490    0.000008    
2         0.604931    0.000008    
3         0.596035    0.000009    
4         0.663896    0.000004    
5         0.668168    0.000016    
6         0.723208    0.000006    
7         0.722136    0.000003    
8         0.697669    0.000001    
9         0.656108    0.000001    
10        0.702317    0.000001    
11        0.652714    0.000000    
12        0.645954    0.000001    
13        0.600806    0.000000    
14        0.621588    0.000002    
15  

In [13]:
# with oversampling
# NOTE: this rebinds `df`, which previously held ../data/train.csv. Any earlier
# cell that reads `df` will see the oversampled table if it is re-run after this one.
df = pd.read_csv('../data/oversampled_train_and_val.csv')

In [14]:
# DataBunch for the final stage, built from the oversampled table now bound to `df`.
# SZ / BS are whatever the last config cell set.
data = (
    ImageItemList
        # Unlike the earlier data cells there is no 'new_whale' filter here.
        # NOTE(review): confirm the oversampled CSV contains no 'new_whale' rows; the
        # prediction cells append 'new_whale' as an extra class after the learned ones.
        .from_df(df, '../data/train', cols=['Image'])
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        # fn2label was built from the original train.csv, so every file name in the
        # oversampled CSV must also appear there or this lookup raises KeyError.
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(ImageItemList.from_folder('../data/test'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [15]:
%%time
# Final curriculum step on the oversampled data: rebuild the learner and
# restore the weights saved as stage 4.
learn = create_cnn(data, models.densenet121, lin_ftrs=[2048])
learn.clip_grad()
learn.load(f'{name}-stage-4')

# Stage 5: short run (2 epochs) after freeze_to(-1).
learn.freeze_to(-1)
learn.fit_one_cycle(2, 1e-2 / 4)
learn.save(f'{name}-stage-5')

# Stage 6: 3 epochs on the unfrozen model with learning rates at
# max_lr/100, max_lr/10 and max_lr.
learn.unfreeze()
max_lr = 1e-3 / 4
lrs = [max_lr / scale for scale in (100, 10, 1)]
learn.fit_one_cycle(3, lrs)
learn.save(f'{name}-stage-6')

epoch     train_loss  valid_loss
1         1.862462    0.000002    
2         0.703604    0.000084    
Total time: 52:13
epoch     train_loss  valid_loss
1         0.695476    0.000079    
2         0.768560    0.000130    
3         0.617248    0.000181    
Total time: 1:44:41
CPU times: user 1h 59min 51s, sys: 36min 49s, total: 2h 36min 40s
Wall time: 2h 36min 56s


## Predict

In [30]:
# Run the current learner over the test set; the second returned value is discarded.
# Later cells treat `preds` as a 2-D tensor with one column per entry of learn.data.classes.
preds, _ = learn.get_preds(DatasetType.Test)

In [31]:
# Add one extra column (filled with ones for now) to the right of the class
# scores; it stands for the 'new_whale' label. Not idempotent: every execution
# of this cell appends another column.
preds = torch.cat([preds, torch.ones_like(preds[:, :1])], dim=1)

In [32]:
# Give 'new_whale' a constant score of 0.04 for every test image.
# The column index is derived from the class list instead of the hard-coded
# 5004, and the guard fails loudly if the extra column was not appended
# exactly once (e.g. the previous cell was skipped or run twice).
new_whale_idx = len(learn.data.classes)
assert preds.shape[1] == new_whale_idx + 1, (
    f'expected {new_whale_idx + 1} prediction columns, found {preds.shape[1]}'
)
preds[:, new_whale_idx] = 0.04

In [33]:
# Label list matching the columns of `preds`: the learner's classes followed by
# 'new_whale' for the column appended above.
classes = learn.data.classes + ['new_whale']

In [34]:
# create_submission is not defined in this notebook (presumably it comes from
# `utils`). The following cells read subs/{name}.csv.gz, so it is expected to
# write that file — TODO confirm.
create_submission(preds, learn.data, name, classes)

In [35]:
# Sanity check: show the first rows of the submission file for this run.
pd.read_csv(f'subs/{name}.csv.gz').head()

Unnamed: 0,Image,Id
0,41d6736e1.jpg,w_7e56d66 w_e2530bf new_whale w_d2c2be0 w_8b8dca8
1,c68904c64.jpg,new_whale w_492d9c5 w_2f3badb w_e798363 w_72fea5e
2,361293a53.jpg,new_whale w_171ca39 w_3132fce w_5f0fcab w_da443d9
3,0a9b3c0dc.jpg,w_0abdaf4 new_whale w_d7b6f17 w_efa100e w_c928809
4,0f41d9dee.jpg,new_whale w_9714922 w_6d8f72e w_491b2e5 w_550dd10


In [36]:
# Fraction of test images whose first (top-ranked) predicted label is 'new_whale'.
# The vectorised .str accessor replaces the per-row Python lambda; a missing or
# empty Id now counts as "not new_whale" instead of raising.
(
    pd.read_csv(f'subs/{name}.csv.gz')
      .Id.str.split()
      .str[0]
      .eq('new_whale')
      .mean()
)

0.4904522613065327

In [39]:
# Persist the prediction matrix for later reuse (e.g. ensembling).
# NOTE(review): the target name 'preds_dn121' is hard-coded rather than derived
# from `name`, and this cell's execution count (39) is out of order with the
# cells around it.
np.save('../cache/preds_dn121', preds)

In [37]:
# Upload this run's submission file through the Kaggle CLI, using `name` as the message.
!kaggle competitions submit -c humpback-whale-identification -f subs/{name}.csv.gz -m "{name}"

100%|████████████████████████████████████████| 184k/184k [00:03<00:00, 51.8kB/s]
Successfully submitted to Humpback Whale Identification

In [38]:
# List past submissions and their public scores for this competition.
!kaggle competitions submissions -c humpback-whale-identification

fileName                  date                 description        status    publicScore  privateScore  
------------------------  -------------------  -----------------  --------  -----------  ------------  
den121-full-train.csv.gz  2019-02-21 08:08:02  den121-full-train  complete  0.744        None          
res50-full-train.csv.gz   2019-02-20 23:10:32  res50-full-train   complete  0.757        None          
res50-full-train.csv.gz   2019-02-20 23:08:39  res50-full-train   complete  0.712        None          
res50-full-train.csv.gz   2019-02-20 23:07:12  res50-full-train   complete  0.649        None          
res50-full-train.csv.gz   2019-02-20 19:54:45  res50-full-train   complete  0.749        None          
sub7d.csv                 2019-02-20 00:22:56  None               complete  0.866        None          
sub7c.csv                 2019-02-16 21:40:09  None               complete  0.276        None          
sub7b.csv                 2019-02-14 19:57:55  None    