In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage
import pandas as pd
from torch import optim
import re
import os

from utils import *

I take a curriculum approach to training here. I first expose the model to as many different images of whales as quickly as possible (no oversampling) and train on images resized to 224x224.

I would like the conv layers to start picking up on features useful for identifying whales. For that, I want to show the model as rich a dataset as possible.

I then train on images resized to 448x448.

Finally, I train on oversampled data. Here, the model will see some images more often than others, but I am hoping that this will help alleviate the class imbalance in the training data.

In [10]:
import fastai
from fastprogress import force_console_behavior
import fastprogress
# Switch fastprogress to its no-bar mode and obtain the console-style
# replacements for the two progress-bar classes.
fastprogress.fastprogress.NO_BAR = True
master_bar, progress_bar = force_console_behavior()
# Point fastai's training module at the console-style bars, one attribute at a time.
fastai.basic_train.master_bar = master_bar
fastai.basic_train.progress_bar = progress_bar

In [11]:
# Root folder of the competition data. It defaults to the original location but
# can be redirected with the HUMPBACK_DATA_DIR environment variable, so the
# notebook is not tied to one user's home directory.
data_dir = os.environ.get('HUMPBACK_DATA_DIR', '/home/ys1/dataset/Humpback_Whale')
# Training labels; the cells below read the `Image` and `Id` columns.
df = pd.read_csv(os.path.join(data_dir,'train.csv'))
# Filenames routed to the validation split by the databunch cells below.
# Only one image is listed, so validation is a single sample and the reported
# valid_loss says very little about generalisation.
val_fns = {'69823499d.jpg'}

In [12]:
# Lookup table: image filename -> whale Id, taken from the `Image` and `Id`
# columns of `df`. Built column-wise; DataFrame.iterrows() materialises a
# Series for every row and is far slower for the same result.
fn2label = dict(zip(df.Image, df.Id))
def path2fn(path):
    """Return the trailing ``<word characters>.jpg`` part of ``path``.

    The pattern is written as a raw string: in a normal string literal ``\\w``
    and ``\\.`` are invalid escape sequences that newer Python versions warn
    about.

    Args:
        path: path (as a string) expected to end in ``.jpg``.

    Returns:
        The matched filename, e.g. ``'69823499d.jpg'``.

    Raises:
        AttributeError: if ``path`` does not end in ``.jpg`` (``re.search``
            returns ``None``), same as the original lambda.
    """
    return re.search(r'\w*\.jpg$', path).group(0)

In [13]:
# Run identifier; used as the prefix of every saved checkpoint
# (f'{name}-stage-N') and as the submission file name.
# Plain string literal: there are no placeholders, so no f-prefix is needed.
name = 'res50-full-train'

In [32]:
# Settings for the first (low-resolution) part of the curriculum.
# Image size handed to .transform(size=SZ) when the databunch is built.
SZ = 224
# Batch size handed to .databunch(bs=BS).
BS = 64
# Worker count handed to .databunch(num_workers=NUM_WORKERS).
NUM_WORKERS = 12
# NOTE(review): SEED is assigned but not referenced by any cell visible here,
# so the training runs are presumably unseeded — confirm or wire it up.
SEED=0

In [10]:
# Databunch for the first training cell: all labelled whales, no oversampling,
# images resized to SZ.
data = (
    ImageItemList
        # Rows labelled 'new_whale' are dropped, so only identified whales become classes.
        .from_df(df[df.Id != 'new_whale'], os.path.join(data_dir,'train'), cols=['Image'])
        # An item goes to validation when its filename is listed in val_fns.
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        # Label = whale Id looked up by filename in fn2label.
        .label_from_func(lambda path: fn2label[path2fn(path)])
        # Attach the images found under <data_dir>/test as the test set.
        .add_test(ImageItemList.from_folder(os.path.join(data_dir,'test')))
        # get_transforms() with flipping switched off; resize uses ResizeMethod.SQUISH.
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='data')
        # Normalise with the imagenet_stats constants.
        .normalize(imagenet_stats)
)

In [11]:
%%time

# Build a ResNet-50 learner on the current databunch.
# lin_ftrs=[2048] is forwarded to create_cnn (presumably the width of the
# hidden linear layer in the head — TODO confirm).
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048])
# Turn on gradient clipping with the method's default arguments.
learn.clip_grad();

# Stage 1: 14 one-cycle epochs at lr 1e-2, run before unfreeze() is called.
learn.fit_one_cycle(14, 1e-2)
learn.save(f'{name}-stage-1')

# Stage 2: unfreeze, then train with a list of three learning rates.
learn.unfreeze()

max_lr = 1e-3
# Three rates, each 10x the previous (presumably one per layer group,
# smallest for the earliest layers — TODO confirm).
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(24, lrs)
learn.save(f'{name}-stage-2')

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /home/ys1/.torch/models/resnet50-19c8e357.pth
100%|██████████| 102502400/102502400 [00:23<00:00, 4308318.13it/s]


epoch     train_loss  valid_loss
1         7.556186    0.722522    
2         6.772644    1.319921    
3         6.040480    0.024072    
4         5.195282    0.057624    
5         4.270959    0.000412    
6         3.371464    0.031938    
7         2.519676    0.027870    
8         1.802380    0.015728    
9         1.086848    0.003684    
10        0.728262    0.000010    
11        0.400146    0.000076    
12        0.224943    0.000011    
13        0.144085    0.000014    
14        0.097615    0.000005    
epoch     train_loss  valid_loss
1         0.109182    0.000003    
2         0.118539    0.000015    
3         0.158741    0.000045    
4         0.226455    0.001579    
5         0.271545    0.034429    
6         0.299770    0.000245    
7         0.295235    0.004831    
8         0.296074    0.000000    
9         0.254274    0.000000    
10        0.241025    0.000002    
11        0.206619    0.000000    
12        0.188408    0.000000    
13        0.154951    0.

In [14]:
# Settings for the high-resolution part of the curriculum:
# twice the image size, a quarter of the batch size of the first stage.
SZ = 224 * 2
BS = 64 // 4
NUM_WORKERS = 12
# NOTE(review): SEED is assigned but not referenced by any cell visible here.
SEED=0

In [13]:
# Same pipeline as the first databunch cell, rebuilt so it picks up the
# current SZ and BS values; this rebinds `data`.
data = (
    ImageItemList
        # Rows labelled 'new_whale' are excluded from training.
        .from_df(df[df.Id != 'new_whale'], os.path.join(data_dir,'train'), cols=['Image'])
        # Validation = items whose filename is in val_fns.
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        # Labels come from the filename -> Id mapping fn2label.
        .label_from_func(lambda path: fn2label[path2fn(path)])
        # Test set = images under <data_dir>/test.
        .add_test(ImageItemList.from_folder(os.path.join(data_dir,'test')))
        # Transforms without flipping; resize to SZ with ResizeMethod.SQUISH.
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='data')
        .normalize(imagenet_stats)
)

In [14]:
%%time
# Recreate the learner on the current databunch and restore the weights
# saved as '<name>-stage-2'.
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048])
learn.clip_grad();
learn.load(f'{name}-stage-2')
# freeze_to(-1): presumably leaves only the last layer group trainable — TODO confirm.
learn.freeze_to(-1)

# Stage 3: 12 epochs; the lr is the stage-1 value divided by 4
# (BS was also divided by 4 for this resolution).
learn.fit_one_cycle(12, 1e-2 / 4)
learn.save(f'{name}-stage-3')

# Stage 4: unfreeze and train with three learning rates, each 10x the previous.
learn.unfreeze()

max_lr = 1e-3 / 4
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(22, lrs)
learn.save(f'{name}-stage-4')

epoch     train_loss  valid_loss
1         1.164288    0.000000    
2         0.834836    0.000000    
3         1.353972    0.000000    
4         1.771464    0.000000    
5         1.844741    0.000000    
6         1.752141    0.000000    
7         1.625672    0.000000    
8         1.215744    0.000000    
9         1.033324    0.000000    
10        0.809320    0.000000    
11        0.669975    0.000000    
12        0.525881    0.000000    
epoch     train_loss  valid_loss
1         0.509183    0.000000    
2         0.531925    0.000000    
3         0.498649    0.000000    
4         0.618998    0.000000    
5         0.668862    0.000000    
6         0.766217    0.000000    
7         0.711975    0.000000    
8         0.708146    0.000000    
9         0.726887    0.000000    
10        0.645698    0.000000    
11        0.678513    0.000000    
12        0.580040    0.000000    
13        0.599746    0.000000    
14        0.524298    0.000000    
15        0.539903    0.

In [15]:
# with oversampling
# NOTE(review): this rebinds `df`, replacing the frame read from train.csv.
# Re-running an earlier cell that reads `df` (e.g. the one building fn2label)
# after this cell would silently use the oversampled frame instead.
df = pd.read_csv(os.path.join(data_dir,'oversampled_train_and_val.csv'))

In [16]:
# Databunch over the oversampled frame, still using the current SZ and BS.
data = (
    ImageItemList
        # Unlike the earlier databunch cells, no `Id != 'new_whale'` filter is applied
        # (presumably the oversampled CSV already excludes those rows — verify).
        .from_df(df, os.path.join(data_dir,'train'), cols=['Image'])
        # Validation = items whose filename is in val_fns.
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        # Labels are looked up in fn2label, not read from this frame's own Id column.
        .label_from_func(lambda path: fn2label[path2fn(path)])
        # Test set = images under <data_dir>/test.
        .add_test(ImageItemList.from_folder(os.path.join(data_dir,'test')))
        # Transforms without flipping; resize to SZ with ResizeMethod.SQUISH.
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='data')
        .normalize(imagenet_stats)
)

In [38]:
%%time
# Recreate the learner on the oversampled databunch and restore the weights
# saved as '<name>-stage-4'.
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048])
learn.clip_grad();
learn.load(f'{name}-stage-4')
# freeze_to(-1): presumably leaves only the last layer group trainable — TODO confirm.
learn.freeze_to(-1)

# Stage 5: 2 epochs before unfreezing.
learn.fit_one_cycle(2, 1e-2 / 4)
learn.save(f'{name}-stage-5')

# Stage 6: unfreeze and run 3 more epochs with three learning rates,
# each 10x the previous.
learn.unfreeze()

max_lr = 1e-3 / 4
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(3, lrs)
learn.save(f'{name}-stage-6')

epoch     train_loss  valid_loss
1         1.590668    0.000020    
2         0.615475    0.000000    
epoch     train_loss  valid_loss
1         0.663883    0.000000    
2         0.700664    0.000000    
3         0.567299    0.000003    
CPU times: user 1h 31min 43s, sys: 34min 36s, total: 2h 6min 20s
Wall time: 2h 6min 27s


## Predict

In [17]:
# Build a new learner on the current `data` and restore the final checkpoint.
# load() is the last expression in the cell, so its return value is echoed
# as the cell output (the Learner repr recorded below).
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048])
learn.load(f'{name}-stage-6')

Learner(data=ImageDataBunch;

Train: LabelList
y: CategoryList (76286 items)
[Category w_0003639, Category w_0003639, Category w_0003639, Category w_0003639, Category w_0003639]...
Path: /home/ys1/dataset/Humpback_Whale/train
x: ImageItemList (76286 items)
[Image (3, 700, 1050), Image (3, 700, 1050), Image (3, 700, 1050), Image (3, 700, 1050), Image (3, 700, 1050)]...
Path: /home/ys1/dataset/Humpback_Whale/train;

Valid: LabelList
y: CategoryList (1 items)
[Category w_23a388d]...
Path: /home/ys1/dataset/Humpback_Whale/train
x: ImageItemList (1 items)
[Image (3, 700, 1050)]...
Path: /home/ys1/dataset/Humpback_Whale/train;

Test: LabelList
y: CategoryList (7960 items)
[Category w_0003639, Category w_0003639, Category w_0003639, Category w_0003639, Category w_0003639]...
Path: /home/ys1/dataset/Humpback_Whale/train
x: ImageItemList (7960 items)
[Image (3, 700, 1050), Image (3, 369, 1050), Image (3, 343, 800), Image (3, 309, 1050), Image (3, 525, 1050)]...
Path: /home/ys1/dataset/Humpback_

In [18]:
# Predictions for the test split; the second returned value is discarded.
preds, _ = learn.get_preds(DatasetType.Test)

In [19]:
# Widen `preds` by one trailing column of ones; it lines up with the
# 'new_whale' entry appended to `classes` further down.
# Note: running this cell again appends yet another column.
preds = torch.cat([preds, torch.ones_like(preds[:, 0:1])], dim=1)

In [20]:
# Give the appended 'new_whale' column a constant score of 0.06 for every image.
# The column is addressed as "last" rather than by the hard-coded index 5004,
# so the assignment keeps targeting the appended column if the number of
# classes ever differs.
preds[:, -1] = 0.06

In [21]:
# Class names in prediction-column order, with 'new_whale' added at the end
# to match the extra column appended to `preds`.
classes = learn.data.classes + ['new_whale']

In [22]:
# create_submission is not defined in this notebook (presumably from the
# star-imported `utils` module). It presumably writes submit/<name>.csv.gz,
# which the next cells read back — TODO confirm.
create_submission(preds, learn.data, name, classes)

In [24]:
# Sanity check: read the gzipped submission back and show its first rows.
pd.read_csv(f'submit/{name}.csv.gz').head()

Unnamed: 0,Image,Id
0,dd3ace9c4.jpg,new_whale w_314bc30 w_736cca3 w_05d52d0 w_32043b0
1,4d3e45158.jpg,w_903c254 new_whale w_56af0a9 w_ec4bcf9 w_685115a
2,0a810e2a1.jpg,w_7b989fb new_whale w_23a388d w_8b86b95 w_c684a38
3,46471ad77.jpg,w_752ffa2 new_whale w_07768b0 w_75f6ffa w_6822dbc
4,97fa23d21.jpg,new_whale w_9d22d21 w_14ece76 w_ca11f46 w_54e7b40


In [25]:
# Fraction of submission rows whose first whitespace-separated Id is 'new_whale',
# i.e. how often 'new_whale' is the top-ranked guess.
pd.read_csv(f'submit/{name}.csv.gz').Id.str.split().apply(lambda x: x[0] == 'new_whale').mean()

0.5182160804020101

In [26]:
# Upload the gzipped submission with the Kaggle CLI; {name} is interpolated
# from the Python variable by IPython.
# NOTE(review): the recorded output below shows the `kaggle` executable was
# not found, so this run did not actually submit.
!kaggle competitions submit -c humpback-whale-identification -f submit/{name}.csv.gz -m "{name}"

/bin/sh: 1: kaggle: not found
