In [None]:
# Reload the autoreload extension so edits to imported modules are picked up.
%reload_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

In [None]:
# Root of the dataset: labels.csv, train/, test/ and results/ are all resolved relative to it.
PATH = "data/plant-seedlings/"

In [None]:
# Image size, backbone architecture and batch size for the first training stage.
sz = 224
arch = resnet50
bs = 24
# TODO: try other settings, e.g. (224, 58) and (299, 58) — presumably (sz, bs) pairs; confirm.

In [None]:
label_csv = f'{PATH}labels.csv'
# Count the data rows (every line except the header). The file is opened in a
# `with` block so the handle is closed right away; the earlier
# `len(list(open(...)))` form left it open until garbage collection.
with open(label_csv) as label_file:
    n = sum(1 for _ in label_file) - 1
# Validation indices drawn from the n labelled rows.
val_idxs = get_cv_idxs(n)

In [None]:
# Load the label table; later cells use its 'species' and 'file' columns.
label_df = pd.read_csv(label_csv)

## Replacing spaces in class names with underscores

In [None]:
# Replace spaces in class names with underscores using the vectorised string
# accessor (instead of a per-row apply + re.sub), then overwrite the label CSV
# in place so that get_data(), which reads the same file, sees the new names.
label_df['species'] = label_df['species'].str.replace(' ', '_')
label_df.to_csv(label_csv, index=False)

## Pivot table

In [None]:
# Row count per species, largest class first.
species_counts = label_df.pivot_table(index='species', aggfunc=len)
species_counts.sort_values('file', ascending=False)

## Analysis

In [None]:
# First size component of every training image. PIL reports `size` as
# (width, height), so index 0 is the width. Each image is opened in a `with`
# block so its file handle is closed immediately instead of leaking one
# handle per image.
train_dir = f'{PATH}train/'
row_sz = []
for k in os.listdir(train_dir):
    with PIL.Image.open(train_dir + k) as img:
        row_sz.append(img.size[0])

In [None]:
# Second size component of every training image. PIL reports `size` as
# (width, height), so index 1 is the height. Each image is opened in a `with`
# block so its file handle is closed immediately instead of leaking one
# handle per image.
train_dir = f'{PATH}train/'
col_sz = []
for k in os.listdir(train_dir):
    with PIL.Image.open(train_dir + k) as img:
        col_sz.append(img.size[1])

In [None]:
# Convert both size lists to NumPy arrays so they can be filtered with boolean masks.
row_sz = np.array(row_sz)
col_sz = np.array(col_sz)

In [None]:
# Histogram of row_sz restricted to values below 500.
row_below_500 = row_sz < 500
plt.hist(row_sz[row_below_500]);

In [None]:
# Histogram of col_sz restricted to values below 500.
col_below_500 = col_sz < 500
plt.hist(col_sz[col_below_500]);

## Training setup

In [None]:
def get_data(sz, bs):
    """Build the image-classifier data object for image size `sz` and batch size `bs`.

    Reads the notebook-level `arch`, `PATH` and `val_idxs`. Transforms come from
    `tfms_from_model` with top-down augmentation and `max_zoom=1.1`. When `sz`
    is 300 or less the result of `data.resize(340, 'tmp')` is returned;
    otherwise the data object is returned unchanged.
    """
    augmentations = tfms_from_model(arch, sz, aug_tfms=transforms_top_down, max_zoom=1.1)
    model_data = ImageClassifierData.from_csv(
        PATH, 'train', f'{PATH}labels.csv',
        test_name='test', val_idxs=val_idxs, tfms=augmentations, bs=bs)
    if sz > 300:
        return model_data
    return model_data.resize(340, 'tmp')

In [None]:
# Data object for the initial image size and batch size.
data = get_data(sz, bs)

In [None]:
# Learner built from the pretrained architecture, with precompute enabled.
learn = ConvLearner.pretrained(arch, data, precompute=True)

In [None]:
# Learning-rate finder; its schedule is plotted in the next cell.
learn.lr_find()

In [None]:
learn.sched.plot()

In [None]:
# Base learning rate; the `lrs` array defined in a later cell is derived from it.
lr = 1e-2

In [None]:
learn.fit(lr, 2, cycle_len=2)

In [None]:
# NOTE(review): this rebinds `learn` to a fresh learner (now with ps=0.5), so the
# result of the fit above is discarded. ps is presumably the dropout probability — confirm.
learn = ConvLearner.pretrained(arch, data, precompute=True, ps=0.5)

In [None]:
learn.fit(lr, 2, cycle_len=2)

In [None]:
# Turn off precompute for the following fits.
learn.precompute = False
# bn_freeze is a Learner method. The earlier `learn.bn_freeze=True` only
# replaced that method with a bool on the instance and never froze the
# batch-norm layers; calling it applies the freeze to the model.
learn.bn_freeze(True)

In [None]:
learn.fit(lr, 2, cycle_len=1, cycle_mult=2)

In [None]:
learn.unfreeze()

In [None]:
# Three learning rates derived from lr (lr/20, lr/10, lr), used for the unfrozen fits;
# presumably one per layer group — confirm.
lrs = np.array([lr/20,lr/10,lr])

In [None]:
# The same fit call is repeated three times; each continues from the previous state of `learn`.
learn.fit(lrs, 2, cycle_len=1, cycle_mult=2)

In [None]:
learn.fit(lrs, 2, cycle_len=1, cycle_mult=2)

In [None]:
learn.fit(lrs, 2, cycle_len=1, cycle_mult=2)

In [None]:
# Checkpoint named after the current image size.
learn.save(f'{sz}')

In [None]:
# Next stage: image size 299, same learner.
sz = 299

In [None]:
learn.set_data(get_data(sz, bs))

In [None]:
learn.freeze()
learn.fit(lr, 2, cycle_len=2)

In [None]:
# NOTE(review): the literal 0.025 equals the `wd` value assigned a few cells below.
learn.fit(lr, 2, cycle_len=1, wds=0.025, use_wd_sched=True, cycle_mult=2)

In [None]:
learn.unfreeze()

In [None]:
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

In [None]:
# Base weight-decay value; the wds lists passed to fit() are fractions of it.
wd=0.025

In [None]:
learn.fit(lrs, 2, cycle_len=1, wds=[wd/18, wd/9, wd/2], use_wd_sched=True, cycle_mult=2)

In [None]:
# Checkpoint named after the current image size.
learn.save(f'{sz}')

In [None]:
# Next stage: image size 400 (above the 300 threshold in get_data, so no resize step is applied).
sz = 400

In [None]:
# sklearn metrics, used further down for the F1 score.
from sklearn import metrics

In [None]:
learn.set_data(get_data(sz, bs))

In [None]:
learn.freeze()
learn.fit(lr, 2, cycle_len=1)

In [None]:
learn.fit(lr, 2, cycle_len=1, wds=wd, use_wd_sched=True, cycle_mult=2)

In [None]:
learn.unfreeze()

In [None]:
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

In [None]:
# Checkpoint before the weight-decay fit.
learn.save(f'{sz}')

In [None]:
learn.fit(lrs, 2, cycle_len=1, wds=[wd/18, wd/9, wd/2], use_wd_sched=True, cycle_mult=2)

In [None]:
# Separate checkpoint ('<sz>_wds') for the weights after the weight-decay fit.
learn.save(f'{sz}'+'_wds')

In [None]:
# Reload the checkpoint that was just saved.
learn.load(f'{sz}'+'_wds')

In [None]:
# Next stage: image size 450.
sz = 450

In [None]:
# Batch size is hard-coded to 16 here instead of using `bs`.
learn.set_data(get_data(sz, 16))

In [None]:
learn.freeze()
learn.fit(lr, 2, cycle_len=1, cycle_mult=2)

In [None]:
learn.unfreeze()
learn.fit(lrs, 2, cycle_len=1, cycle_mult=2)

In [None]:
# Checkpoint before the weight-decay fit.
learn.save(f'{sz}')

In [None]:
learn.fit(lrs, 2, cycle_len=1, wds=[wd/18, wd/9, wd/2], use_wd_sched=True, cycle_mult=2)

In [None]:
# Separate checkpoint ('<sz>_wds') for the weights after the weight-decay fit.
learn.save(f'{sz}'+'_wds')

In [None]:
# Next stage: image size 500.
sz = 500

In [None]:
# Batch size is hard-coded to 16 here instead of using `bs`.
learn.set_data(get_data(sz, 16))

In [None]:
learn.freeze()
learn.fit(lr, 2, cycle_len=1, cycle_mult=2)

In [None]:
learn.unfreeze()
learn.fit(lrs, 2, cycle_len=1, cycle_mult=2)

In [None]:
# Checkpoint before the weight-decay fit.
learn.save(f'{sz}')

In [None]:
learn.fit(lrs, 2, cycle_len=1, wds=[wd/18, wd/9, wd/2], use_wd_sched=True, cycle_mult=2)

In [None]:
# Separate checkpoint ('<sz>_wds') for the weights after the weight-decay fit.
learn.save(f'{sz}'+'_wds')

In [None]:
# Reload the checkpoint saved above (sz is 500 at this point, so the name is '500_wds').
learn.load('500_wds')

## Old steps

In [None]:
# NOTE(review): these cells reuse the same `learn` object, so running them after the
# cells above replaces its data and keeps training the same model.
learn.set_data(get_data(299, bs))
learn.freeze()

In [None]:
learn.fit(1e-2, 3, cycle_len=1)

In [None]:
learn.fit(1e-2, 3, cycle_len=1, cycle_mult=2)

In [None]:
learn.save('299_r50')

In [None]:
learn.load('299_r50')

In [None]:
# Same recipe at image size 350.
learn.set_data(get_data(350, bs))
learn.freeze()

In [None]:
learn.fit(1e-2, 3, cycle_len=1)

In [None]:
learn.fit(1e-2, 3, cycle_len=1, cycle_mult=2)

In [None]:
learn.save('350_r50')

In [None]:
learn.load('350_r50')

In [None]:
# Test-time augmentation; called without is_test, so presumably on the validation set — confirm.
log_preds, y = learn.TTA()
# Exponentiate the log-predictions to obtain probabilities.
probs = np.exp(log_preds)

In [None]:
# NOTE(review): a later cell averages TTA output over axis 0; if TTA returns one
# prediction set per augmentation here as well, this needs the same averaging — confirm.
accuracy(log_preds, y)

In [None]:
# f1_score needs the true labels and the predicted labels; the earlier bare call
# `metrics.f1_score()` raised TypeError. Multi-class targets also require an
# explicit `average`; micro-averaging is used here — NOTE(review): confirm this
# matches the competition metric.
# Depending on the fastai version, TTA may return one prediction set per
# augmentation (3-D) or already-averaged predictions (2-D); average over axis 0
# in the 3-D case before taking the arg-max.
val_probs = probs if probs.ndim == 2 else probs.mean(axis=0)
metrics.f1_score(y, np.argmax(val_probs, axis=1), average='micro')

In [None]:
# Saves under the same name as the earlier '350_r50' checkpoint, overwriting it.
learn.save('350_r50')

## Prediction

In [None]:
# Test-time augmentation on the test set (is_test=True).
log_preds, y = learn.TTA(is_test=True)

In [None]:
log_preds.shape

In [None]:
# Exponentiate the log-predictions and average over axis 0
# (presumably the augmentation axis — confirm against the shape printed above).
probs = np.mean(np.exp(log_preds), axis=0); probs

In [None]:
def save_array(fname, arr):
    """Store `arr` as a bcolz carray rooted at `fname` (opened with mode 'w') and flush it."""
    carray = bcolz.carray(arr, rootdir=fname, mode='w')
    carray.flush()

In [None]:
# NOTE(review): the output name is hard-coded to 'probs_500.bc' regardless of which
# checkpoint `learn` currently holds — make sure the 500px model is loaded before running.
save_array('probs_500.bc', probs)

## Save to file for submission

In [None]:
def load_array(fname):
    """Open the bcolz array stored at `fname` and return the result of slicing it with `[:]`."""
    stored = bcolz.open(fname)
    return stored[:]

In [None]:
# Load the saved test-set predictions for the 400, 450 and 500 models.
# NOTE(review): the cells shown here only write probs_500.bc; probs_400.bc and
# probs_450.bc must already exist on disk from earlier runs.
probs_400 = load_array('probs_400.bc')
probs_450 = load_array('probs_450.bc')
probs_500 = load_array('probs_500.bc')

In [None]:
# Equal-weight average of the three saved prediction sets. The earlier
# expression added probs_450 twice and never used probs_400.
probs = (probs_400 + probs_450 + probs_500)/3

In [None]:
# Class names; indexed by predicted class id further down.
data.classes

In [None]:
# One row per prediction holding the index of the highest-probability class.
df = pd.DataFrame(np.argmax(probs, axis=1))

In [None]:
# o[5:-4] drops the first 5 and last 4 characters of each test file name
# (presumably the 'test/' prefix and the '.png' extension — confirm).
# NOTE: insert() raises if this cell is re-run, because the 'file' column then already exists.
df.insert(0, 'file', [o[5:-4] for o in data.test_ds.fnames])

In [None]:
df.columns = ['file', 'species']

In [None]:
# Map each predicted class id to its class name.
df.species = df.species.apply(lambda x: data.classes[x])

In [None]:
# Turn underscores back into spaces, reversing the substitution applied to the labels earlier.
df.species = df.species.apply(lambda x: re.sub('_', ' ', x))

In [None]:
# Append the '.png' extension to every file id.
df.file = df.file.apply(lambda x: str(x) + '.png')

In [None]:
# Directory for submission files; created if it does not exist yet.
SUBM = f'{PATH}results/'
os.makedirs(SUBM, exist_ok=True)
# Write the submission table without the index column.
submission_file = f'{SUBM}submission-5.csv'
df.to_csv(submission_file, index=False)
# Gzip-compressed alternative, kept for reference:
#df.to_csv(f'{SUBM}subm.gz', compression='gzip', index=False)