[View in Colaboratory](https://colab.research.google.com/github/bhoomit/fastai-dl1-colab/blob/master/src/notebooks/lesson1_breeds.ipynb)

# Dog breeds

https://youtu.be/JNxcznsrRb8?t=1h31m8s

# Setting up things

In [0]:
%matplotlib inline

## Install Pytorch

In [0]:
# http://pytorch.org/
# Build the '<impl><version>-<abi>' tag of the running interpreter; it is substituted
# into the wheel file name below.
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Use the 'cu80' wheel directory when nvidia-smi exists at /opt/bin, otherwise 'cpu'.
# NOTE(review): 'cu80' is presumably the CUDA 8.0 build - confirm against the download site.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'
print(accelerator)

# {accelerator} and {platform} are interpolated from the Python variables above.
!pip install --upgrade pip
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.1-{platform}-linux_x86_64.whl torchvision

## Other packages

In [0]:
!pip install -q numpy scipy pandas matplotlib sklearn tqdm livelossplot 
!pip install fastai==0.7 kaggle
!pip install --no-cache-dir -I pillow==4.0.0

## Import everything

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy
import pandas
import time
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from tqdm import trange
from itertools import zip_longest
import pickle
import sklearn
import math
import sklearn.metrics

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

torch.manual_seed(1)

In [0]:
from fastai.imports import *
from fastai.torch_imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

In [0]:
# Select GPU 0 only when CUDA is usable, so this cell does not crash on a CPU-only
# runtime (the PyTorch install cell explicitly supports the 'cpu' accelerator).
if torch.cuda.is_available():
    torch.cuda.set_device(0)

## Kaggle Setup

1. Create an account at https://www.kaggle.com/
2. Go to your account settings page: https://www.kaggle.com/YOUR_USERNAME/account
3. Click on **Create New API Token**; this downloads a `kaggle.json` file.
4. Upload `kaggle.json` to your Google Drive.

Kaggle Dog Breed Identification. Get data from https://www.kaggle.com/c/dog-breed-identification

In [0]:
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

# Authenticate this Colab session, then build a Drive v3 client.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Look up kaggle.json by name. The full Drive listing is deliberately not printed:
# it would store every file name of the account in the saved notebook output.
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id, name)"
).execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with an actionable message instead of an IndexError on kaggle_api_key[0].
    raise FileNotFoundError(
        "kaggle.json was not found in Google Drive; create a Kaggle API token and upload it first."
    )
print(kaggle_api_key)

filename = "/root/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)

request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
# The context manager closes the file even if a chunk download raises.
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

# The mode must be octal: decimal 600 equals 0o1130, which is not owner read/write.
os.chmod(filename, 0o600)

### Download the dog breed dataset

In [0]:
PATH = "data/dogbreed/"
!kaggle competitions files -c dog-breed-identification
!kaggle competitions download  -c dog-breed-identification -p '{PATH}'
!unzip '{PATH}*.zip' -d '{PATH}' > /dev/null

# Let's get started

In [0]:
sz = 224  # image size passed to tfms_from_model / get_data
arch = resnext101_64  # architecture passed to tfms_from_model and ConvLearner.pretrained
bs = 58  # batch size passed to ImageClassifierData.from_csv

In [0]:
label_csv = f'{PATH}labels.csv'
# Count the data rows with the file closed afterwards; the header is not counted (-1).
with open(label_csv) as label_file:
    n = sum(1 for _ in label_file) - 1
val_idxs = get_cv_idxs(n) # random 20% data for validation set

In [0]:
n

In [0]:
len(val_idxs)

In [0]:
# If you haven't downloaded weights.tgz yet, download the file.
#     http://forums.fast.ai/t/error-when-trying-to-use-resnext50/7555
#     http://forums.fast.ai/t/lesson-2-in-class-discussion/7452/222
!mkdir fastai
!wget -O fastai/weights.tgz http://files.fast.ai/models/weights.tgz
!tar xvfz fastai/weights.tgz -C fastai

## Initial exploration

In [0]:
!ls {PATH}

In [0]:
label_df = pd.read_csv(label_csv)

In [0]:
label_df.head()

In [0]:
label_df.pivot_table(index="breed", aggfunc=len).sort_values('id', ascending=False)

In [0]:
# Transforms for the chosen architecture and image size, with the side-on augmentations.
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)

# test_name says where the test set is; it is needed to submit to Kaggle competitions.
data = ImageClassifierData.from_csv(
    PATH,
    'train',
    f'{PATH}labels.csv',
    test_name='test',
    val_idxs=val_idxs,
    suffix='.jpg',
    tfms=tfms,
    bs=bs,
)

In [0]:
fn = PATH + data.trn_ds.fnames[0]; fn

In [0]:
img = PIL.Image.open(fn); img

In [0]:
img.size

In [0]:
size_d = {k: PIL.Image.open(PATH + k).size for k in data.trn_ds.fnames}

In [0]:
row_sz, col_sz = list(zip(*size_d.values()))

In [0]:
row_sz = np.array(row_sz); col_sz = np.array(col_sz)

In [0]:
row_sz[:5]

In [0]:
plt.hist(row_sz);

In [0]:
# Histogram of the values below 1000 only; the trailing ';' hides the returned arrays' repr.
plt.hist(row_sz[row_sz < 1000]);

In [0]:
plt.hist(col_sz);

In [0]:
# Histogram of the values below 1000 only; the trailing ';' hides the returned arrays' repr.
plt.hist(col_sz[col_sz < 1000]);

In [0]:
len(data.trn_ds), len(data.test_ds)

In [0]:
len(data.classes), data.classes[:5]

## Initial model

In [0]:
def get_data(sz, bs): # sz: image size, bs: batch size
    """Build the ImageClassifierData for image size `sz` and batch size `bs`.

    When `sz` is greater than 300 the data object is returned as built;
    otherwise the result of `data.resize(340, 'tmp')` is returned.
    """
    transforms = tfms_from_model(arch, sz, aug_tfms=transforms_side_on, max_zoom=1.1)
    model_data = ImageClassifierData.from_csv(
        PATH, 'train', f'{PATH}labels.csv', test_name='test',
        val_idxs=val_idxs, suffix='.jpg', tfms=transforms, bs=bs,
    )

    # http://forums.fast.ai/t/how-to-train-on-the-full-dataset-using-imageclassifierdata-from-csv/7761/13
    # http://forums.fast.ai/t/how-to-train-on-the-full-dataset-using-imageclassifierdata-from-csv/7761/37
    if sz > 300:
        return model_data
    # Reading the jpgs and resizing is slow for big images, so resizing them all to 340 first saves time
    return model_data.resize(340, 'tmp')

#Source:   
#    def resize(self, targ, new_path):
#        new_ds = []
#        dls = [self.trn_dl,self.val_dl,self.fix_dl,self.aug_dl]
#        if self.test_dl: dls += [self.test_dl, self.test_aug_dl]
#        else: dls += [None,None]
#        t = tqdm_notebook(dls)
#        for dl in t: new_ds.append(self.resized(dl, targ, new_path))
#        t.close()
#        return self.__class__(new_ds[0].path, new_ds, self.bs, self.num_workers, self.classes)
#File:      ~/fastai/courses/dl1/fastai/dataset.py

### Precompute

In [0]:
data = get_data(sz, bs)

In [0]:
learn = ConvLearner.pretrained(arch, data, precompute=True)

In [0]:
learn.fit(1e-2, 5)

### Augment

In [0]:
from sklearn import metrics

In [0]:
data = get_data(sz, bs)

In [0]:
learn = ConvLearner.pretrained(arch, data, precompute=True, ps=0.5)

In [0]:
learn.fit(1e-2, 2)

In [0]:
learn.precompute = False

In [0]:
learn.fit(1e-2, 5, cycle_len=1)

In [0]:
learn.save('224_pre')

In [0]:
learn.load('224_pre')

## Increase size

In [0]:
# Training on small images for a few epochs, then switching to bigger images and continuing training, is an amazingly effective way to avoid overfitting.

# http://forums.fast.ai/t/planet-classification-challenge/7824/96
# set_data doesn't change the model at all. It just gives it new data to train with.
learn.set_data(get_data(299, bs)) 
learn.freeze()

#Source:   
#    def set_data(self, data, precompute=False):
#        super().set_data(data)
#        if precompute:
#            self.unfreeze()
#            self.save_fc1()
#            self.freeze()
#            self.precompute = True
#        else:
#            self.freeze()
#File:      ~/fastai/courses/dl1/fastai/conv_learner.py

In [0]:
learn.summary()

In [0]:
learn.fit(1e-2, 3, cycle_len=1)

Validation loss is much lower than training loss, which is a sign of underfitting. `cycle_len=1` may be too short, so let's set `cycle_mult=2` to give later cycles more epochs to find better parameters.

In [0]:
# When you are under fitting, it means cycle_len=1 is too short (learning rate is getting reset before it had the chance to zoom in properly).
learn.fit(1e-2, 3, cycle_len=1, cycle_mult=2) # 1+2+4 = 7 epochs

Training loss and validation loss are getting closer and smaller. We are on the right track.

In [0]:
log_preds, y = learn.TTA() # (5, 2044, 120), (2044,)
probs = np.mean(np.exp(log_preds),0)
accuracy_np(probs, y), metrics.log_loss(y, probs)

In [0]:
len(data.val_ds.y), data.val_ds.y[:5]

In [0]:
learn.save('299_pre')

In [0]:
learn.load('299_pre')

In [0]:
learn.fit(1e-2, 1, cycle_len=2) # 1+1 = 2 epochs

In [0]:
learn.save('299_pre')

In [0]:
log_preds, y = learn.TTA()
probs = np.mean(np.exp(log_preds),0)
accuracy_np(probs, y), metrics.log_loss(y, probs)

This dataset is very similar to the ImageNet dataset, so training the convolutional layers doesn't help much. We are not going to unfreeze.

## Create submission

https://youtu.be/9C06ZPF8Uuc?t=1905

In [0]:
data.classes

In [0]:
data.test_ds.fnames

In [0]:
log_preds, y = learn.TTA(is_test=True) # use the test dataset rather than the validation dataset
probs = np.mean(np.exp(log_preds),0) # exponentiate the log-predictions, then average over axis 0
#accuracy_np(probs, y), metrics.log_loss(y, probs) # This does not make sense since the test dataset has no labels

In [0]:
probs.shape # (n_images, n_classes)

In [0]:
df = pd.DataFrame(probs)
df.columns = data.classes

In [0]:
# Turn each test file name such as 'test/<id>.jpg' into '<id>' by dropping the
# directory and the extension, instead of relying on fixed slice offsets.
test_ids = [os.path.splitext(os.path.basename(o))[0] for o in data.test_ds.fnames]
# df.insert raises if the column already exists, so guard it to keep the cell re-runnable.
if 'id' not in df.columns:
    df.insert(0, 'id', test_ids)

In [0]:
df.head()

In [0]:
# PATH already ends with '/', so no extra separator is added before 'subm/'
# (the previous form produced 'data/dogbreed//subm/').
SUBM = f'{PATH}subm/'
os.makedirs(SUBM, exist_ok=True)
# Write the submission as a gzip-compressed CSV without the DataFrame index.
df.to_csv(f'{SUBM}subm.gz', compression='gzip', index=False)

In [0]:
FileLink(f'{SUBM}subm.gz')

## Individual prediction

In [0]:
fn = data.val_ds.fnames[0]
fn

In [0]:
Image.open(PATH + fn).resize((150, 150))

In [0]:
# Method 1.
# Wrap the single file in a dataset and a DataLoader, then predict through the learner.
trn_tfms, val_tfms = tfms_from_model(arch, sz)
# np.array([0]) is presumably a dummy label for the single file - TODO confirm
ds = FilesIndexArrayDataset([fn], np.array([0]), val_tfms, PATH)
dl = DataLoader(ds)
preds = learn.predict_dl(dl)
np.argmax(preds) # index of the largest value in preds

In [0]:
learn.data.classes[np.argmax(preds)]

In [0]:
# Method 2.
# Apply the validation transforms to the opened image and predict on the array directly.
trn_tfms, val_tfms = tfms_from_model(arch, sz)
im = val_tfms(open_image(PATH + fn)) # open_image() returns numpy.ndarray
preds = learn.predict_array(im[None]) # im[None] prepends an axis (presumably the batch dimension)
np.argmax(preds) # index of the largest value in preds