## **Prepare dataset**

- Download the dataset from Kaggle https://www.kaggle.com/c/dog-breed-identification
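- Put the extracted dataset into a `data/` folder inside this root directory (the code below reads `data/train/*.jpg` and `data/labels.csv`)
- Arrange the data into the following format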

```
Downloaded format:
train/
test/
labels.csv
```


```
Preferred format:
- root_dir/
    breed1/
    breed2/
    breed3/
```

- Use `ImageFolder`, `DataLoader` to get `train_loader`

Reference: https://www.youtube.com/watch?v=nCq_vy9qE-k

In [None]:
# Install the required libraries if they are not installed yet.
!pip install pytorch_lightning
!pip install kaggle  # you have to generate kaggle.json and put into ~/.kaggle/kaggle.json
# NOTE(review): the later cells read data/train/*.jpg and data/labels.csv, so the
# downloaded files presumably have to be extracted into data/ first — confirm.
!kaggle competitions download -c dog-breed-identification # download the dataset

In [None]:
import os
import os.path as op
import shutil
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import pytorch_lightning as pl

In [None]:
# Collect every training image path and derive the image id from its file name
# (base name with ".jpg" removed).
image_paths = glob("data/train/*.jpg")
img_df = pd.DataFrame(image_paths, columns=["path"])
img_df["id"] = img_df["path"].map(
    lambda image_path: op.basename(image_path).replace(".jpg", "")
)

# Load the label table and attach it to the image paths via the shared "id" column.
label_df = pd.read_csv("data/labels.csv")
train_df = pd.merge(img_df, label_df, on="id")

In [None]:
# Preview the first rows of the merged image-path / label table.
train_df.head()

In [None]:
# Copy the whole dataset into a new folder called "train_data", laid out as
# train_data/<breed>/<id>.jpg so that each breed gets its own sub-folder
# (the "preferred format" described at the top of the notebook).
root_dir = "train_data"
for row in train_df.itertuples(index=False):
    breed_dir = f"{root_dir}/{row.breed}"
    # exist_ok=True replaces the separate existence check and makes the cell
    # safe to re-run.
    os.makedirs(breed_dir, exist_ok=True)
    shutil.copy(row.path, f"{breed_dir}/{row.id}.jpg")

In [None]:
# Preprocessing applied to every image: random 224-pixel resized crop,
# conversion to a tensor, then per-channel normalisation.
# NOTE(review): the mean/std values look like the usual ImageNet statistics — confirm.
transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

# Read the images from root_dir and serve them in shuffled batches of 32.
train_data = datasets.ImageFolder(root=root_dir, transform=transform)
train_loader = DataLoader(
    dataset=train_data,
    batch_size=32,
    shuffle=True,
    num_workers=0,
)

# Class names discovered by ImageFolder and how many there are.
classes = train_data.classes
n_classes = len(classes)

In [None]:
# Number of classes, i.e. the length of train_data.classes.
print(n_classes)

In [None]:
# Fetch a single (inputs, targets) batch from train_loader, e.g. to check that loading works.
x, y = next(iter(train_loader))

## **Transfer Learning with Pytorch lightning**

In [None]:
import torch
from torch.nn.functional import cross_entropy
from torchmetrics.functional import accuracy
from torchvision import models

# resnet50 = models.resnet50(pretrained=True)

In [None]:
# Example: call accuracy on two small label tensors (same argument order as in training_step: preds, then targets).
# NOTE(review): some torchmetrics releases require an explicit `task=` argument — confirm against the installed version.
accuracy(torch.tensor([0,1,0]), torch.tensor([0,0,0]))

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks.progress import TQDMProgressBar

class DogBreedClassifier(pl.LightningModule):
    """Transfer-learning classifier built on a pretrained ResNet-50.

    All pretrained backbone parameters are frozen; the final fully connected
    layer is replaced by a new linear layer with ``n_classes`` outputs.

    Args:
        n_classes: Number of output classes of the new final layer.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, n_classes=120, lr=1e-4):
        super().__init__()
        self.lr = lr

        resnet = models.resnet50(pretrained=True)
        # Freeze every pretrained weight first ...
        for weight in resnet.parameters():
            weight.requires_grad = False
        # ... then swap in a fresh head; it is created after the freeze loop,
        # so its parameters keep requires_grad=True.
        resnet.fc = torch.nn.Linear(resnet.fc.in_features, n_classes)
        self.backbone = resnet

    def forward(self, x):
        """Return the backbone output (one score per class) for ``x``."""
        return self.backbone(x)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one batch and log loss and accuracy."""
        inputs, targets = batch
        scores = self.backbone(inputs)
        loss = cross_entropy(scores, targets)
        self.log("train_loss", loss)
        self.log("train_accuracy", accuracy(scores, targets))
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over the module's parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# Initialize the classifier with one output per class found in the training
# folder (n_classes = len(train_data.classes)) instead of relying on the
# hard-coded default of 120.
classifier = DogBreedClassifier(n_classes=n_classes, lr=1e-4)

# Use PyTorch Lightning to fit on train_loader: 5 epochs on one GPU, with a
# TQDM progress bar (refresh_rate=1).
# Tip: add fast_dev_run=True for a quick smoke test.
trainer = pl.Trainer(callbacks=[TQDMProgressBar(refresh_rate=1)], max_epochs=5, gpus=1)
trainer.fit(classifier, train_loader)

In [None]:
# (Re)load the TensorBoard notebook extension and open it on the lightning_logs/ directory.
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

## **Adding auto learning rate finder**

- Using `auto_lr_find` (automatically find learning rate)

In [None]:
class DogBreedClassifierPlus(pl.LightningModule):
    """ResNet-50 transfer-learning classifier that also records ``n_classes``.

    The pretrained backbone parameters are frozen and the last fully connected
    layer is replaced by a new linear layer with ``n_classes`` outputs.

    Args:
        n_classes: Number of output classes; also stored as ``self.n_classes``.
        lr: Learning rate stored as ``self.lr`` and passed to Adam.
    """

    def __init__(self, n_classes=120, lr=1e-4):
        super().__init__()
        self.n_classes = n_classes
        self.lr = lr

        pretrained_net = models.resnet50(pretrained=True)
        # Disable gradients for all pretrained weights before attaching the
        # new head, so only the head stays trainable.
        for tensor in pretrained_net.parameters():
            tensor.requires_grad = False
        head_inputs = pretrained_net.fc.in_features
        pretrained_net.fc = torch.nn.Linear(head_inputs, n_classes)
        self.backbone = pretrained_net

    def forward(self, x):
        """Run ``x`` through the backbone and return its output."""
        return self.backbone(x)

    def training_step(self, batch, batch_idx):
        """Return the cross-entropy loss of one batch; logs loss and accuracy."""
        features, labels = batch
        outputs = self.backbone(features)
        loss = cross_entropy(outputs, labels)
        self.log("train_loss", loss)
        self.log("train_accuracy", accuracy(outputs, labels))
        return loss

    def configure_optimizers(self):
        """Create the Adam optimizer over ``self.parameters()`` with ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# Initialize the classifier with one output per class found in the training
# folder (n_classes = len(train_data.classes)) instead of the default of 120.
classifier = DogBreedClassifierPlus(n_classes=n_classes)

# Build (but do not yet fit) a trainer with the automatic learning-rate
# finder switched on; the next cell runs the finder explicitly.
# Tip: add fast_dev_run=True for a quick smoke test.
trainer = pl.Trainer(
    callbacks=[TQDMProgressBar(refresh_rate=1)],
    max_epochs=1,
    gpus=1,
    auto_lr_find=True,
)

In [None]:
# Run the trainer's learning-rate finder for classifier on train_loader and keep the result object.
lr_finder = trainer.tuner.lr_find(classifier, train_loader)

In [None]:
lr_finder.suggestion()  # learning rate suggested by the finder

In [None]:
# Train a fresh classifier with the learning rate suggested by the finder.
# n_classes is passed explicitly so the output layer matches the classes found
# in the training folder rather than the hard-coded default of 120.
classifier = DogBreedClassifier(n_classes=n_classes, lr=lr_finder.suggestion())

trainer = pl.Trainer(
    callbacks=[TQDMProgressBar(refresh_rate=1)],
    max_epochs=5,
    gpus=1,
)
trainer.fit(classifier, train_loader)

In [None]:
# (Re)load the TensorBoard notebook extension and open it on the lightning_logs/ directory.
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

## **Prediction** 

In [None]:
from torch.nn.functional import softmax

In [None]:
# Take one batch of images and labels from train_loader (this is training data, not a held-out set).
images, labels = next(iter(train_loader))

In [None]:
# Switch to eval mode so layers such as batch norm use their stored statistics
# instead of the statistics of this batch, and disable gradient tracking
# because no backward pass is needed for prediction.
classifier.eval()
with torch.no_grad():
    # Send the batch to whichever device the model currently lives on rather
    # than assuming CUDA.
    y_pred = classifier(images.to(classifier.device))
    # Convert scores to probabilities and pick the most likely class index.
    y_pred = softmax(y_pred, dim=-1).argmax(dim=-1)

In [None]:
# Display the predicted class indices next to the true labels.
y_pred, labels

## **Homework**

- Try changing `max_epochs`
- Try changing the pre-trained model in `backbone`. See how it affects the final accuracy
- Observe the difference between `DogBreedClassifier` and `DogBreedClassifierPlus`. What happens after `current_epoch`? Hint: you can look at TensorBoard.