## **Prepare dataset**

- Download the dataset from Kaggle https://www.kaggle.com/c/dog-breed-identification
- Put the dataset in this root directory
- Rearrange the data from the downloaded format into the preferred format shown below

```
Downloaded format:
train/
test/
labels.csv
```


```
Preferred format:
- root_dir/
    breed1/
    breed2/
    breed3/
```

- Use `ImageFolder` and `DataLoader` to build `train_loader`

Reference: https://www.youtube.com/watch?v=nCq_vy9qE-k

In [None]:
import os
import os.path as op
import shutil
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import pytorch_lightning as pl

In [None]:
# Collect every training image path and derive each image id from its
# file name (base name with ".jpg" removed).
image_paths = glob("train/*.jpg")
img_df = pd.DataFrame(image_paths, columns=["path"])
img_df["id"] = img_df["path"].map(lambda p: op.basename(p).replace(".jpg", ""))

# Load the label table and attach a label row to each image via the shared
# "id" column (images without a matching id are dropped by the inner merge).
label_df = pd.read_csv("labels.csv")
train_df = pd.merge(img_df, label_df, on="id")

In [None]:
# preview the first rows of the merged image-path / label table
train_df.head()

In [None]:
# Save the whole dataset to a new folder called "train_data", using one
# sub-folder per breed so the images can be read class-by-class.
root_dir = "train_data"

# Create each breed folder once up front. `exist_ok=True` makes re-running
# this cell safe and replaces the separate existence check per image.
for breed in train_df["breed"].unique():
    os.makedirs(op.join(root_dir, breed), exist_ok=True)

# Copy every image into the folder of its breed, named "<id>.jpg".
for row in train_df.itertuples(index=False):
    shutil.copy(row.path, op.join(root_dir, row.breed, f"{row.id}.jpg"))

In [None]:
# Per-channel mean / standard deviation handed to Normalize
# (three values each, one per image channel).
channel_mean = (0.485, 0.456, 0.406)
channel_std = (0.229, 0.224, 0.225)

# Preprocessing pipeline: random resized crop to 224 -> tensor -> normalize.
transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
)

# Read the images from the per-breed folders under `root_dir` and serve them
# in shuffled batches of 32, loaded in the main process (num_workers=0).
train_data = datasets.ImageFolder(root_dir, transform=transform)
train_loader = DataLoader(train_data, shuffle=True, batch_size=32, num_workers=0)

# Class names reported by the dataset, and how many there are.
classes = train_data.classes
n_classes = len(classes)

In [None]:
# number of classes the dataset reported (length of `train_data.classes`)
print(n_classes)

In [None]:
# pull a single batch from the loader as a quick sanity check
# (presumably x holds the image tensors and y the class indices -- verify)
x, y = next(iter(train_loader))

## **Transfer Learning with PyTorch Lightning**

In [None]:
import torch
from torch.nn.functional import cross_entropy
from torchvision import models

# Instantiate torchvision's ResNet-50 with `pretrained=True`.
# NOTE(review): the classifier classes in this notebook build their own
# backbone, so this instance appears unused -- confirm it is still needed.
resnet50 = models.resnet50(pretrained=True)

In [None]:
import pytorch_lightning as pl

class DogBreedClassifier(pl.LightningModule):
    """Pretrained ResNet-50 followed by a trainable linear classification layer.

    During training the backbone is always evaluated under
    ``torch.no_grad()``, so only ``finetune_layer`` receives gradients.

    Args:
        n_classes: Number of output classes (output size of the linear layer).
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, n_classes: int = 120, lr: float = 1e-4) -> None:
        super().__init__()

        # Honor the caller-supplied learning rate instead of a fixed 1e-4.
        self.lr = lr
        self.backbone = models.resnet50(pretrained=True)
        # The new layer is stacked on top of the backbone's own `fc` output,
        # hence its input size is `fc.out_features`.
        self.finetune_layer = torch.nn.Linear(self.backbone.fc.out_features, n_classes)

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        features = self.backbone(x)
        preds = self.finetune_layer(features)
        return preds

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        x, y = batch
        # No gradients flow into the backbone: it is used as a fixed
        # feature extractor here.
        with torch.no_grad():
            features = self.backbone(x)
        preds = self.finetune_layer(features)
        loss = cross_entropy(preds, y)
        self.log("train_loss", loss)  # logging
        return loss

    def configure_optimizers(self):
        """Return the Adam optimizer Lightning should use for this module."""
        # The optimizer must be returned; returning None leaves the trainer
        # without an optimizer and no weights are updated.
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
# initialize class instance; size the output layer from the classes actually
# found in the dataset rather than relying on the constructor default
classifier = DogBreedClassifier(n_classes=n_classes)

# using pytorch lightning to fit the train_loader
trainer = pl.Trainer(progress_bar_refresh_rate=20, max_epochs=5) # fast_dev_run=True, gpus=1
trainer.fit(classifier, train_loader)

## **Make it a bit more complex**

In [None]:
from torchmetrics.functional import accuracy

In [None]:
# example: the first argument holds the predictions, the second the targets;
# here two of the three positions agree
accuracy(torch.tensor([0,1,0]), torch.tensor([0,0,0]))

In [None]:
class DogBreedClassifierPlus(pl.LightningModule):
    """Pretrained ResNet-50 plus a linear layer, with delayed backbone training.

    For the first ``unfreeze_epoch`` epochs the backbone is evaluated under
    ``torch.no_grad()`` so only ``finetune_layer`` is trained; from that epoch
    on, gradients also flow through the backbone.

    Args:
        n_classes: Number of output classes (output size of the linear layer).
        lr: Learning rate handed to the Adam optimizer.
        unfreeze_epoch: First epoch (0-based, compared against
            ``trainer.current_epoch``) in which the backbone receives
            gradients. Defaults to 10.
    """

    def __init__(self, n_classes: int = 120, lr: float = 1e-4, unfreeze_epoch: int = 10) -> None:
        super().__init__()
        # Stores the constructor arguments under `self.hparams`.
        self.save_hyperparameters()

        self.backbone = models.resnet50(pretrained=True)
        # The new layer is stacked on top of the backbone's own `fc` output,
        # hence its input size is `fc.out_features`.
        self.finetune_layer = torch.nn.Linear(self.backbone.fc.out_features, n_classes)

    def forward(self, x):
        """Return the class scores for the batch ``x``."""
        features = self.backbone(x)
        preds = self.finetune_layer(features)
        return preds

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log loss and accuracy."""
        x, y = batch
        # Only train the linear layer during the early epochs; afterwards
        # let gradients reach the backbone as well.
        if self.trainer.current_epoch < self.hparams.unfreeze_epoch:
            with torch.no_grad():
                features = self.backbone(x)
        else:
            features = self.backbone(x)
        preds = self.finetune_layer(features)
        loss = cross_entropy(preds, y)
        self.log("train_loss", loss)  # logging loss
        self.log("train_accuracy", accuracy(preds, y))  # logging accuracy
        return loss

    def configure_optimizers(self):
        """Return the Adam optimizer Lightning should use for this module."""
        # The optimizer must be returned; returning None leaves the trainer
        # without an optimizer and no weights are updated.
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [None]:
# initialize class instance; size the output layer from the classes actually
# found in the dataset rather than relying on the constructor default
classifier = DogBreedClassifierPlus(n_classes=n_classes)

# using pytorch lightning to fit the train_loader
trainer = pl.Trainer(progress_bar_refresh_rate=1, max_epochs=5, fast_dev_run=True)
trainer.fit(classifier, train_loader)

## **Homework**

- Try changing `max_epochs`
- Try changing the pre-trained model in `backbone`. See how it affects the final accuracy
- Observe the difference between `DogBreedClassifier` and `DogBreedClassifierPlus`. What happens once `current_epoch` reaches 10? Hint: you can look at the training curves in TensorBoard.