# Setup

**Download and install the `deepdrive_course` repository when running in Google Colab (to have access to its libraries)**

In [None]:
import sys
in_colab = 'google.colab' in sys.modules

if in_colab:
  !git clone https://github.com/abojda/deepdrive_course.git dd_course
  !pip install dd_course/ -q

## wandb login

In [None]:
import wandb
# Authenticate this session with Weights & Biases; the training cell logs through WandbLogger.
wandb.login()

In [3]:
import timm
import pytorch_lightning as pl

# Prepare data

## Download images with FiftyOne
Warning: This may take a while and requires ~5GB of disk space

In [None]:
import fiftyone as fo

# Open Images classes to fetch and the dataset splits to pull them from.
classes = ["Bathtub", "Sink"]
splits = ["train", "validation"]

# Load the matching Open Images V7 samples into `fo_raw_data`,
# requesting only the "classifications" label type.
zoo_options = dict(
    dataset_dir="fo_raw_data",
    splits=splits,
    label_types=["classifications"],
    classes=classes,
)
fo_dataset = fo.zoo.load_zoo_dataset("open-images-v7", **zoo_options)

## Export images to `data/classname` directories
Original splits contain:
- train:
  - 829 Bathtub images
  - 3359 Sink images
- validation:
  - 17 Bathtub images
  - 49 Sink images

Original validation split is only ~1.5% of all available images for these classes.

Therefore, we will combine all images into one dataset and later perform an 80/20 split on our own.

In [5]:
# Export the images of each class into their own `data/<classname>` directory.
# NOTE(review): each class is filtered independently, so an image labelled with both
# classes would be exported to both directories — confirm whether such overlap exists.
for _class in classes:
  # Keep only the samples whose `positive_labels` contain the current class.
  view = fo_dataset.filter_labels("positive_labels", fo.ViewField("label") == _class)
  print(f'\n{_class} images: {len(view)}')

  # overwrite=True clears a previous export first, so re-running this cell
  # does not accumulate duplicate copies of the images.
  view.export(
      export_dir=f'data/{_class.lower()}',
      dataset_type=fo.types.ImageDirectory,
      overwrite=True,
  )


Bathtub images: 846
 100% |█████████████████| 846/846 [1.0s elapsed, 0s remaining, 868.0 samples/s]         


INFO:eta.core.utils: 100% |█████████████████| 846/846 [1.0s elapsed, 0s remaining, 868.0 samples/s]         



Sink images: 3408
 100% |███████████████| 3408/3408 [4.3s elapsed, 0s remaining, 819.2 samples/s]      


INFO:eta.core.utils: 100% |███████████████| 3408/3408 [4.3s elapsed, 0s remaining, 819.2 samples/s]      


## Load PyTorch datasets from these directories

In [6]:
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, ToTensor, Normalize, Resize, RandAugment
from deepdrive_course.utils import stratified_train_test_split


# Values shared by the training and validation pipelines.
target_size = (224, 224)
norm_mean = (0.485, 0.456, 0.406)
norm_std = (0.229, 0.224, 0.225)

# Training pipeline: resize, random augmentation, tensor conversion, normalization.
train_transform = Compose([
    Resize(target_size),
    RandAugment(num_ops=3),
    ToTensor(),
    Normalize(mean=norm_mean, std=norm_std),
])

# Validation pipeline: the same steps without the random augmentation.
val_transform = Compose([
    Resize(target_size),
    ToTensor(),
    Normalize(mean=norm_mean, std=norm_std),
])

# No transform is attached here; the split below receives the per-subset transforms.
full_ds = ImageFolder(root='data')

# 80% of the data goes to training; the last two return values are not used.
# NOTE(review): this split runs before `seed_everything` is called later in the
# notebook — confirm whether the helper fixes its own random state.
split_options = dict(
    train_size=0.8,
    train_transform=train_transform,
    test_transform=val_transform,
)
train_ds, val_ds, _, _ = stratified_train_test_split(full_ds, **split_options)

# Config

In [20]:
import numpy as np

# Class balance: non-zero targets are counted as positives, the rest as negatives.
pos_samples = np.count_nonzero(full_ds.targets)
neg_samples = len(full_ds.targets) - pos_samples

# Experiment configuration; commented-out entries are alternative settings.
config = {
  'project_name': 'binary_bathtub_sink',
  # 'run_name': 'baseline-onecyle_lr0.0001',
  'run_name': 'baseline-onecyle_lr0.0001-weighted_loss',

  'classes': full_ds.classes,

  'timm_model': 'resnet50',
  'timm_pretrained': True,
  'timm_dropout': 0.3,

  'epochs': 30,
  'batch_size': 64,
  'lr': 1e-4,
  'seed': 42,

  # 'pos_class_weight': None,
  'pos_class_weight': neg_samples / pos_samples,

  'optimizer': 'Adam',
  # 'optimizer': 'RMSprop',
  'optimizer_kwargs': {},
}


# Scheduler settings are kept separately and merged into `config` below.
# To disable the scheduler use: scheduler=None, scheduler_interval='step', scheduler_kwargs={}
scheduler_config = {
  'scheduler': 'OneCycleLR',
  'scheduler_interval': 'step',
  'scheduler_kwargs': {
      'epochs': config["epochs"],
      'max_lr': config["lr"],
      # steps_per_epoch is updated after training DataLoader instantiation
  },
}

config.update(scheduler_config)

# Model

In [None]:
from deepdrive_course.openimages_binary.modules import LitBinaryClassifier

# Build the timm network with a single output unit, using the settings from `config`.
timm_options = dict(
    num_classes=1,
    pretrained=config['timm_pretrained'],
    drop_rate=config['timm_dropout'],
)
backbone = timm.create_model(config['timm_model'], **timm_options)

# Wrap the network in the Lightning module used for training.
model = LitBinaryClassifier(backbone, config)

# Training

## Reproducibility

In [None]:
from pytorch_lightning import seed_everything

# workers=True additionally asks Lightning to seed the DataLoader worker processes,
# which matters here because the training data is augmented randomly inside workers.
# NOTE(review): the dataset split and the model creation run in earlier cells, i.e.
# before this seed is set — confirm whether they need to be seeded as well.
seed_everything(config['seed'], workers=True)

## Initialize dataloaders

In [None]:
from torch.utils.data import DataLoader
import multiprocessing

# One worker process per CPU core for both loaders.
n_workers = multiprocessing.cpu_count()

# Training loader: reshuffled every epoch; the final partial batch is dropped.
train_dl = DataLoader(
    train_ds,
    batch_size=config['batch_size'],
    shuffle=True,
    drop_last=True,
    num_workers=n_workers,
    pin_memory=True,
)

# Validation loader: keep the final partial batch (drop_last=False) so that every
# validation sample contributes to the validation metrics.
val_dl = DataLoader(
    val_ds,
    batch_size=config['batch_size'],
    shuffle=False,
    drop_last=False,
    num_workers=n_workers,
    pin_memory=True,
)


# Update steps_per_epoch in configuration dictionary
config["scheduler_kwargs"]["steps_per_epoch"] = len(train_dl)
print(config["scheduler_kwargs"]["steps_per_epoch"])

## Define callbacks

In [24]:
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint

# Checkpoints are ranked by `val_loss`; the top three are kept in a per-run directory.
checkpoint_dir = f'{config["project_name"]}/best/{config["run_name"]}'
checkpoint_cb = ModelCheckpoint(
    dirpath=checkpoint_dir,
    filename='{epoch}-{val_loss:.2f}',
    monitor='val_loss',
    save_top_k=3,
)

# Record the learning rate at every step.
lr_monitor_cb = LearningRateMonitor(logging_interval='step')

callbacks = [checkpoint_cb, lr_monitor_cb]

In [None]:
from pytorch_lightning.loggers import WandbLogger

# Create the W&B logger for this run and store the configuration with it.
logger = WandbLogger(project=config['project_name'], name=config['run_name'])
wandb_run = logger.experiment
wandb_run.config.update(config)

# Summary metrics: minimum for losses, maximum for accuracies, for both splits.
for split in ("val", "train"):
  wandb_run.define_metric(f"{split}_loss", summary="min")
  wandb_run.define_metric(f"{split}_accuracy", summary="max")
  wandb_run.define_metric(f"{split}_balanced_accuracy", summary="max")


# The W&B run is finished even when trainer construction or fitting raises.
try:
  trainer_options = dict(
      max_epochs=config['epochs'],
      logger=logger,
      callbacks=callbacks,
      num_sanity_val_steps=0,
  )
  trainer = pl.Trainer(**trainer_options)

  trainer.fit(model, train_dl, val_dl)
finally:
  wandb.finish()