In [1]:
# Snapshot of the GPU state (driver/CUDA version, memory use, utilisation) before the run
!nvidia-smi

Sat Jul  4 15:32:03 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.36       Driver Version: 440.36       CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  Off  | 00000000:01:00.0 Off |                  N/A |
| 64%   72C    P2   249W / 280W |  10933MiB / 11170MiB |    100%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX 108...  Off  | 00000000:02:00.0 Off |                  N/A |
|  0%   40C    P5    18W / 280W |      0MiB / 11178MiB |      3%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

### Google Colaboratory setup

Clone the repository contents into the VM and install the dependencies using the setup script:

```python
# (1) Replace contents of VM
!rm -rf sample_data
# (Replace username and password/token with your own credentials; do not commit or share the notebook with real values filled in)
!git clone --single-branch --branch master https://username:password@github.com/aaossa/CuratorNet-experiments.git
!cp -a CuratorNet-experiments/. .
!rm -r CuratorNet-experiments/
# Setup VM using script
!chmod +x ./scripts/colaboratory.sh
!./scripts/colaboratory.sh requirements/dev.txt
```

Mount Google Drive in case the data is available there:

```python
# (2) Mount Google Drive
from google.colab import drive
drive.mount("/content/drive")
```

Extract the data into the expected folder (`data/UGallery`):

```python
# (3) Bring actual data to VM
# Extract data from mounted drive to data folder
!tar -xvzf "/content/drive/My Drive/path_to_data/data.tar.gz" -C data/UGallery
```

**Important:** Restart the VM after completing the steps above to make sure you are using the right versions of the declared requirements.

In [2]:
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data.sampler import RandomSampler, SequentialSampler

from datasets import UGalleryDataset
from models import CuratorNet
from samplers import SameProfileSizeBatchSampler
from trainers import train_with_batch_samplers, train_with_dataloaders
from utils.memory import max_memory_stats, memory_report


# Parameters
# Seed applied to the Python, PyTorch and NumPy RNGs (set to None to skip seeding)
RNG_SEED = 0
# Input files, relative to the working directory
EMBEDDING_PATH = os.path.join("data", "UGallery", "ugallery_embedding.npy")
TRAINING_PATH = os.path.join("data", "UGallery", "train.csv")
VALIDATION_PATH = os.path.join("data", "UGallery", "validation.csv")
# Output locations handed to the training routine
SUMMARY_WRITER_DIR = os.path.join("runs", "CuratorNet")
CHECKPOINTS_DIR = os.path.join("checkpoints", "CuratorNet")
# Request the GPU; the CPU is used when CUDA is unavailable
USE_GPU = True

# Parameters (training)
# Keys are namespaced as "<component>:<option>" and read back by the cells below.
SETTINGS = {
    "batch_sampler:batch_size": 4096 * 3,
    "batch_sampler:profile_items_per_batch": 60_000,
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to 0 (load in the main process) so DataLoader still gets an int.
    "dataloader:num_workers": os.cpu_count() or 0,
    "dataloader:pin_memory": True,
    "optimizer:lr": 0.0001,
    "optimizer:weight_decay": 0.0001,
    "scheduler:factor": 0.6,
    "scheduler:patience": 2,
    "scheduler:threshold": 1e-4,  # Default value (https://pytorch.org/docs/stable/optim.html#torch.optim.lr_scheduler.ReduceLROnPlateau)
    "train:max_epochs": 150,
    "train:max_lrs": 10,
    "train:non_blocking": True,
    "train:train_per_valid_times": 1,
}


In [3]:
%%time
# Freezing RNG seed if needed (Python, PyTorch and NumPy generators)
if RNG_SEED is not None:
    print("\nUsing random seed...")
    random.seed(RNG_SEED)
    torch.manual_seed(RNG_SEED)
    np.random.seed(RNG_SEED)

# Load embedding from file
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so only
# load embedding files that come from a trusted source.
print(f"\nLoading embedding from file... ({EMBEDDING_PATH})")
embedding = np.load(EMBEDDING_PATH, allow_pickle=True)

# Reshape embedding
# Column 1 of the loaded array holds one feature vector per row; stack them
# into a single 2-D (rows, features) matrix. np.stack raises if the vectors
# do not all share the same shape instead of silently mis-shaping the result.
print("\nReshape embedding")
embedding = np.stack(embedding[:, 1])


def _build_split(csv_file, sampler_cls, label):
    """Build the dataset, samplers and DataLoader for one data split.

    Args:
        csv_file: Path of the CSV file passed to ``UGalleryDataset``.
        sampler_cls: Index sampler class instantiated on the dataset
            (``RandomSampler`` or ``SequentialSampler``).
        label: Human-readable split name used in the progress messages.

    Returns:
        Tuple ``(dataset, sampler, batch_sampler, dataloader)``.
    """
    dataset = UGalleryDataset(
        csv_file=csv_file,
    )
    print(f">> {label} dataset: {len(dataset)}")
    sampler = sampler_cls(dataset)
    batch_sampler = SameProfileSizeBatchSampler(
        sampler=sampler,
        batch_size=SETTINGS["batch_sampler:batch_size"],
        profile_items_per_batch=SETTINGS["batch_sampler:profile_items_per_batch"],
    )
    # NOTE(review): the batch sampler is passed as `sampler` (not
    # `batch_sampler`); presumably the dataset accepts a whole batch of
    # indices at once - confirm before changing.
    dataloader = DataLoader(
        dataset,
        sampler=batch_sampler,
        num_workers=SETTINGS["dataloader:num_workers"],
        pin_memory=SETTINGS["dataloader:pin_memory"],
    )
    print(f">> {label} dataloader: {len(dataloader)}")
    return dataset, sampler, batch_sampler, dataloader


# DataLoaders initialization
print("\nInitialize DataLoaders")
# Training DataLoader (indices drawn in random order)
train_dataset, train_sampler, train_batch_sampler, train_dataloader = _build_split(
    TRAINING_PATH, RandomSampler, "Training",
)
# Validation DataLoader (indices visited in order)
valid_dataset, valid_sampler, valid_batch_sampler, valid_dataloader = _build_split(
    VALIDATION_PATH, SequentialSampler, "Validation",
)

# Model initialization
print("\nInitialize model")
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available and USE_GPU else "cpu")
# Any mismatch between the request and the hardware means the CPU is used
if cuda_available != USE_GPU:
    print((f"\nNotice: Not using GPU - "
           f"Cuda available ({cuda_available}) "
           f"does not match USE_GPU ({USE_GPU})"
    ))
model = CuratorNet(torch.Tensor(embedding), input_size=embedding.shape[1]).to(device)

# Training setup
print("\nSetting up training")
optimizer = optim.Adam(
    model.parameters(),
    lr=SETTINGS["optimizer:lr"],
    weight_decay=SETTINGS["optimizer:weight_decay"],
)
# Per-element losses are summed (not averaged) over each batch
criterion = nn.BCEWithLogitsLoss(reduction="sum")
# mode="max": the learning rate is reduced when the monitored metric stops
# increasing (presumably validation accuracy - confirm in the trainer)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="max", factor=SETTINGS["scheduler:factor"],
    patience=SETTINGS["scheduler:patience"], verbose=True,
    threshold=SETTINGS["scheduler:threshold"],
)

# Current memory status
print("\nMemory report")
memory_report()
print("\nMax memory stats")
max_memory_stats(device)

# Training
print("\nTraining")



Using random seed...

Loading embedding from file... (data/UGallery/ugallery_embedding.npy)

Reshape embedding

Initialize DataLoaders
>> Training dataset: 10000494
>> Training dataloader: 893
>> Validation dataset: 502068
>> Validation dataloader: 71

Initialize model

Setting up training

Memory report
Main process PID: 12618
CPU RAM free: 53.3 GB | Proc. size: 2.5 GB
GPU 0 (GeForce GTX 1080 Ti)
GPU RAM free: 237 MB | Used: 10933 MB | Util.: 98% | Total: 11170 MB
GPU 1 (GeForce GTX 1080 Ti)
GPU RAM free: 10379 MB | Used: 799 MB | Util.: 7% | Total: 11178 MB

Max memory stats
Device: 'cuda:0'
Max memory allocated: 222.6 MB
Max memory reserved:  241.2 MB

Training
CPU times: user 7.99 s, sys: 912 ms, total: 8.9 s
Wall time: 9.01 s


In [4]:
%%time
# Training
# One batch sampler per phase, keyed by the phase name the trainer expects
phase_batch_samplers = {
    "train": train_batch_sampler,
    "validation": valid_batch_sampler,
}
# Stopping limits, validation cadence and output directories all come from
# the configuration cell.
best_model, best_acc, best_epoch, last_model = train_with_batch_samplers(
    model,
    device,
    criterion,
    optimizer,
    scheduler,
    phase_batch_samplers,
    max_epochs=SETTINGS["train:max_epochs"],
    max_lrs=SETTINGS["train:max_lrs"],
    train_per_valid_times=SETTINGS["train:train_per_valid_times"],
    checkpoint_dir=CHECKPOINTS_DIR,
    writer_dir=SUMMARY_WRITER_DIR,
)


>> Checkpoints stored at... checkpoints/CuratorNet/CuratorNet_2020-07-04-15-32-13.tar
>> Summary writer data stored at... runs/CuratorNet/CuratorNet_2020-07-04-15-32-13


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=150.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Train', max=893.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Valid', max=71.0, style=ProgressStyle(description_width='…

Epoch    13: reducing learning rate of group 0 to 6.0000e-05.
Epoch    22: reducing learning rate of group 0 to 3.6000e-05.
Epoch    39: reducing learning rate of group 0 to 2.1600e-05.
Epoch    47: reducing learning rate of group 0 to 1.2960e-05.
Epoch    52: reducing learning rate of group 0 to 7.7760e-06.
Epoch    60: reducing learning rate of group 0 to 4.6656e-06.
Epoch    66: reducing learning rate of group 0 to 2.7994e-06.
Epoch    71: reducing learning rate of group 0 to 1.6796e-06.
Epoch    76: reducing learning rate of group 0 to 1.0078e-06.
Epoch    82: reducing learning rate of group 0 to 6.0466e-07.
>> Reached max different lrs ([0.0001, 6e-05, 3.6e-05, 2.16e-05, 1.296e-05, 7.776e-06, 4.6656e-06, 2.79936e-06, 1.679616e-06, 1.0077696e-06])



>> Training completed in 46m 37s
>> Best validation accuracy: ~98.900%
>> Copy last model
>> Load best model
>> Save last state
CPU times: user 39min 2s, sys: 7min 31s, total: 46min 33s
Wall time: 46min 38s


In [5]:
# Current memory status, printed again after training so it can be compared
# with the pre-training report
print("\nMemory report")
memory_report()
print("\nMax memory stats")
max_memory_stats(device)



Memory report
Main process PID: 12618
CPU RAM free: 53.0 GB | Proc. size: 2.5 GB
GPU 0 (GeForce GTX 1080 Ti)
GPU RAM free: 237 MB | Used: 10933 MB | Util.: 98% | Total: 11170 MB
GPU 1 (GeForce GTX 1080 Ti)
GPU RAM free: 6591 MB | Used: 4587 MB | Util.: 41% | Total: 11178 MB

Max memory stats
Device: 'cuda:0'
Max memory allocated: 1.9 GB
Max memory reserved:  4.2 GB


In [6]:
# Final result: best validation metric, the epoch it was reached at, and the
# layer listing of the corresponding model
result_line = f"\nBest ACC {best_acc} reached at epoch {best_epoch}"
print(result_line)
print(best_model)



Best ACC 0.9889955145518137 reached at epoch 78
CuratorNet(
  (embedding): Embedding(13297, 4096)
  (selu_common1): Linear(in_features=4096, out_features=200, bias=True)
  (selu_common2): Linear(in_features=200, out_features=200, bias=True)
  (maxpool): AdaptiveMaxPool2d(output_size=(1, 200))
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 200))
  (selu_pu1): Linear(in_features=400, out_features=300, bias=True)
  (selu_pu2): Linear(in_features=300, out_features=300, bias=True)
  (selu_pu3): Linear(in_features=300, out_features=200, bias=True)
)
