In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# !pip install -U "ray[default]"

In [None]:
import os
import glob
import numpy as np
import pandas as pd
from functools import partial

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,random_split

import torchvision
from torchvision import transforms
from torchvision.io import read_image

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
DATA_FOLDER = "/content/drive/MyDrive/Thesis/Experiments/VC-PRG-IMG/"

MODEL_NAME = "CNN"
BATCH_SIZE = 32
NUM_EPOCHS = 50
LEARNING_RATE = 1e-4

In [5]:
class VehicleDataset(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = X
        self.y = y
        self.transform = transform

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        image = read_image(self.X[idx])
        image = image[:3, :, :]
        if self.transform:
            image = self.transform(image)
        label = self.y[idx]
        return image, label

In [6]:
class CNNNetwork(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(32),
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=128, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(128),
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(128),
        )

        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(128),
        )

        self.flatten = nn.Flatten()

        self.linear = nn.Linear(in_features=28800, out_features=15)

        self.dropout = nn.Dropout(0.5)

        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.dropout(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

In [7]:
def get_label(filename):
    label = os.path.basename(filename).replace(".png", "").split("-")[-1]
    return int(label)

files = sorted(glob.glob(os.path.join(DATA_FOLDER, "*.png")))
labels = [get_label(file) for file in files]
df = pd.DataFrame({"filename": files, "label": labels})

temp_X, test_X, temp_y, test_y = train_test_split(df['filename'], df['label'], test_size=0.1, random_state=42)
temp_X.reset_index(drop=True, inplace=True)
temp_y.reset_index(drop=True, inplace=True)
test_X.reset_index(drop=True, inplace=True)
test_y.reset_index(drop=True, inplace=True)

train_X, val_X, train_y, val_y = train_test_split(temp_X, temp_y, test_size=0.1, random_state=42)
train_X.reset_index(drop=True, inplace=True)
train_y.reset_index(drop=True, inplace=True)
val_X.reset_index(drop=True, inplace=True)
val_y.reset_index(drop=True, inplace=True)

In [8]:
data_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

In [9]:
# train_dataset = VehicleDataset(X_train, y_train, transform=data_transforms)
# val_dataset = VehicleDataset(X_val, y_val, transform=data_transforms)
# test_dataset = VehicleDataset(X_test, y_test, transform=data_transforms)

# train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
# val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [10]:
def train(config, checkpoint_dir=None, data_dir=None):
  model = CNNNetwork()

  device = "cuda" if torch.cuda.is_available() else "cpu"

  model.to(device)

  loss_fn = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=config["lr"], weight_decay=config["wd"])

  if checkpoint_dir:
    model_state, optimizer_state = torch.load(os.path.join(checkpoint_dir, "checkpoint"))
    model.load_state_dict(model_state)
    optimizer.load_state_dict(optimizer_state)

  train_dataset = VehicleDataset(train_X, train_y, transform=data_transforms)
  val_dataset = VehicleDataset(val_X, val_y, transform=data_transforms)

  train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
  val_dataloader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=True)

  for epoch in range(NUM_EPOCHS):

    # Training
    train_running_loss = 0.0

    for idx, (X_train, y_train_trues) in enumerate(train_dataloader, 0):
      X_train, y_train_trues = X_train.to(device), y_train_trues.to(device)
      
      # Zero the gradients paramter
      optimizer.zero_grad()

      # Forward
      y_train_preds = model(X_train)
      train_loss = loss_fn(y_train_preds, y_train_trues)
      # Backward
      train_loss.backward()
      # Optimize
      optimizer.step()

      train_running_loss += train_loss.item()

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}]\t|\tTrain Loss: {train_running_loss/len(train_dataloader):.5f}\t|")

    # Validation
    val_running_loss = 0.0
    val_steps = 0
    total = 0
    correct = 0

    for idx, (X_val, y_val_trues) in enumerate(val_dataloader, 0):
      with torch.no_grad():
        X_val, y_val_trues = X_val.to(device), y_val_trues.to(device)

        y_val_preds = model(X_val)
        _, predicted = torch.max(y_val_preds.data, 1)
        total += y_val_trues.size(0)
        correct += (predicted == y_val_trues).sum().item()

        val_loss = loss_fn(y_val_preds, y_val_trues)
        val_running_loss += val_loss.item()
        val_steps += 1

    with tune.checkpoint_dir(epoch) as checkpoint_dir:
      path = os.path.join(checkpoint_dir, "checkpoint")
      torch.save((model.state_dict(), optimizer.state_dict()), path)
    
    tune.report(loss=(val_running_loss / val_steps), accuracy=correct / total)
  print("Finished Traiing")  

In [11]:
def test_accuracy(model, device="cpu"):
  test_dataset = VehicleDataset(test_X, test_y, transform=data_transforms)
  test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

  correct = 0
  total = 0
  with torch.no_grad():
    for X_test, y_test_trues in test_dataloader:
        X_test, y_test_trues = X_test.to(device), y_test_trues.to(device)

        y_test_preds = model(X_test)
        _, predicted = torch.max(y_test_preds.data, 1)
        total += y_test_trues.size(0)
        correct += (predicted == y_test_trues).sum().item()
  return correct / total

In [None]:
data_dir = os.path.abspath("./data")
checkpoint_dir = os.path.abspath("./checkpoint")

config = {
    "lr": tune.loguniform(1e-5, 1e-3),
    "batch_size": tune.choice([32, 64, 128]),
    "wd": tune.choice([0, 0.01, 0.05, 0.025]),
}

scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=NUM_EPOCHS,
    grace_period=1,
    reduction_factor=2
)

reporter = CLIReporter(
    metric_columns=["loss", "accuracy", "training_iteration"]
)

result = tune.run(
    partial(train, data_dir=data_dir, checkpoint_dir=checkpoint_dir),
    config=config,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter
)

2023-02-13 16:59:45,135	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m

from ray.air import session

def train(config):
    # ...
    session.report({"metric": metric}, checkpoint=checkpoint)

For more information please see https://docs.ray.io/en/master/tune/api_docs/trainable.html

2023-02-13 16:59:46,460	INFO tensorboardx.py:170 -- pip install "ray[tune]" to see TensorBoard files.


== Status ==
Current time: 2023-02-13 16:59:47 (running for 00:00:00.61)
Memory usage on this node: 2.3/12.7 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 1.0/2 CPUs, 0/0 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects
Result logdir: /root/ray_results/train_2023-02-13_16-59-46
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+-------------------+----------+-------------------+--------------+-------------+-------+
| Trial name        | status   | loc               |   batch_size |          lr |    wd |
|-------------------+----------+-------------------+--------------+-------------+-------|
| train_d1c1d_00000 | RUNNING  | 172.28.0.12:24870 |          128 | 0.000376744 | 0.05  |
| train_d1c1d_00001 | PENDING  |                   |          128 | 1.55631e-05 | 0.05  |
| train_d1c1d_00002 | PENDING  |                   |           64 | 1.57299e-05 | 0     

Trial name,accuracy,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,loss,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_d1c1d_00000,0.5,2023-02-13_17-11-31,False,,91aee05241f74c3aa9611cf5bd6f0df6,f0eac567a067,6,2.30472,172.28.0.12,24870,True,701.396,116.153,701.396,1676308291,0,,6,d1c1d_00000,0.00442886
train_d1c1d_00001,0.236842,2023-02-13_17-01-56,True,,2e6477526b5e4804a2d978ba1f9e7316,f0eac567a067,1,2.65228,172.28.0.12,24920,True,119.876,119.876,119.876,1676307716,0,,1,d1c1d_00001,0.018635
train_d1c1d_00002,0.157895,2023-02-13_17-03-52,True,,2e6477526b5e4804a2d978ba1f9e7316,f0eac567a067,1,2.61345,172.28.0.12,24920,True,116.126,116.126,116.126,1676307832,0,,1,d1c1d_00002,0.018635
train_d1c1d_00003,0.368421,2023-02-13_17-07-42,True,,2e6477526b5e4804a2d978ba1f9e7316,f0eac567a067,2,2.50695,172.28.0.12,24920,True,229.786,115.993,229.786,1676308062,0,,2,d1c1d_00003,0.018635
train_d1c1d_00004,0.157895,2023-02-13_17-09-36,True,,2e6477526b5e4804a2d978ba1f9e7316,f0eac567a067,1,2.61744,172.28.0.12,24920,True,113.732,113.732,113.732,1676308176,0,,1,d1c1d_00004,0.018635
train_d1c1d_00005,0.236842,2023-02-13_17-11-34,False,,2e6477526b5e4804a2d978ba1f9e7316,f0eac567a067,1,2.57637,172.28.0.12,24920,True,118.201,118.201,118.201,1676308294,0,,1,d1c1d_00005,0.018635


== Status ==
Current time: 2023-02-13 17:02:01 (running for 00:02:15.03)
Memory usage on this node: 6.2/12.7 GiB 
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: -2.5916587114334106
Resources requested: 2.0/2 CPUs, 0/0 GPUs, 0.0/7.36 GiB heap, 0.0/3.68 GiB objects
Result logdir: /root/ray_results/train_2023-02-13_16-59-46
Number of trials: 10/10 (7 PENDING, 2 RUNNING, 1 TERMINATED)
+-------------------+------------+-------------------+--------------+-------------+-------+---------+------------+----------------------+
| Trial name        | status     | loc               |   batch_size |          lr |    wd |    loss |   accuracy |   training_iteration |
|-------------------+------------+-------------------+--------------+-------------+-------+---------+------------+----------------------|
| train_d1c1d_00000 | RUNNING    | 172.28.0.12:24870 |          128 | 0.000376744 | 0.05  | 2.5

In [None]:
best_trial = result.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))

In [None]:
best_trained_model = CNNNetwork()
best_checkpoint_dir = best_trial.checkpoint.value
model_state, optimizer_state = torch.load(os.path.join(best_checkpoint_dir, "checkpoint"))
best_trained_model.load_state_dict(model_state)

test_acc = test_accuracy(best_trained_model, "cuda")
print("Best trial test set accuracy: {}".format(test_acc))