In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import h5py
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb

from dotenv import load_dotenv, find_dotenv
from pathlib import Path
from torch.utils.data import DataLoader, random_split
from torch.cuda.amp import autocast, GradScaler

from src.data import FimacDataset

# Run on the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Locate the .env file by walking up the directory tree; the folder that
# contains it is used as the project root for building data paths.
dotenv_path = find_dotenv()
project_dir = Path(dotenv_path).parent

# Export the .env entries as environment variables. Kept as the cell's last
# expression so its return value is displayed.
load_dotenv(dotenv_path)

True

# Data loading

In [3]:
dataset = FimacDataset(project_dir/'data/interim/renders.hdf5')

# Fraction of the samples used for training; the remainder is held out for
# validation. (Previously the 0.8 share was handed to the validation split,
# leaving only 20% of the data for training.)
split = 0.8
train_size = int(len(dataset) * split)
val_size = len(dataset) - train_size

# A seeded generator keeps the train/validation membership identical across
# kernel restarts, so runs logged from this notebook are comparable.
train_data, val_data = random_split(
    dataset,
    (train_size, val_size),
    generator=torch.Generator().manual_seed(42),
)

# Only the training loader shuffles; validation order does not matter.
train_dataloader = DataLoader(train_data, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=32)

# Network definition

In [4]:
from src.models import TestNet

# Build the model and move its parameters to the selected device.
net = TestNet()
net = net.to(device)

# Last expression of the cell: displays the module's layer summary.
net

TestNet(
  (conv1): Conv2d(2, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (dropout1): Dropout2d(p=0.25, inplace=False)
  (fc1): Linear(in_features=1048576, out_features=128, bias=True)
  (dropout2): Dropout2d(p=0.5, inplace=False)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

# Training loop

In [5]:
def int_MAE(y, y_hat):
    """Mean absolute error between integer targets and integer-rounded predictions.

    Predictions are rounded to the nearest integer before comparison
    (``torch.round`` rounds exact halves to the nearest even integer).
    Casting directly to an integer type would truncate toward zero, so a
    prediction of 2.9 for a target of 3 would wrongly count as an error of 1.

    Args:
        y: Tensor of target values; cast to 32-bit integers.
        y_hat: Tensor of predicted values with a shape compatible with ``y``.

    Returns:
        The mean absolute integer error as a Python float.
    """
    # Work on detached CPU copies so the metric never touches the autograd
    # graph, whatever device the inputs live on.
    target = y.detach().cpu().to(torch.int32)

    # Upcast to float32 before rounding so half-precision predictions
    # (e.g. produced under autocast) are handled the same way.
    pred = y_hat.detach().cpu().float().round().to(torch.int32)

    err = (target - pred).abs()
    return err.float().mean().item()

In [7]:
epochs = 5

lr = 0.001
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
criterion = nn.L1Loss()

# Mixed precision is only meaningful on CUDA. On a CPU-only machine the
# autocast context and the gradient scaler are switched off explicitly.
use_amp = device.type == "cuda"
scaler = GradScaler(enabled=use_amp)

wandb.init(
    project="part-counting",
    entity="brunompac",
    config={
        "learning_rate": lr,
        "epochs": epochs,
        "batch_size": train_dataloader.batch_size,
        "model": type(net).__name__,
        # NOTE(review): the key is misspelled ("optmizer"); left unchanged so
        # the config stays comparable with any runs already logged.
        "optmizer": type(optimizer).__name__,
        "loss_func": type(criterion).__name__,
})

wandb.watch(net)

for epoch in range(epochs):
    # train pass
    net.train()
    train_loss = 0.0
    n_train = 0
    with torch.set_grad_enabled(True):
        for X, y in train_dataloader:
            X = X.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                y_hat = net(X)
                # reshape(-1) instead of squeeze(): squeeze() would collapse a
                # final batch of size 1 to a 0-dim tensor.
                loss = criterion(y_hat.reshape(-1), y.type(torch.float32))

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # criterion returns the batch mean, so weight it by the batch size
            # to accumulate a per-sample sum (normalised after the loop).
            train_loss += loss.item() * len(y)
            n_train += len(y)

    # validation pass (on the held-out loader, not the training data)
    net.eval()
    val_loss = 0.0
    val_MAE = 0.0
    n_val = 0
    with torch.set_grad_enabled(False):
        for X, y in val_dataloader:
            X = X.to(device)
            y = y.to(device)

            with autocast(enabled=use_amp):
                y_hat = net(X)
                loss = criterion(y_hat.reshape(-1), y.type(torch.float32))

            val_loss += loss.item() * len(y)
            val_MAE += int_MAE(y, y_hat.reshape(-1)) * len(y)
            n_val += len(y)

    # Per-sample averages, so the train and validation numbers are comparable
    # despite the different batch sizes. max(..., 1) guards an empty loader.
    # All metrics for the epoch are logged in a single committed step.
    wandb.log({
        "train_loss": train_loss / max(n_train, 1),
        "val_loss": val_loss / max(n_val, 1),
        "val_MAE": val_MAE / max(n_val, 1),
    }, step=epoch, commit=True)

[34m[1mwandb[0m: Currently logged in as: [33mbrunompac[0m (use `wandb login --relogin` to force relogin)
