In [1]:
import os
import torch
import numpy as np
from tqdm import tqdm

from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data.dataloader import DataLoader

from preprocessor import AgeRecognitionPreprocessor
from dataset import AgeRecognitionDataset
from models import vit_l_16_age_recognizer, vit_b_16_age_recognizer, resent101_age_recogniser
from loss import AgeRecognitionLoss

In [2]:
lr = 1e-5

IMAGE_DIR = './Cleaned/'
VARIANT = 'resnet101'
TRAINING_PAIRINGS = './training_data.csv'
BATCH_SIZE = 24
EPOCHES = 1000
DEVICE = 'cuda'

In [3]:
model = resent101_age_recogniser().to(DEVICE)
loss_function = AgeRecognitionLoss().to(DEVICE)
# loss_function.importance.requires_grad = False
preprocessor = AgeRecognitionPreprocessor()
dataset = AgeRecognitionDataset(triplet_csv_path=TRAINING_PAIRINGS, image_dir=IMAGE_DIR, preprocessor=preprocessor, kfolds=1, device=DEVICE)

In [4]:
optimizer = Adam(list(model.parameters()) + list(loss_function.parameters()), lr=lr)
scheduler = CosineAnnealingLR(optimizer=optimizer, T_max=EPOCHES, eta_min=1e-7)

In [5]:
if os.path.exists(f'./Checkpoint/{VARIANT}/best.pt'):
    best_state = torch.load(f'./Checkpoint/{VARIANT}/best.pt')
    model.load_state_dict(best_state['model_state_dict'])
    loss_function.load_state_dict(best_state['loss_state_dict'])

In [6]:
torch.manual_seed(3407)
record = []
model.train()
for epoch in range(EPOCHES):
    for fold in range(dataset.kfolds):
        # training_dataset, validation_dataset = dataset.kfold_cross_validation(fold)
        training_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
        # validation_dataloader = DataLoader(validation_dataset, batch_size=1, shuffle=True, num_workers=4)

        index = 0
        for batch in training_dataloader:
            optimizer.zero_grad()
            # Batch shape: (N, Anchor-Positive-Negative, C, H, W)
            predictions = model.forward_features(batch)
            training_loss = loss_function(predictions)
            training_loss.backward()    
            print(f"Training loss for batch {index} : {training_loss}")
            record.append(training_loss)
            index += 1
            optimizer.step()
        print(loss_function.regularizing_strength)
        scheduler.step()

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'loss_state_dict': loss_function.state_dict(),
        'loss': training_loss,
        }, f'./Checkpoint/model_{epoch}.pt')



Training loss for batch 0 : 1.0000100135803223
Training loss for batch 1 : 1.0000100135803223
Training loss for batch 2 : 1.0000100135803223
Training loss for batch 3 : 1.0000098943710327
Training loss for batch 4 : 1.0000100135803223
Training loss for batch 5 : 1.0000100135803223
Training loss for batch 6 : 1.0000100135803223
Training loss for batch 7 : 1.0000100135803223
Training loss for batch 8 : 1.0000100135803223
Training loss for batch 9 : 1.0000100135803223
Training loss for batch 10 : 1.0000098943710327
Training loss for batch 11 : 1.0000100135803223
Training loss for batch 12 : 0.10966742783784866
Training loss for batch 13 : 0.04650833457708359
Training loss for batch 14 : 0.19805079698562622
Training loss for batch 15 : 0.20085152983665466
Training loss for batch 16 : 0.21650613844394684
Training loss for batch 17 : 0.03596194460988045
Training loss for batch 18 : 0.1458388715982437
Training loss for batch 19 : 0.13189668953418732
Training loss for batch 20 : 0.056037414819

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

record = [x.cpu().detach() for x in record]
plt.plot(record)

NameError: name 'record' is not defined

In [None]:
TEST_PAIRINGS = './test_data.csv'
state_dict = torch.load('./Checkpoint/model_1.pt')
model.load_state_dict(state_dict['model_state_dict'])
testset = AgeRecognitionDataset(triplet_csv_path=TEST_PAIRINGS, image_dir=IMAGE_DIR, preprocessor=preprocessor, kfolds=5, device=DEVICE)
testloader  = DataLoader(testset, batch_size=1)

model.eval()
test_loss = 0
for batch in tqdm(testloader):
    # Batch shape: (N, Anchor-Positive-Negative, C, H, W)
    predictions = model.forward_features(batch)
    test_loss += loss_function(predictions)
test_loss = test_loss * 8 / len(validation_dataloader) 
print(f"Average test loss : {test_loss}")

  0%|          | 29/49828 [00:02<58:03, 14.30it/s] 


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 11.99 GiB total capacity; 10.68 GiB already allocated; 0 bytes free; 11.18 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF