In [1]:
import os
import json
import argparse
import torch
import random
import numpy as np
from pathlib import Path
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
from model.FFTRadNet import FFTRadNet
from dataset.dataset import RADIal
from dataset.encoder import ra_encoder
from dataset.dataloader import CreateDataLoaders
from model.Efficientnet_SEG import EfficientNetEnc_Seg

import pkbar
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.nn.functional as F
from loss import pixor_loss
from utils.evaluation import run_evaluation
import torch.nn as nn
import torch
torch.cuda.empty_cache()

parser = argparse.ArgumentParser(description='FFTRadNet Training')
parser.add_argument('-f')
parser.add_argument('-c', '--config', default='config/config_FFTRadNet_192_56-Seg.json',type=str,                            help='Path to the config file (default: config.json)')
parser.add_argument('-r', '--resume', default=None, type=str,
                            help='Path to the .pth model checkpoint to resume training')

args = parser.parse_args()
config = json.load(open(args.config))
resume=args.resume

  from .autonotebook import tqdm as notebook_tqdm


In [None]:


# Setup random seed
torch.manual_seed(config['seed'])
np.random.seed(config['seed'])
random.seed(config['seed'])
torch.cuda.manual_seed(config['seed'])


# create experience name
curr_date = datetime.now()
exp_name = config['name'] + '___' + curr_date.strftime('%b-%d-%Y___%H:%M:%S')
print(exp_name)

# Create directory structure
output_folder = Path("Efficientnet_SEG_seq")
output_folder.mkdir(parents=True, exist_ok=True)
(output_folder / exp_name).mkdir(parents=True, exist_ok=True)
# and copy the config file
with open(output_folder / exp_name / 'config.json', 'w') as outfile:
    json.dump(config, outfile)

# set device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Initialize tensorboard
writer = SummaryWriter(output_folder / exp_name)

# Load the dataset
enc = ra_encoder(geometry = config['dataset']['geometry'], 
                    statistics = config['dataset']['statistics'],
                    regression_layer = 2)

dataset = RADIal(root_dir = config['dataset']['root_dir'],
                    statistics= config['dataset']['statistics'],
                    encoder=enc.encode,
                    difficult=True)

train_loader, val_loader, test_loader = CreateDataLoaders(dataset,config['dataloader'],config['seed'])


# Create the model
# net = FFTRadNet(blocks = config['model']['backbone_block'],
#                     mimo_layer  = config['model']['MIMO_output'],
#                     channels = config['model']['channels'], 
#                     regression_layer = 2, 
#                     detection_head = config['model']['DetectionHead'], 
#                     segmentation_head = config['model']['SegmentationHead'])
net = EfficientNetEnc_Seg(n_channels=32, n_classes=1, segmentation_head=True)


net.to('cuda')


# Optimizer
lr = float(config['optimizer']['lr'])
step_size = int(config['lr_scheduler']['step_size'])
gamma = float(config['lr_scheduler']['gamma'])
optimizer = optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

# num_epochs=int(config['num_epochs'])
num_epochs=1


print('===========  Optimizer  ==================:')
print('      LR:', lr)
print('      step_size:', step_size)
print('      gamma:', gamma)
print('      num_epochs:', num_epochs)
print('')

# Train
startEpoch = 0
global_step = 0
history = {'train_loss':[],'val_loss':[],'lr':[],'mIoU':[]}
best_mAP = 0

freespace_loss = nn.BCEWithLogitsLoss(reduction='mean')

# resume="Segnet_SEG_seq/FFTRadNet_RA_192_56___May-24-2024___13:24:13/FFTRadNet_RA_192_56_epoch96_loss_157948.0619_IOU_0.6326.pth"
# if resume:
#     print('===========  Resume training  ==================:')
#     dict = torch.load(resume)
#     net.load_state_dict(dict['net_state_dict'])
#     optimizer.load_state_dict(dict['optimizer'])
#     scheduler.load_state_dict(dict['scheduler'])
#     startEpoch = dict['epoch']+1
#     history = dict['history']
#     global_step = dict['global_step']

#     print('       ... Start at epoch:',startEpoch)


for epoch in range(startEpoch,100):

    kbar = pkbar.Kbar(target=len(train_loader), epoch=epoch, num_epochs=num_epochs, width=20, always_stateful=False)

    ###################http://10.9.238.92:8080/
    ## Training loop ##
    ###################
    net.train()
    running_loss = 0.0

    for i, data in enumerate(train_loader):
        inputs = data[0].to('cuda').float()
        if(config['model']['SegmentationHead']=='True'):
            seg_map_label = data[2].to('cuda').double()

        # reset the gradient
        optimizer.zero_grad()

        # forward pass, enable to track our gradient
        with torch.set_grad_enabled(True):
            outputs = net(inputs)



        prediction = outputs['Segmentation'].contiguous().flatten()
        label = seg_map_label.contiguous().flatten()        
        loss_seg = freespace_loss(prediction, label)
        loss_seg *= inputs.size(0)

        loss_seg *=config['losses']['weight'][2]


        loss =  loss_seg

        writer.add_scalar('Loss/train', loss.item(), global_step)
        writer.add_scalar('Loss/train_freespace', loss_seg.item(), global_step)

        # backprop
        loss.backward()
        optimizer.step()

        # statistics
        running_loss += loss.item() * inputs.size(0)

        kbar.update(i, values=[("loss", loss.item()),("freeSpace", loss_seg.item())])


        global_step += 1


    scheduler.step()

    history['train_loss'].append(running_loss / len(train_loader.dataset))
    history['lr'].append(scheduler.get_last_lr()[0])


    ######################
    ## validation phase ##
    ######################

    eval = run_evaluation(net,val_loader,enc,check_perf=(epoch>=10),
                            detection_loss=None,segmentation_loss=freespace_loss,
                            losses_params=config['losses'])

    history['val_loss'].append(eval['loss'])

    history['mIoU'].append(eval['mIoU'])

    kbar.add(1, values=[("val_loss", eval['loss']),("mIoU", eval['mIoU'])])


    writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step)
    writer.add_scalar('Loss/test', eval['loss'], global_step)
    writer.add_scalar('Metrics/mIoU', eval['mIoU'], global_step)

    # Saving all checkpoint as the best checkpoint for multi-task is a balance between both --> up to the user to decide
    name_output_file = config['name']+'_epoch{:02d}_loss_{:.4f}_IOU_{:.4f}.pth'.format(epoch, eval['loss'],eval['mIoU'])
    filename = output_folder / exp_name / name_output_file

    checkpoint={}
    checkpoint['net_state_dict'] = net.state_dict()
    checkpoint['optimizer'] = optimizer.state_dict()
    checkpoint['scheduler'] = scheduler.state_dict()
    checkpoint['epoch'] = epoch
    checkpoint['history'] = history
    checkpoint['global_step'] = global_step

    torch.save(checkpoint,filename)

    print('')




FFTRadNet_RA_192_56___Jun-08-2024___21:19:45
      Mode: sequence
      Training: 6230
      Validation: 986
      Test: 1035

Loaded pretrained weights for efficientnet-b2
      LR: 0.0001
      step_size: 10
      gamma: 0.9
      num_epochs: 1

Epoch: 1/1

Epoch: 2/1

Epoch: 3/1

Epoch: 4/1

Epoch: 5/1

Epoch: 6/1

Epoch: 7/1

Epoch: 8/1

Epoch: 9/1

Epoch: 10/1

Epoch: 11/1

Epoch: 12/1

Epoch: 13/1

Epoch: 14/1

Epoch: 15/1

Epoch: 16/1

Epoch: 17/1

Epoch: 18/1

Epoch: 19/1

Epoch: 20/1

Epoch: 21/1

Epoch: 22/1

Epoch: 23/1

Epoch: 24/1

Epoch: 25/1

Epoch: 26/1

Epoch: 27/1

Epoch: 28/1

Epoch: 29/1

Epoch: 30/1

Epoch: 31/1

Epoch: 32/1

Epoch: 33/1

Epoch: 34/1

Epoch: 35/1

Epoch: 36/1

Epoch: 37/1

Epoch: 38/1

Epoch: 39/1

Epoch: 40/1

Epoch: 41/1

Epoch: 42/1

Epoch: 43/1

Epoch: 44/1

Epoch: 45/1

Epoch: 46/1

Epoch: 47/1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 49/1

Epoch: 50/1

Epoch: 51/1
 443/1558 [====>...............] - ETA: 9:06 - loss: 7.1457 - freeSpace: 7.1457

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 55/1

Epoch: 56/1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 60/1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 64/1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 67/1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 70/1

Epoch: 71/1
 145/1558 [>...................] - ETA: 11:20 - loss: 6.0509 - freeSpace: 6.0509

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 73/1

Epoch: 74/1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 77/1

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




Epoch: 81/1
  64/1558 [....................] - ETA: 12:33 - loss: 5.6954 - freeSpace: 5.6954

In [None]:
# print('===========  Resume training  ==================:')
# dict = torch.load("../output_training-Seg/FFTRadNet_RA_192_56___Jan-31-2024___21:56:34/FFTRadNet_RA_192_56_epoch78_loss_94434.5930_IOU_0.6623.pth")
# net.load_state_dict(dict['net_state_dict'])
# optimizer.load_state_dict(dict['optimizer'])
# scheduler.load_state_dict(dict['scheduler'])
# startEpoch = dict['epoch']+1
# history = dict['history']
# global_step = dict['global_step']

# print('       ... Start at epoch:',startEpoch)