# Training

## Libraries

In [None]:
!pip install git+https://github.com/PatBall1/detectree2.git

In [None]:
import os, shutil, glob, time, json, random, yaml
from datetime import date, datetime
from pathlib import Path
from google.colab import drive
drive.mount('/content/drive')

from detectron2.engine import DefaultPredictor
from detectree2.preprocessing.tiling import tile_data_train, to_traintest_folders, tile_data
from detectree2.models.predict import predict_on_data
from detectree2.models.train import MyTrainer, setup_cfg, register_train_data, remove_registered_data, predictions_on_data, combine_dicts, get_tree_dicts, load_json_arr
from detectree2.models.outputs import project_to_geojson, stitch_crowns, clean_crowns, to_eval_geojson, clean_predictions
from detectree2.models.evaluation import site_f1_score2
from detectron2.utils.visualizer import Visualizer
from detectron2.evaluation.coco_evaluation import instances_to_coco_json

import cv2
import wandb
from PIL import Image
import rasterio
import rioxarray as rxr
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv



In [None]:
# Load environment variables (python-dotenv) and read the data root folder.
load_dotenv()
data_dir = os.getenv('DATA_FOLDER')

# Fail early with a clear message: every path in this notebook is built by
# string concatenation on data_dir, which would otherwise raise an opaque
# TypeError much later when the variable is missing.
if not data_dir:
    raise EnvironmentError(
        "DATA_FOLDER is not set; define it in the environment or in a .env file."
    )

# Guarantee a trailing path separator so that expressions such as
# data_dir + "Cambridge/" always produce a valid path.
data_dir = os.path.join(data_dir, "")

In [None]:
from detectree_addons import *

## Datasets

Assuming that the training datasets have already been transformed into COCO format, it is necessary to set up the parameters for all the folders.

In [None]:
# Root folder of the Cambridge site; all dataset paths are derived from it.
site_path = data_dir + "Cambridge/"

# Tiling parameters
tilename = 'city_center'
tile_width = tile_height = 200
buffer, threshold = 0, 0

### Small training dataset

In [None]:
# Input paths for the small training dataset.
# The training folder is assigned exactly once and points at the dedicated
# "train_small/" folder. A second assignment to the generic "train/" folder
# would make the small and large registered datasets identical.
small_train_dir = site_path + "train_small/"
small_test_dir = site_path + "test/"
small_tiles_dir = site_path + "tiles/"
small_rgb_path = site_path + "rgb/"
small_crown_path = site_path + "crowns/tiles_0.25m_160_20_0_train_crowns.shp"
small_data_name = 'Cambridge_25cm_2017_small'

# read_multiple_rgb is defined outside this cell; its result is indexed below,
# and the first element's CRS is used as the target CRS for the crowns.
small_imgs = read_multiple_rgb(small_rgb_path)

# Read in crowns (then filter by an attribute if required) and reproject them
# to the CRS of the first RGB image.
small_crowns = gpd.read_file(small_crown_path)
small_crowns = small_crowns.to_crs(small_imgs[0].crs.data)

# Register the training folder under small_data_name.
# If the name is already registered (e.g. when re-running this cell), clear it first:
# remove_registered_data(small_data_name)
register_train_data(small_train_dir, small_data_name, val_fold=5)

### Large training dataset

In [None]:
# Input paths for the large training dataset.
# The training folder is assigned exactly once and points at the dedicated
# "train_large/" folder. A second assignment to the generic "train/" folder
# would make the small and large registered datasets identical.
large_train_dir = site_path + "train_large/"
large_test_dir = site_path + "test/"
large_tiles_dir = site_path + "tiles/"
large_rgb_path = site_path + "rgb/"
large_crown_path = site_path + "crowns/tiles_0.25m_160_20_0_train_crowns.shp"
large_data_name = 'Cambridge_25cm_2017_large'

# read_multiple_rgb is defined outside this cell; its result is indexed below,
# and the first element's CRS is used as the target CRS for the crowns.
large_imgs = read_multiple_rgb(large_rgb_path)

# Read in crowns (then filter by an attribute if required) and reproject them
# to the CRS of the first RGB image.
large_crowns = gpd.read_file(large_crown_path)
large_crowns = large_crowns.to_crs(large_imgs[0].crs.data)

# Register the training folder under large_data_name.
# If the name is already registered (e.g. when re-running this cell), clear it first:
# remove_registered_data(large_data_name)
register_train_data(large_train_dir, large_data_name, val_fold=5)

## Training with wandb

Change the parameters below to select the configuration to train: the size of the training dataset and the pre-trained model.

In [None]:
# Selection of the pre-trained model and of the training dataset size.
model_name = 'randresize'  # Key of the pre-trained model to start from
dataset = 'half_dataset'   # 'full_dataset' picks the large dataset, any other value the small one

# Training data name
if dataset == 'full_dataset':
    data_name = large_data_name
else:
    data_name = small_data_name

In [None]:
# Timestamp that makes the output folder of this training session unique.
time_now = datetime.now().strftime("%Y%m%dT%H%M%S")

# Available starting points: a detectron2 model_zoo config ('coco') or
# previously trained weights stored under the data folder.
models = {
    'coco': "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml",
    'paracou': data_dir + "models/220723_withParacouUAV.pth",
    'randresize': data_dir + "models/230103_randresize_full.pth",
}
base_model = models[model_name]

# Output folder: <models folder>/<timestamp>_<model name>/
output_dir = data_dir + "Cambridge/0.25m_160_20_0_models/"
train_out_dir = output_dir + f"{time_now}_{model_name}/"

# Names of the registered train / validation splits
trains = (f"{data_name}_train",)
tests = (f"{data_name}_val",)

# Keyword arguments shared by both branches below.
shared_kwargs = dict(
    eval_period=100,
    out_dir=train_out_dir,
    **params[dataset][model_name],
)

if model_name == 'coco':
    # The model_zoo config is passed as the base model.
    cfg = setup_cfg(base_model, trains, tests, max_iter=3000, **shared_kwargs)
else:
    # The weights file is passed as update_model, with a longer schedule.
    cfg = setup_cfg(update_model=base_model, trains=trains, tests=tests,
                    max_iter=10000, **shared_kwargs)

# Plain-dict version of the config, logged to W&B as run metadata.
cfg_wandb = yaml.safe_load(cfg.dump())

This is the configuration dictionary for the wandb sweep, which uses Bayesian search to maximise the segmentation AP50 metric.

In [None]:
# Hyperparameter sweep configuration (more info in the W&B docs).
# Each entry under 'parameters' gives the sampling distribution and its bounds.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'segm/AP50', 'goal': 'maximize'},
    'parameters': {
        'backbone_freeze_at': dict(distribution='int_uniform', max=4, min=1),
        'base_lr': dict(distribution='uniform', max=0.025, min=0.00025),
        'batch_size_per_image': dict(distribution='int_uniform', max=2048, min=512),
        'dl_num_workers': dict(distribution='int_uniform', max=8, min=1),
        'gamma': dict(distribution='uniform', max=0.3, min=0.05),
        'warmup_iters': dict(distribution='int_uniform', max=200, min=50),
        'weight_decay': dict(distribution='uniform', max=0.1, min=0.001),
    },
}

# Initialize the sweep.
# Running this line will ask you to log into your W&B account.
sweep_id = wandb.sweep(sweep_config, project="detectree2-Cambridge")

This will start a wandb run, assuming you have logged in with `wandb login` and have a project set up.

In [None]:
# Start a W&B run in the project, mirroring TensorBoard logs and recording the
# training configuration as run metadata.
run = wandb.init(
    config=cfg_wandb,                # hyperparameters and run metadata
    project="detectree2-Cambridge",  # project where this run is logged
    sync_tensorboard=True,
)
print(run.name)

This is a wrapper around the trainer and wandb Sweeps. This will perform the training for each iteration.

In [None]:
def run():
    """Train one model for a single W&B sweep trial.

    Starts a W&B run, copies the hyperparameters present in the run's config
    into a private clone of the global ``cfg``, and trains with ``MyTrainer``.
    Without this copy step the global ``cfg`` is used unchanged, so every
    trial of the sweep would train with identical settings.

    Sweep keys that are absent from the run config are skipped, so the
    function also works outside a sweep (it then trains with ``cfg`` as is,
    apart from the per-run output folder).

    Each trial writes to its own sub-folder ``<cfg.OUTPUT_DIR>/<run id>`` so
    that checkpoints and TensorBoard event files of different trials do not
    get mixed up. The run is closed when training ends or fails.
    """
    # Sweep parameter name -> (dotted detectron2 config key, type of the value)
    sweep_to_cfg = {
        'backbone_freeze_at': ("MODEL.BACKBONE.FREEZE_AT", int),
        'base_lr': ("SOLVER.BASE_LR", float),
        'batch_size_per_image': ("MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE", int),
        'dl_num_workers': ("DATALOADER.NUM_WORKERS", int),
        'gamma': ("SOLVER.GAMMA", float),
        'warmup_iters': ("SOLVER.WARMUP_ITERS", int),
        'weight_decay': ("SOLVER.WEIGHT_DECAY", float),
    }

    # The context manager finishes the W&B run even if training raises.
    with wandb.init(
        # set the wandb project where this run will be logged
        project="detectree2-Cambridge",
        sync_tensorboard=True,
        # track hyperparameters and run metadata
        config=cfg_wandb,
    ) as wandb_run:
        # Work on a copy so one trial never leaks settings into the next.
        run_cfg = cfg.clone()
        run_cfg.defrost()

        # Collect the values sampled by the sweep as a flat [key, value, ...] list.
        overrides = []
        for sweep_key, (cfg_key, cast) in sweep_to_cfg.items():
            if sweep_key in wandb_run.config:
                overrides += [cfg_key, cast(wandb_run.config[sweep_key])]
        run_cfg.merge_from_list(overrides)

        # Separate output folder per trial.
        run_cfg.OUTPUT_DIR = os.path.join(cfg.OUTPUT_DIR, wandb_run.id)
        os.makedirs(run_cfg.OUTPUT_DIR, exist_ok=True)

        trainer = MyTrainer(run_cfg, patience=5)
        trainer.resume_or_load(resume=False)
        trainer.train()

This will run the wandb sweep. It is the longest cell to run, depending on the configuration of the sweep and the training dataset size.

In [None]:
# Launch the sweep agent: it calls run() once per trial, for at most 100 trials.
wandb.agent(sweep_id, function=run, count=100)

Monitor the performance of the sweep directly on the W&B website, in the sweep report.