# Predictions

## Libraries

In [None]:
# Install detectree2 directly from its GitHub repository (IPython shell command).
!pip install git+https://github.com/PatBall1/detectree2.git

In [None]:
import os, shutil, glob, time, json, random, yaml
from datetime import date, datetime
from pathlib import Path
from google.colab import drive
drive.mount('/content/drive')

from detectron2.engine import DefaultPredictor
from detectree2.preprocessing.tiling import tile_data_train, to_traintest_folders, tile_data
from detectree2.models.predict import predict_on_data
from detectree2.models.train import MyTrainer, setup_cfg, register_train_data, remove_registered_data, predictions_on_data, combine_dicts, get_tree_dicts, load_json_arr
from detectree2.models.outputs import project_to_geojson, stitch_crowns, clean_crowns, to_eval_geojson, clean_predictions
from detectree2.models.evaluation import site_f1_score2
from detectron2.utils.visualizer import Visualizer
from detectron2.evaluation.coco_evaluation import instances_to_coco_json

import cv2
import wandb
from PIL import Image
import rasterio
import rioxarray as rxr
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv



In [None]:
# Read the root data folder from the DATA_FOLDER environment variable
# (load_dotenv() lets it be defined in a local .env file).
load_dotenv()
data_dir = os.getenv('DATA_FOLDER')
if not data_dir:
    # Fail here with a clear message instead of a TypeError on the first
    # `data_dir + "..."` concatenation further down.
    raise RuntimeError(
        "DATA_FOLDER is not set; define it in the environment or in a .env file."
    )
# Every path below is built as `data_dir + "sub/folder/"`, so make sure the
# root ends with a path separator (a value that already has one is unchanged).
data_dir = os.path.join(data_dir, "")

In [None]:
from detectree_addons import *

## Convert to COCO format

The images in all folders need to be converted to the COCO format using detectree2 built-in methods. `prepare_tiled_data_train` and `to_traintest_folders` must be run only once.

In [None]:
# Root folder of the Cambridge site; the dataset paths below are built from it.
site_path = data_dir + "Cambridge/"

# Tiling parameters shared by the tiling cells below.
tilename = 'city_center'
tile_width = tile_height = 200
buffer = threshold = 0

### Small training dataset

In [None]:
# Input/output locations for the small training dataset.
# NOTE(review): these paths are identical to the ones used for the large
# dataset (same crowns file, tiles folder and train folder) -- confirm the two
# datasets are really meant to share them.
small_crown_path = site_path + "crowns/tiles_0.25m_160_20_0_train_crowns.shp"
small_rgb_path = site_path + "rgb/"
small_data_name = 'Cambridge_25cm_2017_small'
small_tiles_dir = site_path + "tiles/"
# Single assignment: a previous "train_small/" value was overwritten by this
# one before ever being used, so only the effective value is kept.
small_train_dir = site_path + "train/"
small_test_dir = site_path + "test/"

# Load the RGB rasters found in the rgb folder.
small_imgs = read_multiple_rgb(small_rgb_path)

# Read in crowns (then filter by an attribute if required) and reproject them
# to the CRS of the first image.
small_crowns = gpd.read_file(small_crown_path)
small_crowns = small_crowns.to_crs(small_imgs[0].crs.data)

# Uncomment to drop an earlier registration before registering again
# (presumably needed when this cell is re-run in the same session).
# remove_registered_data(small_data_name)
register_train_data(small_train_dir, small_data_name, val_fold=5)

In [None]:
# Run only once: cut the small dataset's images into training tiles, then
# distribute the tiles into train/test folders under site_path.
prepare_tiled_data_train(
    small_imgs,
    small_tiles_dir,
    tilename=tilename,
    buffer=buffer,
    tile_size=tile_width,
    crowns=small_crowns,
    threshold=threshold,
    dtype_bool=True,
)
to_traintest_folders(small_tiles_dir, site_path, test_frac=0.1, folds=5)

### Large training dataset

In [None]:
# Input/output locations for the large training dataset.
# NOTE(review): these paths are identical to the ones used for the small
# dataset (same crowns file, tiles folder and train folder) -- confirm the two
# datasets are really meant to share them.
large_crown_path = site_path + "crowns/tiles_0.25m_160_20_0_train_crowns.shp"
large_rgb_path = site_path + "rgb/"
large_data_name = 'Cambridge_25cm_2017_large'
large_tiles_dir = site_path + "tiles/"
# Single assignment: a previous "train_large/" value was overwritten by this
# one before ever being used, so only the effective value is kept.
large_train_dir = site_path + "train/"
large_test_dir = site_path + "test/"

# Load the RGB rasters found in the rgb folder.
large_imgs = read_multiple_rgb(large_rgb_path)

# Read in crowns (then filter by an attribute if required) and reproject them
# to the CRS of the first image.
large_crowns = gpd.read_file(large_crown_path)
large_crowns = large_crowns.to_crs(large_imgs[0].crs.data)

# Uncomment to drop an earlier registration before registering again
# (presumably needed when this cell is re-run in the same session).
# remove_registered_data(large_data_name)
register_train_data(large_train_dir, large_data_name, val_fold=5)

In [None]:
# Run only once: cut the large dataset's images into training tiles, then
# distribute the tiles into train/test folders under site_path.
prepare_tiled_data_train(
    large_imgs,
    large_tiles_dir,
    tilename=tilename,
    buffer=buffer,
    tile_size=tile_width,
    crowns=large_crowns,
    threshold=threshold,
    dtype_bool=True,
)
to_traintest_folders(large_tiles_dir, site_path, test_frac=0.1, folds=5)

### Testing dataset

In [None]:
# Input/output locations for the testing dataset.
test_dir = site_path + "test_large/"
test_crown_path = site_path + "crowns/tiles_0.25m_160_20_0_test_crowns.shp"
test_rgb_path = site_path + "rgb/"
# NOTE(review): this is the same "tiles/" folder the training cells write to --
# confirm test tiles are meant to share it.
test_tiles_dir = site_path + "tiles/"

# Read the imagery from this cell's own path so the cell does not depend on
# the large-dataset cell having been run first.
test_imgs = read_multiple_rgb(test_rgb_path)

# Read in crowns (then filter by an attribute if required) and reproject the
# *test* crowns to the CRS of the first image. Reprojecting the training
# crowns here would silently discard the test crowns that were just loaded.
test_crowns = gpd.read_file(test_crown_path)
test_crowns = test_crowns.to_crs(test_imgs[0].crs.data)

In [None]:
# RUN ONLY ONCE
# Cut the test images into tiles, using the test crowns as annotations.
prepare_tiled_data_train(
    test_imgs,
    test_tiles_dir,
    tilename=tilename,
    buffer=buffer,
    tile_size=tile_width,
    crowns=test_crowns,
    threshold=threshold,
    dtype_bool=True,
)

## Re-training best models for each combination

In [None]:
# Hyperparameters of the best run for every (dataset size, base model) pair.
# Outer key: training dataset size; inner key: pre-trained model name.
# Each inner dict is unpacked as keyword arguments into setup_cfg().
params = {
    'half_dataset': {
        'coco': {
            'base_lr': 0.01341,
            'gamma': 0.09478,
            'warm_iter': 182,
            'weight_decay': 0.0353,
            'backbone_freeze': 3,
            'batch_size_per_im': 1969,
            'workers': 1,
        },
        'paracou': {
            'base_lr': 0.01762,
            'gamma': 0.1578,
            'warm_iter': 160,
            'weight_decay': 0.003313,
            'backbone_freeze': 2,
            'batch_size_per_im': 1707,
            'workers': 1,
        },
        'randresize': {
            'base_lr': 0.002454,
            'gamma': 0.0581,
            'warm_iter': 88,
            'weight_decay': 0.09564,
            'backbone_freeze': 2,
            'batch_size_per_im': 650,
            'workers': 2,
        },
    },
    'full_dataset': {
        'coco': {
            'base_lr': 0.005957,
            'gamma': 0.2076,
            'warm_iter': 166,
            'weight_decay': 0.02602,
            'backbone_freeze': 2,
            'batch_size_per_im': 938,
            'workers': 1,
        },
        'paracou': {
            'base_lr': 0.01709,
            'gamma': 0.08866,
            'warm_iter': 184,
            'weight_decay': 0.006519,
            'backbone_freeze': 2,
            'batch_size_per_im': 623,
            'workers': 6,
        },
        'randresize': {
            'base_lr': 0.01609,
            'gamma': 0.1536,
            'warm_iter': 194,
            'weight_decay': 0.09707,
            'backbone_freeze': 4,
            'batch_size_per_im': 1172,
            'workers': 4,
        },
    },
}

Here, it is necessary to change the parameters for the different configurations, for the size of the training dataset and the pre-trained model.

In [None]:
# Choose the configuration to train: both values are keys into `params`.
dataset = 'half_dataset'  # 'half_dataset' or 'full_dataset'
model_name = 'randresize'  # 'coco', 'paracou' or 'randresize'

# Name under which the matching training data was registered.
if dataset == 'full_dataset':
    data_name = large_data_name
else:
    data_name = small_data_name

In [None]:
# Timestamp that gives this training run its own output folder.
time_now = datetime.now().strftime("%Y%m%dT%H%M%S")

# Starting point for each model option: a detectron2 model_zoo config for
# 'coco', or a previously trained weights file for the other two.
models = {
    'coco': "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml",
    'paracou': data_dir + "models/220723_withParacouUAV.pth",
    'randresize': data_dir + "models/230103_randresize_full.pth",
}
base_model = models[model_name]

# All runs live under output_dir; this run writes to its own sub-folder.
output_dir = data_dir + "Cambridge/0.25m_160_20_0_models/"
train_out_dir = output_dir + f"{time_now}_{model_name}/"

# Names of the registered train / validation splits.
trains = (f"{data_name}_train",)
tests = (f"{data_name}_val",)

# Build the config with the best hyperparameters for the selected combination.
if model_name != 'coco':
    # Continue from existing weights, with a longer iteration budget.
    cfg = setup_cfg(
        update_model=base_model,
        trains=trains,
        tests=tests,
        eval_period=100,
        max_iter=10000,
        out_dir=train_out_dir,
        **params[dataset][model_name],
    )
else:
    # Start from the model_zoo base model.
    cfg = setup_cfg(
        base_model,
        trains,
        tests,
        eval_period=100,
        max_iter=3000,
        out_dir=train_out_dir,
        **params[dataset][model_name],
    )

# Plain-dict copy of the config, used as the wandb run config.
cfg_wandb = yaml.safe_load(cfg.dump())

This will start a wandb run, assuming you have logged in with `wandb login` and have a project set up.

In [None]:
# Open the wandb run for this training session:
#   project          -> wandb project the run is logged to
#   sync_tensorboard -> enable wandb's TensorBoard syncing
#   config           -> hyperparameters / run metadata taken from the cfg
run = wandb.init(project="detectree2-Cambridge", sync_tensorboard=True, config=cfg_wandb)
print(run.name)

This is the actual training step, hence the longest cell to run.

In [None]:
# Train with detectree2's trainer (patience=5), starting fresh rather than
# resuming a previous run from the output folder.
trainer = MyTrainer(cfg, patience=5)
trainer.resume_or_load(resume=False)
trainer.train()

# Record which base model was used and when the run was started.
run.log(dict(base_model=model_name, run_date=time_now))

This is further setup for the location of the prediction files.

In [None]:
# Folders of all trained models under output_dir. os.listdir() returns entries
# in arbitrary order, so sort them to make the listing reproducible.
trained_models = sorted(os.listdir(output_dir))
# `model_name` is deliberately NOT reassigned here: it is the key into
# `models` / `params`, and overwriting it with a folder name breaks re-running
# the configuration cell.
# To predict with an earlier run instead of the one just trained, point
# train_out_dir at it, e.g.:
# train_out_dir = output_dir + trained_models[-1] + '/'
# experiment_metrics = load_json_arr(train_out_dir + 'metrics.json')

# Collect the checkpoints written by the trainer and use the most recently
# written one. NOTE(review): confirm that the latest checkpoint is the one
# intended for prediction.
checkpoint_paths = [
    os.path.join(train_out_dir, file_name)
    for file_name in os.listdir(train_out_dir)
    if file_name.endswith('.pth')
]
if not checkpoint_paths:
    raise FileNotFoundError(f"No .pth checkpoint found in {train_out_dir}")
trained_model = max(checkpoint_paths, key=os.path.getmtime)
print(trained_model)

# Output folders for raw predictions and their georeferenced versions.
train_pred_folder = train_out_dir + "train/predictions/"
train_pred_geo_folder = train_out_dir + "train/predictions_geo/"
test_pred_folder = train_out_dir + "test/predictions/"
test_pred_geo_folder = train_out_dir + "test/predictions_geo/"

# set up config
cfg = setup_cfg(update_model=trained_model)

This cell produces the predictions for both the training and testing datasets.

In [None]:
# Tiles of the dataset the model was trained on. `tiles_dir` is not defined
# anywhere else in this notebook, so derive it from the selected dataset the
# same way `data_name` is chosen.
tiles_dir = large_tiles_dir if dataset == 'full_dataset' else small_tiles_dir

# Build the predictor once and reuse it for both prediction passes.
predictor = DefaultPredictor(cfg)
predict_on_data(tiles_dir, predictor, out_dir=train_pred_folder)  # Train predictions (overfit)
predict_on_data(test_tiles_dir, predictor, out_dir=test_pred_folder)  # Test predictions, written to the test folder

# Project the tile-level predictions to GeoJSON files.
project_to_geojson(tiles_dir, train_pred_folder, train_pred_geo_folder)
project_to_geojson(test_tiles_dir, test_pred_folder, test_pred_geo_folder)

This measures the model performance on the testing dataset and logs the results to wandb.

In [None]:
# Score the test predictions against the reference crowns.
# `test_site_path` was never defined in this notebook; the test folder created
# by to_traintest_folders() lives under `site_path`, so that is used here.
# NOTE(review): confirm this is the intended reference folder (`test_dir` is
# also defined in the testing-dataset cell).
test_prec, test_recall, test_f1 = site_f1_score2(
    test_tiles_dir,
    test_directory=site_path + 'test',
    pred_directory=test_pred_geo_folder[:-1],  # strip the trailing "/"
    IoU_threshold=0.5,
    border_filter=[False, 1],
    conf_threshold=0.6,
    area_threshold=20,
)

In [None]:
# Send the test-set scores to wandb and close the run.
test_scores = {
    'Precision_test': test_prec,
    'Recall_test': test_recall,
    'F1_test': test_f1,
}
run.log(test_scores)
run.finish()

# Rename the timestamped output folder after the wandb run name.
final_out_dir = output_dir + f"{run.name}/"
os.rename(train_out_dir, final_out_dir)