# 0. Setup

In [1]:
import os

# Directory layout for the Labelbox landcover chips; every path hangs off labelbox_dir.
labelbox_dir = "./data/labelbox"
labelbox_landcover_dir = f'{labelbox_dir}/landcover'
labelbox_landcover_rgb_dir = f'{labelbox_landcover_dir}/rgb'
labelbox_landcover_labels_dir = f'{labelbox_landcover_dir}/labels'
labelbox_landcover_source_dir = f'{labelbox_landcover_dir}/source'
labelbox_landcover_temp_dir = f'{labelbox_landcover_dir}/temp'

# Create the working directories up front (no-op when they already exist).
for _landcover_subdir in (
    labelbox_landcover_labels_dir,
    labelbox_landcover_rgb_dir,
    labelbox_landcover_source_dir,
    labelbox_landcover_temp_dir,
):
    os.makedirs(_landcover_subdir, exist_ok=True)

# Path of the shelve database that tracks the per-chip upload state.
landcover_state_path = f'{labelbox_landcover_dir}/uploadState'
# The cloud-detection model file is resolved against the current working directory.
cloud_detection_model_path = os.path.join(os.getcwd(), 'cloud_detection_model_resnet18_dice_20230301.pth')


In [2]:

# Prefer the key exposed by common.sagemaker_env; when that module cannot be
# loaded, fall back to the LABELBOX_API_KEY environment variable.
try:
    from common.sagemaker_env import LABELBOX_API_KEY
except Exception:
    # `except Exception` (instead of a bare `except:`) keeps the best-effort
    # fallback but lets KeyboardInterrupt / SystemExit propagate.
    # Raises KeyError when the environment variable is not set.
    LABELBOX_API_KEY = os.environ['LABELBOX_API_KEY']

# Identifier of the Labelbox landcover project.
LANDCOVER_PROJECT_ID = "clcrkcvn9281j07xybmww69pn"


# 1. Create chips and upload to Labelbox

In [183]:
%reload_ext autoreload
%autoreload 2

import numpy as np
from patchify import patchify
import rasterio
import rasterio.transform

from common.constants import NODATA_BYTE, NODATA_FLOAT32, RES, S2_BANDS_TIFF_ORDER
import common.utilities.download as download
import common.utilities.imagery as imagery
import common.utilities.masking as masking

PATCH_SIZE = 512


def create_landcover_chips_from_bbox(bbox, start_date, end_date):
    """Download scenes for a bbox/date window and cut them into PATCH_SIZE chips.

    Each downloaded band stack is split into non-overlapping
    PATCH_SIZE x PATCH_SIZE patches. Every patch is run through
    masking.apply_nn_cloud_mask; patches that come back more than 60% masked
    are skipped. For each remaining patch two GeoTIFFs are written:

    * the float32 source bands, under labelbox_landcover_source_dir
    * a uint8 RGB rendering (written with is_cog=True), under labelbox_landcover_rgb_dir

    Args:
        bbox: bounding box forwarded to the download helpers and encoded into
            the chip file names. Presumably [min_x, min_y, max_x, max_y] -- TODO confirm.
        start_date: start of the search window, forwarded to download.get_collection.
        end_date: end of the search window, forwarded to download.get_collection.

    Returns:
        None; chips are written to disk as a side effect.
    """

    collection_path = f'{labelbox_landcover_temp_dir}/collection.json'
    collection = download.get_collection(start_date, end_date, bbox, collection_path, max_cloud_cover=60, max_tile_count=1, min_tile_count=1)
    original_scenes = download.download_collection(collection, bbox, S2_BANDS_TIFF_ORDER, labelbox_landcover_temp_dir, RES)

    # Loop-invariant: compact bbox tag that goes into every chip file name
    # (coords rounded to 2 decimals, '.' dropped, '-' replaced by 'n').
    bbox_str = ''.join([str(round(coord, 2)) for coord in bbox]).replace('.', '').replace('-', 'n')

    for scene in original_scenes:
        print(f'\tpatchifying... {scene}')

        meta = original_scenes[scene]['meta']
        stack_path = original_scenes[scene]["stack_original_tif_path"]

        with rasterio.open(stack_path) as stack_src:
            # A stack smaller than one patch cannot yield any chip.
            if stack_src.width < PATCH_SIZE or stack_src.height < PATCH_SIZE:
                print(f'\t\tskipping... {scene}')
                continue

            # (bands, rows, cols) -> (rows, cols, bands) so patchify tiles the spatial axes
            stack_data = stack_src.read(masked=True).transpose((1, 2, 0))
            transform = stack_src.transform

            source_patches = patchify(stack_data, (PATCH_SIZE, PATCH_SIZE, stack_data.shape[2]), step=PATCH_SIZE)

            for irow in range(source_patches.shape[0]):
                for icol in range(source_patches.shape[1]):
                    source_data = source_patches[irow, icol, 0, :, :, :]
                    # Rebuild the nodata mask on the patch from the nodata sentinel value.
                    source_data = np.ma.array(source_data, mask=(source_data==NODATA_FLOAT32))

                    # keep the source bands without a percentile stretch; that can be applied later if results are poor
                    source_data = source_data.transpose((2, 0, 1))
                    source_data, pct_masked = masking.apply_nn_cloud_mask('', meta, '', cloud_detection_model_path, bbox=bbox, stack_data=source_data)

                    if pct_masked > 0.60:
                        print(f'\t\t{round(pct_masked*100, 2)}% masked; skipping r{irow} c{icol}')
                        continue

                    # NOTE(review): assumes bands 2, 1, 0 of S2_BANDS_TIFF_ORDER are red, green, blue -- confirm
                    rgb_data = source_data[[2, 1, 0], :, :]
                    rgb_data = imagery.normalize_3d_array_percentiles(rgb_data, p_low=0.1, p_high=99.9)
                    # Scale to 0..254. NOTE(review): presumably 255 is reserved for NODATA_BYTE -- confirm
                    rgb_data_norm = (rgb_data * 254).astype(np.uint8)
                    rgb_data_norm[rgb_data_norm > 254] = 254

                    rgb_path = f'{labelbox_landcover_rgb_dir}/{scene}_{bbox_str}_{irow}_{icol}.tif'
                    source_path = f'{labelbox_landcover_source_dir}/{scene}_{bbox_str}_{irow}_{icol}.tif'

                    # offset='ul' returns pixel *corners*. The default ('center') returns pixel
                    # centers, which would shift every chip's bounds by half a pixel.
                    x_min, y_min = rasterio.transform.xy(transform, PATCH_SIZE*(irow+1), PATCH_SIZE*icol, offset='ul')
                    x_max, y_max = rasterio.transform.xy(transform, PATCH_SIZE*irow, PATCH_SIZE*(icol+1), offset='ul')
                    chip_bbox = [x_min, y_min, x_max, y_max]

                    # back to (rows, cols, bands) for write_array_to_tif
                    source_data = source_data.transpose((1, 2, 0))
                    rgb_data_norm = rgb_data_norm.transpose((1, 2, 0))

                    imagery.write_array_to_tif(source_data, source_path, chip_bbox, dtype=np.float32, nodata=NODATA_FLOAT32)
                    imagery.write_array_to_tif(rgb_data_norm, rgb_path, chip_bbox, dtype=np.uint8, nodata=NODATA_BYTE, is_cog=True)
                    

In [184]:
import common.constants as constants
import common.aws.s3 as s3_utils


def save_landcover_rgb_chip_to_s3(rgb_path):
    """Upload an RGB chip to the S3 data bucket and return its URL.

    The object key is ``training/landcover/rgb/<file name>``, where the file
    name is whatever follows the last '/' in ``rgb_path``. The returned URL is
    that key under https://data.smartcarte.earth/.
    """
    _, _, chip_file = rgb_path.rpartition('/')
    s3_key = f'training/landcover/rgb/{chip_file}'
    s3_utils.put_item(rgb_path, constants.S3_DATA_BUCKET, s3_key)
    return f'https://data.smartcarte.earth/{s3_key}'


In [185]:
import datetime
from labelbox import Client, Dataset, DataRow
import glob
import os
import shelve


# Module-level Labelbox client; create_labelbox_landcover_dataset() uses it to
# look up and create datasets.
client = Client(api_key=LABELBOX_API_KEY)
# Handle to the landcover project.
# NOTE(review): `project` is not referenced by any other code visible in this notebook.
project = client.get_project(LANDCOVER_PROJECT_ID)


def create_labelbox_landcover_dataset(prefix=""):
    """Upload not-yet-registered RGB chips to S3 and add them to a new Labelbox dataset.

    Every ``*.tif`` in labelbox_landcover_rgb_dir whose name ends in a digit is
    considered. Chips already flagged ``uploaded_to_labelbox`` in the shelve
    state file (landcover_state_path) are skipped; the rest are uploaded to S3
    and sent to a dataset named ``Landcover <prefix> <YYYYmmdd_HHMM>``. The
    state file is updated after the S3 upload and again after the Labelbox task.

    Args:
        prefix: text placed in the dataset name (e.g. a region name).

    Raises:
        ValueError: if a dataset with the generated name already exists.
    """

    today = datetime.datetime.today().strftime('%Y%m%d_%H%M')
    landcover_dataset_name = f"Landcover {prefix} {today}"

    print(f'\tprocessing {landcover_dataset_name}')

    # The name has minute resolution, so a second run within the same minute collides.
    landcover_dataset = client.get_datasets(where=(Dataset.name==landcover_dataset_name)).get_one()
    if landcover_dataset is not None:
        raise ValueError("landcover dataset already exists; wait a minute")

    rgb_paths = glob.glob(f'{labelbox_landcover_rgb_dir}/*[0-9].tif')
    print(f'{len(rgb_paths)} total chips')

    with shelve.open(landcover_state_path) as upload_state:

        payload = []
        for rgb_path in rgb_paths:
            chip_id = rgb_path.split('/')[-1].replace('.tif', '')
            chip_state = upload_state.get(chip_id)

            if chip_state and chip_state.get('uploaded_to_labelbox', False):
                continue

            s3_href = save_landcover_rgb_chip_to_s3(rgb_path)

            payload.append({
                "chip_id": chip_id,
                "min_zoom": 12,
                "max_zoom": 15,
                "tile_layer_url": s3_href
            })

            # Record the S3 upload right away, before Labelbox is contacted.
            upload_state[chip_id] = {
                'rgb_cog_href': s3_href,
                'uploaded_to_s3': True,
                'uploaded_to_labelbox': False
            }

        if payload:
            print(f'{len(payload)} chips to add to Labelbox')

            landcover_dataset = client.create_dataset(name=landcover_dataset_name)
            datarow_payload = [{DataRow.row_data: row} for row in payload]
            task = landcover_dataset.create_data_rows(datarow_payload)
            # NOTE(review): the task outcome is not inspected; chips are flagged as
            # uploaded below even if Labelbox reported errors -- confirm this is acceptable.
            task.wait_till_done()

            for row in payload:
                upload_state[row["chip_id"]] = {
                    # Each chip keeps its OWN href. Reusing the `s3_href` loop variable
                    # here would stamp every chip with the last chip's URL.
                    'rgb_cog_href': row["tile_layer_url"],
                    'uploaded_to_s3': True,
                    'uploaded_to_labelbox': True
                }
            

In [186]:
import shutil


def create_new_chips(bbox, region_name, dates):
    """Cut chips for every date window, then register the new chips in Labelbox.

    Args:
        bbox: bounding box forwarded to create_landcover_chips_from_bbox.
        region_name: passed as the prefix of the Labelbox dataset name.
        dates: iterable of indexable (start_date, end_date) pairs.
    """
    for date_range in dates:
        create_landcover_chips_from_bbox(bbox, date_range[0], date_range[1])

    # One dataset per call, created after all date windows have been chipped.
    create_labelbox_landcover_dataset(prefix=region_name)
    
    
def clean_up():
    """Empty the temp directory by deleting and re-creating it; do nothing if it is missing."""
    if not os.path.exists(labelbox_landcover_temp_dir):
        return

    shutil.rmtree(labelbox_landcover_temp_dir)
    os.makedirs(labelbox_landcover_temp_dir, exist_ok=True)



In [187]:
import datetime as dt

# One (start, end) search window per year: from the 1st of the given month
# to the 1st of the following month.
dates = [
    (dt.datetime(year, month, 1), dt.datetime(year, month + 1, 1))
    for year, month in ((2019, 1), (2020, 3), (2021, 6), (2022, 9))
]


## Virunga National Park (Volcano Sector) 🌋

In [189]:

# Area of interest for this region.
# NOTE(review): looks like [min_lon, min_lat, max_lon, max_lat] in degrees -- confirm against download.get_collection.
bbox_volcano = [29.037475, -1.445046, 29.345781, -1.269466]
# Cut chips for every window in `dates`, register the new chips in Labelbox, then empty the temp directory.
create_new_chips(bbox_volcano, "Virunga Volcano Sector", dates)
clean_up()

print('done')

./data/labelbox/landcover/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20190123_0_L2A
	patchifying... S2B_35MQU_20190123_0_L2A
		82.63% masked; skipping r0 c3
		95.55% masked; skipping r0 c4
		66.76% masked; skipping r1 c1
		61.21% masked; skipping r1 c2
		68.06% masked; skipping r1 c5
		85.7% masked; skipping r2 c1
		61.22% masked; skipping r2 c2
		67.72% masked; skipping r2 c3
		60.97% masked; skipping r2 c4
		61.49% masked; skipping r2 c5
./data/labelbox/landcover/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20200323_0_L2A
	patchifying... S2A_35MQU_20200323_0_L2A
		65.18% masked; skipping r1 c0
		65.59% masked; skipping r1 c1
		94.49% masked; skipping r2 c0
		96.22% masked; skipping r2 c1
		76.78% masked; skipping r2 c2
./data/labelbox/landcover/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20210611_0_L2A
	patchifying... S2B_35MQU_20210611_0_L2A
./data/labelbox/landcover/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20220929_0_L2A
	p

## Virunga National Park (Ndwali) 🦛

In [192]:

# Area of interest for this region.
# NOTE(review): looks like [min_lon, min_lat, max_lon, max_lat] in degrees -- confirm against download.get_collection.
bbox_ndwali = [29.265225, -0.728130, 29.480121, -0.584068]
# Cut chips for every window in `dates`, register the new chips in Labelbox, then empty the temp directory.
create_new_chips(bbox_ndwali, "Virunga Ndwali", dates)
clean_up()

print('done')

./data/labelbox/landcover/temp/collection.json: {'QV': 1}
	downloading... S2B_35MQV_20190123_0_L2A
	patchifying... S2B_35MQV_20190123_0_L2A
./data/labelbox/landcover/temp/collection.json: {'QV': 1}
	downloading... S2A_35MQV_20200303_0_L2A
	patchifying... S2A_35MQV_20200303_0_L2A
./data/labelbox/landcover/temp/collection.json: {'QV': 1}
	downloading... S2B_35MQV_20210611_0_L2A
	patchifying... S2B_35MQV_20210611_0_L2A
./data/labelbox/landcover/temp/collection.json: {'QV': 1}
	downloading... S2A_35MQV_20220929_0_L2A
	patchifying... S2A_35MQV_20220929_0_L2A
	processing Landcover Virunga Ndwali 20230304_2011
94 total chips
48 chips to add to Labelbox
done


## Virunga National Park (Gorilla Sector) 🦍

In [193]:

# Area of interest for this region.
# NOTE(review): looks like [min_lon, min_lat, max_lon, max_lat] in degrees -- confirm against download.get_collection.
bbox_gorilla = [29.355053, -1.539090, 29.591672, -1.264626]
# Cut chips for every window in `dates`, register the new chips in Labelbox, then empty the temp directory.
create_new_chips(bbox_gorilla, "Virunga Gorilla Sector", dates)
clean_up()

print('done')

./data/labelbox/landcover/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20190123_0_L2A
	patchifying... S2B_35MQU_20190123_0_L2A
		87.31% masked; skipping r2 c3
		68.92% masked; skipping r2 c4
		76.95% masked; skipping r3 c0
		98.55% masked; skipping r3 c1
		95.2% masked; skipping r3 c2
		96.77% masked; skipping r3 c3
		84.76% masked; skipping r3 c4
		94.43% masked; skipping r4 c0
		96.82% masked; skipping r4 c1
		99.94% masked; skipping r4 c2
		100.0% masked; skipping r4 c3
		81.67% masked; skipping r4 c4
./data/labelbox/landcover/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20200323_0_L2A
	patchifying... S2A_35MQU_20200323_0_L2A
		91.79% masked; skipping r2 c0
		79.25% masked; skipping r2 c4
		99.61% masked; skipping r4 c0
		63.23% masked; skipping r4 c4
./data/labelbox/landcover/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20210611_0_L2A
	patchifying... S2B_35MQU_20210611_0_L2A
		79.78% masked; skipping r3 c4
		84.21% masked; skipping r4 c4
./dat

## Virunga National Park (Ishango) 🐟

In [194]:

# Area of interest for this region.
# NOTE(review): looks like [min_lon, min_lat, max_lon, max_lat] in degrees -- confirm against download.get_collection.
bbox_ishango = [29.516798, -0.163506, 29.693365, -0.026481]
# Cut chips for every window in `dates`, register the new chips in Labelbox, then empty the temp directory.
create_new_chips(bbox_ishango, "Virunga Ishango", dates)
clean_up()

print('done')

./data/labelbox/landcover/temp/collection.json: {'QA': 1, 'QV': 1}
	downloading... S2B_35NQA_20190123_0_L2A
	downloading... S2B_35MQV_20190123_0_L2A
	patchifying... S2B_35NQA_20190123_0_L2A
	patchifying... S2B_35MQV_20190123_0_L2A
./data/labelbox/landcover/temp/collection.json: {'QA': 1, 'QV': 1}
	downloading... S2A_35NQA_20200323_0_L2A
	downloading... S2A_35MQV_20200303_0_L2A
	patchifying... S2A_35NQA_20200323_0_L2A
	patchifying... S2A_35MQV_20200303_0_L2A
./data/labelbox/landcover/temp/collection.json: {'QV': 1, 'QA': 1}
	downloading... S2B_35MQV_20210611_0_L2A
	downloading... S2B_35NQA_20210621_0_L2A
	patchifying... S2B_35MQV_20210611_0_L2A
	patchifying... S2B_35NQA_20210621_0_L2A
./data/labelbox/landcover/temp/collection.json: {'QV': 1, 'QA': 1}
	downloading... S2A_35MQV_20220929_0_L2A
	downloading... S2A_35NQA_20220929_0_L2A
	patchifying... S2A_35MQV_20220929_0_L2A
		79.09% masked; skipping r0 c0
	patchifying... S2A_35NQA_20220929_0_L2A
		79.3% masked; skipping r0 c0
	processing L

## Okapi Wildlife Reserve 🌳

In [195]:

# Area of interest for this region.
# NOTE(review): looks like [min_lon, min_lat, max_lon, max_lat] in degrees -- confirm against download.get_collection.
bbox_okapi = [28.524382, 1.376513, 28.716677, 1.515305]
# Cut chips for every window in `dates`, register the new chips in Labelbox, then empty the temp directory.
create_new_chips(bbox_okapi, "Okapi", dates)
clean_up()

print('done')


./data/labelbox/landcover/temp/collection.json: {'PB': 1}
	downloading... S2A_35NPB_20190121_0_L2A
	patchifying... S2A_35NPB_20190121_0_L2A
./data/labelbox/landcover/temp/collection.json: {'PB': 1}
	downloading... S2B_35NPB_20200331_0_L2A
	patchifying... S2B_35NPB_20200331_0_L2A
./data/labelbox/landcover/temp/collection.json: {'PB': 1}
	downloading... S2B_35NPB_20210624_0_L2A
	patchifying... S2B_35NPB_20210624_0_L2A
		99.97% masked; skipping r0 c0
		97.33% masked; skipping r0 c1
		99.88% masked; skipping r0 c2
		99.79% masked; skipping r0 c3
		98.55% masked; skipping r1 c0
		96.82% masked; skipping r1 c1
		99.69% masked; skipping r1 c2
		100.0% masked; skipping r1 c3
		99.9% masked; skipping r2 c0
		94.82% masked; skipping r2 c1
		96.27% masked; skipping r2 c2
		99.95% masked; skipping r2 c3
./data/labelbox/landcover/temp/collection.json: {'PB': 1}
	downloading... S2B_35NPB_20220917_0_L2A
	patchifying... S2B_35NPB_20220917_0_L2A
		96.3% masked; skipping r0 c0
		96.7% masked; skipping r

## Reset chips

In [None]:
import re
import shelve

    
# Drop the stored upload state of every chip whose id starts with S2A_35MQU or
# S2B_35MQU, so the upload step no longer sees them as already uploaded.
with shelve.open(landcover_state_path) as upload_state:

    # Snapshot the matching ids first; the shelf is modified in the loop below.
    stale_chip_ids = [
        chip_id
        for chip_id in upload_state.keys()
        if re.match(r'^(S2A_35MQU|S2B_35MQU)', chip_id)
    ]
    for chip_id in stale_chip_ids:
        del upload_state[chip_id]


# 2. Pull labeled data from Labelbox

In [3]:
from labelbox import Client


def get_data_rows():
    """Export the label rows of the landcover Labelbox project.

    A fresh client is created on every call.

    Returns:
        Whatever ``export_labels(download=True)`` returns for the project.
    """
    labelbox_client = Client(api_key=LABELBOX_API_KEY)
    return labelbox_client.get_project(LANDCOVER_PROJECT_ID).export_labels(download=True)


def is_island(target_polygon, polygon_list):
    """Return True if ``target_polygon`` lies within any polygon of ``polygon_list``.

    Uses ``target_polygon.within(other)`` and stops at the first match; an
    empty list yields False.
    """
    return any(target_polygon.within(candidate) for candidate in polygon_list)


In [9]:
import json
import numpy as np
import rasterio
from rasterio.features import geometry_mask
from shapely.geometry import Polygon

from common.constants import NODATA_BYTE
from common.utilities.imagery import write_array_to_tif

# Class name -> raster value. The value doubles as the paint priority:
# a lower value wins, i.e. it overwrites higher values where polygons overlap.
LANDCOVER_CLASSES = dict(
    unclassified=0,
    cloud=1,
    built=2,
    water=3,
    bare_ground=4,
    burned=5,
    trees=6,
    agriculture=7,
    semi_natural_vegetation=8,
)

# Keep only the exported rows whose workflow task name is "Done".
data_rows = [
    exported_row
    for exported_row in get_data_rows()
    if exported_row['DataRow Workflow Info']['taskName'] == "Done"
]
print(f'{len(data_rows)} done data rows')


# Rasterize the label polygons of every finished row into a uint8 class raster
# with the same shape, transform and bounds as the chip's source GeoTIFF.
for row in data_rows:

    labels = row['Label']['objects']
    metadata = json.loads(row['Labeled Data'])
    rgb_cog_url = metadata['tileLayerUrl']
    # The label and source rasters share the RGB chip's file name.
    file_name = rgb_cog_url.split('/')[-1]

    label_path = f'{labelbox_landcover_labels_dir}/{file_name}'
    source_path = f'{labelbox_landcover_source_dir}/{file_name}'

    with rasterio.open(source_path) as src:
        bbox = list(src.bounds)
        source_shape = src.shape
        source_transform = src.transform
        # nodata mask of the first band only
        source_mask = src.read(masked=True).mask[0, :, :]

    # Only the first coordinate ring of each geometry is used.
    # NOTE(review): assumes label coordinates are in the source raster's CRS -- confirm
    label_polygons = {
        label['featureId']: Polygon(label['geometry']['coordinates'][0])
        for label in labels
    }

    # 0: unclassified
    labels_data = np.zeros(source_shape).astype(np.uint8)
    labels_data = np.ma.array(labels_data, mask=source_mask)

    # Paint from the highest class value to the lowest so that lower values
    # (higher priority) overwrite higher ones where polygons overlap.
    islands = []
    sorted_labels = sorted(labels, key=lambda lab: LANDCOVER_CLASSES[lab['value']], reverse=True)

    print(f'\t{len(sorted_labels)} sorted labels')

    for label in sorted_labels:
        class_idx = LANDCOVER_CLASSES[label['value']]

        label_id = label['featureId']
        polygon = label_polygons[label_id]
        polygon_mask = geometry_mask([polygon], source_shape, source_transform, invert=True)

        # Every polygon except the current one
        other_polygons = [
            other_polygon
            for feature_id, other_polygon in label_polygons.items()
            if feature_id != label_id
        ]

        if is_island(polygon, other_polygons):
            # Polygons nested inside another polygon are painted last (below),
            # so the enclosing polygon cannot overwrite them.
            islands.append((class_idx, polygon_mask))
        else:
            # NOTE(review): assigning into a masked array with the default (soft) mask
            # unmasks the assigned pixels, so labeled nodata pixels become valid -- confirm intended
            labels_data[polygon_mask] = class_idx

    for class_idx, polygon_mask in islands:
        labels_data[polygon_mask] = class_idx

    write_array_to_tif(labels_data, label_path, bbox, dtype=np.uint8, nodata=NODATA_BYTE)


print('done')

1 done data rows
	52 sorted labels
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
semi_natural_vegetation 8
agriculture 7
agriculture 7
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
trees 6
built 2
built 2
built 2
built 2
built 2
built 2
built 2
built 2
built 2
built 2
built 2
built 2
built 2
built 2
cloud 1
cloud 1
cloud 1
cloud 1
cloud 1
unclassified 0
unclassified 0
done
