## Set up directories

In [52]:
import os

# Root folder for everything exchanged with Labelbox, and the cloud-project
# folder beneath it.
labelbox_dir = "./data/labelbox"
labelbox_clouds_dir = f'{labelbox_dir}/clouds'

# Per-purpose sub-folders of the cloud project.
labelbox_clouds_labels_dir = f'{labelbox_clouds_dir}/labels'
labelbox_clouds_rgb_dir = f'{labelbox_clouds_dir}/rgb'
labelbox_clouds_source_dir = f'{labelbox_clouds_dir}/source'
labelbox_clouds_temp_dir = f'{labelbox_clouds_dir}/temp'

# Create every working folder up front; existing folders are left untouched.
for _working_dir in (
    labelbox_clouds_labels_dir,
    labelbox_clouds_rgb_dir,
    labelbox_clouds_source_dir,
    labelbox_clouds_temp_dir,
):
    os.makedirs(_working_dir, exist_ok=True)

# Base path of the shelve database that records per-chip upload progress.
state_path = "./data/labelbox/clouds/uploadState"


## Create chips from bounding box and dates

In [53]:
%reload_ext autoreload
%autoreload 2

import multiprocessing
import numpy as np
from patchify import patchify
import rasterio
import rasterio.transform

from common.constants import NODATA_BYTE, NODATA_FLOAT32, RES, S2_BANDS_TIFF_ORDER
import common.utilities.download as download
import common.utilities.imagery as imagery

PATCH_SIZE = 512


def create_chips_from_bbox(bbox, start_date, end_date):
    """Download scenes for a bounding box and cut them into square chips.

    Every scene returned by ``download.download_collection`` is tiled into
    non-overlapping ``PATCH_SIZE`` x ``PATCH_SIZE`` patches. Each patch is
    written twice under the same file name:

    * the full band stack as float32 into ``labelbox_clouds_source_dir``;
    * an 8-bit rendering of bands 2, 1, 0 as a COG into
      ``labelbox_clouds_rgb_dir``.

    Scenes narrower or shorter than one patch are skipped.

    Args:
        bbox: Four coordinates forwarded to the download helpers and used to
            tag the chip file names (presumably [min_x, min_y, max_x, max_y]
            -- confirm against ``common.utilities.download``).
        start_date: Start of the search window, forwarded to ``get_collection``.
        end_date: End of the search window, forwarded to ``get_collection``.
    """
    collection_path = f'{labelbox_clouds_temp_dir}/collection.json'
    collection = download.get_collection(start_date, end_date, bbox, collection_path, max_cloud_cover=100, max_tile_count=1, min_tile_count=1)
    original_scenes = download.download_collection(collection, bbox, S2_BANDS_TIFF_ORDER, labelbox_clouds_temp_dir, RES)

    # File-name tag derived from the bbox: coordinates rounded to 2 decimals,
    # dots dropped and minus signs spelled "n". It is the same for every scene.
    bbox_str = ''.join([str(round(coord, 2)) for coord in bbox]).replace('.', '').replace('-', 'n')

    for scene in original_scenes:
        print(f'\tpatchifying... {scene}')

        stack_path = original_scenes[scene]["stack_original_tif_path"]
        with rasterio.open(stack_path) as src:
            # A scene smaller than one patch cannot yield any chip.
            if src.width < PATCH_SIZE or src.height < PATCH_SIZE:
                print(f'\t\tskipping... {scene}')
                continue

            # rasterio reads (bands, rows, cols); patchify wants bands last.
            stack_data = src.read().transpose((1, 2, 0))
            transform = src.transform

        source_patches = patchify(stack_data, (PATCH_SIZE, PATCH_SIZE, stack_data.shape[2]), step=PATCH_SIZE)

        for irow in range(source_patches.shape[0]):
            for icol in range(source_patches.shape[1]):
                source_data = source_patches[irow, icol, 0, :, :, :]

                # Bands 2, 1, 0 are presumably red, green, blue in
                # S2_BANDS_TIFF_ORDER -- TODO confirm.
                rgb_data = source_data[:, :, [2, 1, 0]]
                # Clamp BEFORE the cast: casting first lets values outside
                # [0, 254] wrap around in uint8, so bright pixels turn dark.
                # The 254 ceiling presumably keeps 255 free for NODATA_BYTE.
                rgb_data_norm = np.clip(rgb_data * 254, 0, 254).astype(np.uint8)

                rgb_path = f'{labelbox_clouds_rgb_dir}/{scene}_{bbox_str}_{irow}_{icol}.tif'
                source_path = f'{labelbox_clouds_source_dir}/{scene}_{bbox_str}_{irow}_{icol}.tif'

                # offset='ul' returns pixel corners; the default ('center')
                # would shift the chip bounds by half a pixel.
                x_min, y_min = rasterio.transform.xy(transform, PATCH_SIZE * (irow + 1), PATCH_SIZE * icol, offset='ul')
                x_max, y_max = rasterio.transform.xy(transform, PATCH_SIZE * irow, PATCH_SIZE * (icol + 1), offset='ul')
                chip_bbox = [x_min, y_min, x_max, y_max]

                imagery.write_array_to_tif(source_data, source_path, chip_bbox, dtype=np.float32, nodata=NODATA_FLOAT32)
                imagery.write_array_to_tif(rgb_data_norm, rgb_path, chip_bbox, dtype=np.uint8, nodata=NODATA_BYTE, is_cog=True)
                    

In [54]:
import common.constants as constants
import common.aws.s3 as s3_utils


def save_rgb_chip_to_s3(rgb_path):
    """Upload one RGB chip to the data bucket and return its URL.

    The object key is ``training/clouds/rgb/<file name>``, where the file
    name is the last ``/``-separated component of ``rgb_path``. The returned
    URL is that key under ``https://data.smartcarte.earth/``.
    """
    chip_file = rgb_path.rsplit('/', 1)[-1]
    object_key = f'training/clouds/rgb/{chip_file}'
    s3_utils.put_item(rgb_path, constants.S3_DATA_BUCKET, object_key)
    return f'https://data.smartcarte.earth/{object_key}'


In [55]:
import datetime
from labelbox import Client, Dataset, DataRow
import glob
import os
import shelve


# Prefer the key provided by the project's SageMaker environment module. When
# that module (or the name) is not importable, fall back to the
# LABELBOX_API_KEY environment variable. Only ImportError is caught so that
# interrupts and unrelated failures are not silently swallowed.
try:
    from common.sagemaker_env import LABELBOX_API_KEY
except ImportError:
    LABELBOX_API_KEY = os.environ['LABELBOX_API_KEY']

# Labelbox project that receives the cloud chips.
CLOUD_PROJECT_ID = "cleamnf3q398707ug5s2z4rp6"


client = Client(api_key=LABELBOX_API_KEY)
project = client.get_project(CLOUD_PROJECT_ID)


def create_labelbox_dataset(prefix=""):
    """Push pending RGB chips to S3 and register them in a new Labelbox dataset.

    Every ``*[0-9].tif`` file in ``labelbox_clouds_rgb_dir`` that the shelve
    at ``state_path`` does not already mark as ``uploaded_to_labelbox`` is
    uploaded to S3 and queued. If anything was queued, a dataset named
    ``"Clouds <prefix> <YYYYmmdd_HHMM>"`` is created, the queued rows are
    added to it, and the shelve entries are flagged as uploaded.

    Args:
        prefix: Free text inserted into the dataset name (e.g. a region name).

    Raises:
        ValueError: If a dataset with the generated name already exists. The
            name has minute resolution, so this happens on a second call
            within the same minute.
    """
    today = datetime.datetime.today().strftime('%Y%m%d_%H%M')
    clouds_dataset_name = f"Clouds {prefix} {today}"

    clouds_dataset = client.get_datasets(where=(Dataset.name==clouds_dataset_name)).get_one()
    if clouds_dataset is not None:
        raise ValueError("cloud dataset already exists; wait a minute")

    rgb_paths = glob.glob(f'{labelbox_clouds_rgb_dir}/*[0-9].tif')
    print(f'{len(rgb_paths)} total chips')

    with shelve.open(state_path) as upload_state:

        payload = []
        for rgb_path in rgb_paths:
            chip_id = rgb_path.split('/')[-1].replace('.tif', '')
            chip_state = upload_state.get(chip_id)

            # Already registered in Labelbox by an earlier run.
            if chip_state and chip_state.get('uploaded_to_labelbox', False):
                continue

            s3_href = save_rgb_chip_to_s3(rgb_path)

            payload.append({
                "chip_id": chip_id,
                "min_zoom": 10,
                "max_zoom": 14,
                "tile_layer_url": s3_href
            })

            # Record the S3 upload immediately so the href survives a later
            # Labelbox failure.
            upload_state[chip_id] = {
                'rgb_cog_href': s3_href,
                'uploaded_to_s3': True,
                'uploaded_to_labelbox': False
            }

        if payload:
            print(f'{len(payload)} chips to add to Labelbox')

            clouds_dataset = client.create_dataset(name=clouds_dataset_name)
            datarow_payload = [{DataRow.row_data: row} for row in payload]
            task = clouds_dataset.create_data_rows(datarow_payload)
            task.wait_till_done()
            # NOTE(review): the task outcome is not inspected here; rows are
            # flagged as uploaded even if Labelbox reported errors -- confirm
            # whether a status/errors check is wanted.

            for row in payload:
                # Use each row's own href. The loop variable `s3_href` from
                # the upload loop above still holds the LAST chip's URL.
                upload_state[row["chip_id"]] = {
                    'rgb_cog_href': row["tile_layer_url"],
                    'uploaded_to_s3': True,
                    'uploaded_to_labelbox': True
                }
            

# Create new chips 🐿️🐿️🐿️

In [56]:
import shutil


def create_new_chips(bbox, region_name, dates):
    """Chip a region for each date window, then register the chips in Labelbox.

    Args:
        bbox: Bounding box forwarded to ``create_chips_from_bbox``.
        region_name: Used as the Labelbox dataset name prefix.
        dates: Iterable of indexable pairs; element 0 is the start date and
            element 1 the end date of a search window.
    """
    for window in dates:
        create_chips_from_bbox(bbox, window[0], window[1])

    create_labelbox_dataset(prefix=region_name)
    
    
def clean_up():
    """Empty the temp download folder by deleting and recreating it.

    Does nothing when the folder does not exist.
    """
    if not os.path.exists(labelbox_clouds_temp_dir):
        return

    shutil.rmtree(labelbox_clouds_temp_dir)
    os.makedirs(labelbox_clouds_temp_dir, exist_ok=True)



In [57]:
import datetime as dt

# Search windows shared by every region below: one per year from 2019 to 2022,
# each running from the first of a month to the first of the following month.
dates = [
    (dt.datetime(year, month, 1), dt.datetime(year, month + 1, 1))
    for year, month in ((2019, 1), (2020, 3), (2021, 6), (2022, 9))
]


## Boma National Park 🐘

In [58]:
import datetime as dt

# Bounding box for Boma National Park; presumably
# [min_lon, min_lat, max_lon, max_lat] -- TODO confirm.
bbox_boma = [33.494145, 6.592713, 33.730720, 6.753140]
# Chip every window in `dates`, then register the new chips in a Labelbox
# dataset whose name is prefixed with "Boma".
create_new_chips(bbox_boma, "Boma", dates)
# Empty the temp download folder before the next region runs.
clean_up()

print('done')

./data/labelbox/clouds/temp/collection.json: {'WN': 1}
	downloading... S2A_36NWN_20190115_0_L2A
	patchifying... S2A_36NWN_20190115_0_L2A
./data/labelbox/clouds/temp/collection.json: {'WN': 1}
	downloading... S2B_36NWN_20200325_0_L2A
	patchifying... S2B_36NWN_20200325_0_L2A
./data/labelbox/clouds/temp/collection.json: {'WN': 1}
	downloading... S2B_36NWN_20210608_0_L2A
	patchifying... S2B_36NWN_20210608_0_L2A
./data/labelbox/clouds/temp/collection.json: {'WN': 1}
	downloading... S2A_36NWN_20220906_0_L2A
	patchifying... S2A_36NWN_20220906_0_L2A
60 total chips
60 chips to add to Labelbox
done


## Virunga National Park (Gorilla Sector) 🦍

In [59]:
import datetime as dt

# Bounding box for the Virunga gorilla sector; presumably
# [min_lon, min_lat, max_lon, max_lat] -- TODO confirm.
bbox_virunga_gorilla_sector = [29.397261, -1.464377, 29.55281, -1.366300]
# Chip every window in `dates`, then register the new chips in a Labelbox
# dataset whose name is prefixed with "Virunga Gorilla Sector".
create_new_chips(bbox_virunga_gorilla_sector, "Virunga Gorilla Sector", dates)
# Empty the temp download folder before the next region runs.
clean_up()

print('done')

./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20190123_0_L2A
	patchifying... S2B_35MQU_20190123_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20200323_0_L2A
	patchifying... S2A_35MQU_20200323_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20210611_0_L2A
	patchifying... S2B_35MQU_20210611_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20220929_0_L2A
	patchifying... S2A_35MQU_20220929_0_L2A
84 total chips
24 chips to add to Labelbox
done


## Virunga National Park (Volcano Sector) 🌋

In [60]:
import datetime as dt

# Bounding box for the Virunga volcano sector; presumably
# [min_lon, min_lat, max_lon, max_lat] -- TODO confirm.
bbox_volcano = [29.037465, -1.445036, 29.364781, -1.217466]
# Chip every window in `dates`, then register the new chips in a Labelbox
# dataset whose name is prefixed with "Virunga Volcano Sector".
create_new_chips(bbox_volcano, "Virunga Volcano Sector", dates)
# Empty the temp download folder before the next region runs.
clean_up()

print('done')

./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20190123_0_L2A
	patchifying... S2B_35MQU_20190123_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20200323_0_L2A
	patchifying... S2A_35MQU_20200323_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20210611_0_L2A
	patchifying... S2B_35MQU_20210611_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20220929_0_L2A
	patchifying... S2A_35MQU_20220929_0_L2A
196 total chips
112 chips to add to Labelbox
done


## Virunga National Park (Central Sector) 🦛

In [64]:
import datetime as dt

# Bounding box for the Virunga central sector; presumably
# [min_lon, min_lat, max_lon, max_lat] -- TODO confirm.
bbox_virunga_central_sector = [29.258226, -0.760528, 29.545592, -0.583160]
# Chip every window in `dates`, then register the new chips in a Labelbox
# dataset whose name is prefixed with "Virunga Central Sector".
create_new_chips(bbox_virunga_central_sector, "Virunga Central Sector", dates)
# Empty the temp download folder before the next region runs.
clean_up()

print('done')

./data/labelbox/clouds/temp/collection.json: {'QV': 1}
	downloading... S2B_35MQV_20190123_0_L2A
	patchifying... S2B_35MQV_20190123_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QV': 1}
	downloading... S2A_35MQV_20200303_0_L2A
	patchifying... S2A_35MQV_20200303_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QV': 1}
	downloading... S2B_35MQV_20210611_0_L2A
	patchifying... S2B_35MQV_20210611_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QV': 1}
	downloading... S2A_35MQV_20220929_0_L2A
	patchifying... S2A_35MQV_20220929_0_L2A
324 total chips
72 chips to add to Labelbox
done


## Goma 🏙️

In [62]:
import datetime as dt

# Bounding box for Goma; presumably
# [min_lon, min_lat, max_lon, max_lat] -- TODO confirm.
bbox_goma = [29.084695, -1.708073, 29.310951, -1.605942]
# Chip every window in `dates`, then register the new chips in a Labelbox
# dataset whose name is prefixed with "Goma".
create_new_chips(bbox_goma, "Goma", dates)
# Empty the temp download folder before the next region runs.
clean_up()

print('done')

./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20190123_0_L2A
	patchifying... S2B_35MQU_20190123_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20200323_0_L2A
	patchifying... S2A_35MQU_20200323_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2B_35MQU_20210611_0_L2A
	patchifying... S2B_35MQU_20210611_0_L2A
./data/labelbox/clouds/temp/collection.json: {'QU': 1}
	downloading... S2A_35MQU_20220929_0_L2A
	patchifying... S2A_35MQU_20220929_0_L2A
228 total chips
32 chips to add to Labelbox
done


## Zakouma National Park 🦒

In [63]:
import datetime as dt

# Bounding box for Zakouma National Park; presumably
# [min_lon, min_lat, max_lon, max_lat] -- TODO confirm.
bbox_zakouma = [19.742523, 10.831293, 19.903319, 10.960331]
# Chip every window in `dates`, then register the new chips in a Labelbox
# dataset whose name is prefixed with "Zakouma".
create_new_chips(bbox_zakouma, "Zakouma", dates)
# Empty the temp download folder before the next region runs.
clean_up()

print('done')

./data/labelbox/clouds/temp/collection.json: {'CT': 1, 'CS': 1}
	downloading... S2A_34PCT_20190123_1_L2A
	downloading... S2A_34PCS_20190123_0_L2A
	patchifying... S2A_34PCT_20190123_1_L2A
	patchifying... S2A_34PCS_20190123_0_L2A
		skipping... S2A_34PCS_20190123_0_L2A
./data/labelbox/clouds/temp/collection.json: {'CT': 1, 'CS': 1}
	downloading... S2B_34PCT_20200303_0_L2A
	downloading... S2B_34PCS_20200323_0_L2A
	patchifying... S2B_34PCT_20200303_0_L2A
	patchifying... S2B_34PCS_20200323_0_L2A
		skipping... S2B_34PCS_20200323_0_L2A
./data/labelbox/clouds/temp/collection.json: {'CS': 1, 'CT': 1}
	downloading... S2B_34PCS_20210626_0_L2A
	downloading... S2B_34PCT_20210626_0_L2A
	patchifying... S2B_34PCS_20210626_0_L2A
		skipping... S2B_34PCS_20210626_0_L2A
	patchifying... S2B_34PCT_20210626_0_L2A
./data/labelbox/clouds/temp/collection.json: {'CT': 1, 'CS': 1}
	downloading... S2B_34PCT_20220919_0_L2A
	downloading... S2B_34PCS_20220919_0_L2A
	patchifying... S2B_34PCT_20220919_0_L2A
	patchifying

## Add old data to new Labelbox flow

In [None]:
import glob
import shutil


# Source chips produced by the earlier merged-training-data workflow.
old_source_paths = glob.glob('./data/mergedCloudTrainingData/original/source/S2*[0-9].tif')

# Copy each one (contents plus file metadata) into the Labelbox source folder
# under its original file name.
for legacy_chip_path in old_source_paths:
    shutil.copy2(legacy_chip_path, labelbox_clouds_source_dir)


## Pull labeled data from Labelbox

In [None]:
from labelbox import Client


def get_data_rows():
    """Export the labels of the cloud project from Labelbox.

    A fresh client is built from ``LABELBOX_API_KEY`` and the project is
    looked up by ``CLOUD_PROJECT_ID``. The result of
    ``export_labels(download=True)`` is returned unchanged.
    """
    labelbox_client = Client(api_key=LABELBOX_API_KEY)
    cloud_project = labelbox_client.get_project(CLOUD_PROJECT_ID)
    return cloud_project.export_labels(download=True)


def is_island(target_polygon, polygon_list):
    """Return True when ``target_polygon`` is within any polygon of the list.

    Containment is decided by ``target_polygon.within(candidate)``. An empty
    list yields False.
    """
    return any(target_polygon.within(candidate) for candidate in polygon_list)


In [None]:
import json
import numpy as np
import rasterio
from rasterio.features import geometry_mask
from shapely.geometry import Polygon


from common.utilities.imagery import write_array_to_tif


import os


# Raster value written for each Labelbox class name.
CLOUD_CLASSES = {
    'no_cloud': 0,
    'cloud': 1
}

# Only rows whose workflow task is "Done" are rasterised.
data_rows = get_data_rows()
data_rows = [dr for dr in data_rows if dr['DataRow Workflow Info']['taskName'] == "Done"]
print(f'{len(data_rows)} done data rows')

for row in data_rows:

    labels = row['Label']['objects']
    metadata = json.loads(row['Labeled Data'])
    rgb_cog_url = metadata['tileLayerUrl']
    file_name = rgb_cog_url.split('/')[-1]

    # The source chip and the label raster share the RGB chip's file name.
    composite_path = f'{labelbox_clouds_source_dir}/{file_name}'
    label_path = f'{labelbox_clouds_labels_dir}/{file_name}'

    # Orphan label: the row points at a chip with no local source raster.
    # There is no grid to rasterise onto, so skip the row instead of crashing.
    if not os.path.exists(composite_path):
        print(f'\tskipping orphan label (no source chip)... {file_name}')
        continue

    # The label raster takes its bounds, shape and transform from the source chip.
    with rasterio.open(composite_path) as src:
        bbox = list(src.bounds)
        composite_shape = src.shape
        composite_transform = src.transform

    # Only the first coordinate ring of each geometry is used.
    label_polygons = {
        label['featureId']: Polygon(label['geometry']['coordinates'][0])
        for label in labels
    }

    # 0: no_cloud, 1: cloud
    labels_data = np.zeros(composite_shape).astype(np.uint8)

    # Paint order matters. Sorting class names in descending order puts
    # 'no_cloud' before 'cloud', so cloud polygons overwrite no_cloud ones.
    # Polygons lying inside another polygon ("islands") are deferred and
    # painted last, so the polygon containing them cannot overwrite them.
    islands = []
    sorted_labels = sorted(labels, key=lambda lab: lab['value'], reverse=True)
    for label in sorted_labels:
        class_idx = CLOUD_CLASSES[label['value']]

        label_id = label['featureId']
        polygon = label_polygons[label_id]
        polygon_mask = geometry_mask([polygon], composite_shape, composite_transform, invert=True)

        other_polygons = [
            other for other_id, other in label_polygons.items()
            if other_id != label_id
        ]

        if is_island(polygon, other_polygons):
            islands.append((class_idx, polygon_mask))
        else:
            labels_data[polygon_mask] = class_idx

    for class_idx, polygon_mask in islands:
        labels_data[polygon_mask] = class_idx

    write_array_to_tif(labels_data, label_path, bbox, dtype=np.uint8, nodata=255)


print('done')