In [17]:
import os

try:
    from common.sagemaker_env import LABELBOX_API_KEY
except: 
    LABELBOX_API_KEY = os.environ['LABELBOX_API_KEY']
    

In [16]:

labelbox_dir = "./data/labelbox"
clouds_dir = f'{labelbox_dir}/clouds'
source_dir = f'{clouds_dir}/source'
labels_dir = f'{clouds_dir}/labels'
rgb_dir = f'{clouds_dir}/rgb'


In [3]:

def is_island(target_polygon, polygon_list):
    for curr_poly in polygon_list:
        if target_polygon.within(curr_poly):
            return True
        
    return False


In [None]:
%reload_ext autoreload
%autoreload 2

import glob
import numpy as np
from patchify import patchify
import rasterio


from common.constants import NODATA_BYTE, NODATA_FLOAT32
import common.utilities.imagery as imagery
import common.utilities.visualization as visualization


# normalize_3d_array

# 3. upload rgb to S3 and build json for Labelbox
# 4. upload to labelbox
# 5. pull from labelbox and update labels/


import matplotlib.pyplot as plt



PATCH_SIZE = 512

scene_dirs = glob.glob(f'./data/20*/S2*')

for i, scene_dir in enumerate(scene_dirs):
    
    scene = scene_dir.split('/')[-1]
    print(scene)
    
    with rasterio.open(f'{scene_dir}/B08.tif') as src:
        if src.width < 512 or src.height < 512:
            continue
    
    stack_data = []
    transform = None
    tif_paths = sorted(glob.glob(f'{scene_dir}/B*.tif'))
    for path in tif_paths:
        with rasterio.open(path) as src:
            transform = src.transform
            stack_data.append(src.read(1))
            
    stack_data = np.ma.array(stack_data)
    stack_data = imagery.normalize_3d_array(stack_data).transpose((1, 2, 0))
        
    source_patches = patchify(stack_data, (PATCH_SIZE, PATCH_SIZE, stack_data.shape[2]), step=PATCH_SIZE)
    
    for irow in range(source_patches.shape[0]):
        for icol in range(source_patches.shape[1]):
            source_data = source_patches[irow, icol, 0, :, :, :]            
            source_data = np.ma.array(source_data, mask=(source_data==NODATA_FLOAT32))
            
            if source_data.mask.sum() > 0:
                print('skipping:', irow, icol)
                continue
                
            #if irow < 3 or icol < 3:
            #    continue

            rgb_data = source_data[:, :, [2, 1, 0]]
            rgb_data_norm = np.round(np.multiply(rgb_data, 254)).astype(int)
            rgb_data_norm2 = imagery.normalize_0_254_3d_array(rgb_data)
            
            #print('\t', np.min(rgb_data_norm), np.median(rgb_data_norm), np.max(rgb_data_norm))
            #print('\t', np.min(rgb_data_norm2), np.median(rgb_data_norm2), np.max(rgb_data_norm2))
            #print('\t-------------')
            
            #fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 12))
            #visualization.plot_bands(rgb_data1, ax=ax1, bands=[0, 1, 2], transpose=False)
            #visualization.plot_bands(rgb_data2, ax=ax2, bands=[0, 1, 2], transpose=False)
            #raise
                
            source_path = f'{source_dir}/{scene}_{irow}_{icol}.tif'
            rgb_path = f'{rgb_dir}/{scene}_{irow}_{icol}.tif'
                    
            x_min, y_min = rasterio.transform.xy(transform, PATCH_SIZE*(irow+1), PATCH_SIZE*icol)
            x_max, y_max = rasterio.transform.xy(transform, PATCH_SIZE*irow, PATCH_SIZE*(icol+1))             
            bbox = [x_min, y_min, x_max, y_max]
            
            imagery.write_array_to_tif(rgb_data_norm, rgb_path, bbox, dtype=np.uint8, nodata=255, is_cog=True) 
            imagery.write_array_to_tif(source_data, source_path, bbox, dtype=np.float32, nodata=NODATA_FLOAT32) 
                   

S2A_35MQV_20200731_0_L2A
S2A_35MQV_20200820_0_L2A
S2A_35MQU_20211123_0_L2A
S2B_35MQU_20211108_0_L2A
S2B_35MQU_20211029_0_L2A


KeyboardInterrupt: 

## Create scene patches in parallel

In [36]:
%reload_ext autoreload
%autoreload 2

import glob
import multiprocessing
import numpy as np
from patchify import patchify
import rasterio


from common.constants import NODATA_BYTE, NODATA_FLOAT32
import common.utilities.imagery as imagery
import common.utilities.visualization as visualization



PATCH_SIZE = 512


def process_scene(scene_dir):
    
    scene = scene_dir.split('/')[-1]
    
    with rasterio.open(f'{scene_dir}/B08.tif') as src:
        if src.width < 512 or src.height < 512:
            return

    stack_data = []
    transform = None
    tif_paths = sorted(glob.glob(f'{scene_dir}/B*.tif'))
    for path in tif_paths:
        with rasterio.open(path) as src:
            transform = src.transform
            stack_data.append(src.read(1))

    stack_data = np.ma.array(stack_data)
    stack_data = imagery.normalize_3d_array(stack_data).transpose((1, 2, 0))
        
    source_patches = patchify(stack_data, (PATCH_SIZE, PATCH_SIZE, stack_data.shape[2]), step=PATCH_SIZE)
    
    for irow in range(source_patches.shape[0]):
        for icol in range(source_patches.shape[1]):
            source_data = source_patches[irow, icol, 0, :, :, :]            
            source_data = np.ma.array(source_data, mask=(source_data==NODATA_FLOAT32))
            
            if source_data.mask.sum() > 0:
                print('skipping:', irow, icol)
                continue
                
            rgb_data = source_data[:, :, [2, 1, 0]]
            rgb_data_norm = np.round(np.multiply(rgb_data, 255)).astype(np.uint8) + 1
            
            source_path = f'{source_dir}/{scene}_{irow}_{icol}.tif'
            rgb_path = f'{rgb_dir}/{scene}_{irow}_{icol}.tif'
                    
            x_min, y_min = rasterio.transform.xy(transform, PATCH_SIZE*(irow+1), PATCH_SIZE*icol)
            x_max, y_max = rasterio.transform.xy(transform, PATCH_SIZE*irow, PATCH_SIZE*(icol+1))             
            bbox = [x_min, y_min, x_max, y_max]
            
            imagery.write_array_to_tif(rgb_data_norm, rgb_path, bbox, dtype=np.uint8, nodata=0, is_cog=True) 
            imagery.write_array_to_tif(source_data, source_path, bbox, dtype=np.float32, nodata=NODATA_FLOAT32) 
                       
            
            
scene_dirs = glob.glob(f'./data/20*/S2*')
print(f'{len(scene_dirs)} scenes')
with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
    patches = pool.map(process_scene, scene_dirs)
    

103 scenes


In [29]:
%%bash

gdalinfo -stats ./data/labelbox/clouds/rgb/S2A_35MQV_20200731_0_L2A_0_0.tif


Driver: GTiff/GeoTIFF
Files: ./data/labelbox/clouds/rgb/S2A_35MQV_20200731_0_L2A_0_0.tif
Size is 512, 512
Coordinate System is:
GEOGCRS["WGS 84",
    DATUM["World Geodetic System 1984",
        ELLIPSOID["WGS 84",6378137,298.257223563,
            LENGTHUNIT["metre",1]]],
    PRIMEM["Greenwich",0,
        ANGLEUNIT["degree",0.0174532925199433]],
    CS[ellipsoidal,2],
        AXIS["geodetic latitude (Lat)",north,
            ORDER[1],
            ANGLEUNIT["degree",0.0174532925199433]],
        AXIS["geodetic longitude (Lon)",east,
            ORDER[2],
            ANGLEUNIT["degree",0.0174532925199433]],
    ID["EPSG",4326]]
Data axis to CRS axis mapping: 2,1
Origin = (29.242784885946573,-0.591630233123690)
Pixel Size = (0.000089771893148,-0.000090466247379)
Metadata:
  AREA_OR_POINT=Area
Image Structure Metadata:
  INTERLEAVE=PIXEL
  LAYOUT=COG
Corner Coordinates:
Upper Left  (  29.2427849,  -0.5916302) ( 29d14'34.03"E,  0d35'29.87"S)
Lower Left  (  29.2427849,  -0.6379490) ( 29d14'3

In [37]:
%reload_ext autoreload
%autoreload 2

import common.constants as constants
import common.aws.s3 as s3_utils


def save_patch_to_s3(tif_path):
    
    file_name = tif_path.split('/')[-1]    
    object_key = f'clouds/{file_name}'
    href = f'https://data.smartcarte.earth/{object_key}'
    # print(f'uploading {tif_path} to s3://{constants.S3_DATA_BUCKET}/{object_key}')
    s3_utils.put_item(tif_path, constants.S3_DATA_BUCKET, object_key)
    
    return href


In [38]:
import shelve


state_path = "./data/trainCloudsState"
with shelve.open(state_path) as state:

    rgb_paths = glob.glob(f'{rgb_dir}/*.tif')
    print(f'{len(rgb_paths)} paths')
    for i, path in enumerate(rgb_paths):
        
        if i % 200 == 0:
            print(f'{i} done')
            
        job_name = path.split('/')[-1].replace('.tif', '')
        s3_href = save_patch_to_s3(path)
        state[job_name] = {
            'rgb_cog_href': s3_href
        }
    

3632 paths
0 done
200 done
400 done
600 done
800 done
1000 done
1200 done
1400 done
1600 done
1800 done
2000 done
2200 done
2400 done
2600 done
2800 done
3000 done
3200 done
3400 done
3600 done


In [39]:
import datetime
import glob
import json
from labelbox import Client, Dataset, DataRow
import os
import random
import shelve
from uuid import uuid4


client = Client(api_key=LABELBOX_API_KEY)
project = client.get_project("cleamnf3q398707ug5s2z4rp6")

today = datetime.datetime.today().strftime('%Y%m%d')

clouds_dataset_name = f"{today} Clouds"

clouds_dataset = client.get_datasets(where=(Dataset.name==clouds_dataset_name)).get_one()
if clouds_dataset is None:
    clouds_dataset = client.create_dataset(name=clouds_dataset_name)
else:
    raise ValueError("today's clouds dataset already exists")

    
payload = []
with shelve.open(state_path) as state:
    for job_name in state.keys():        
        row = {
            "min_zoom": 12,
            "max_zoom": 14,
            "tile_layer_url": state[job_name]["rgb_cog_href"]
        }
        
        payload.append(row)


payload = [{DataRow.row_data: row} for row in payload]

random.seed(666)
random.shuffle(payload)

start_idx, end_idx = 0, 1000
payload = payload[start_idx:end_idx]

print(f'{len(payload)} data rows')
task = clouds_dataset.create_data_rows(payload)
task.wait_till_done()

print('done')

1000 data rows
done


In [None]:
files = glob.glob("./data/labelbox/clouds/source/*.tif")
print(len(files))

664
