In [1]:
import os
#from sentinelsat import SentinelAPI, read_geojson, geojson_to_wkt, LTATriggered, make_path_filter, LTAError, ServerError
from satsearch import Search
import rasterio as rio
from rasterio.windows import from_bounds, Window
from rasterio.mask import mask
import shutil
import numpy as np
import gdal
import geopandas as gpd
import pandas as pd
#from getpass import getpass
from shapely.geometry import box, mapping
import json
import datetime as dt
from time import sleep
import json
from skimage.morphology import binary_dilation, disk, square
from scipy import ndimage
from glob import glob
from joblib import Parallel, delayed
from skimage.morphology import remove_small_holes

# --- Notebook configuration -------------------------------------------------
# When True, cached products are rebuilt instead of being reused.
overwrite=False

# Date range of the Sentinel-2 search and the asset keys pulled for each scene.
startDate = "2019-06-01"
endDate = "2019-09-01"
target_bands = ["SCL", "B02", "B03", "B04", "B05", "B06", "B07", "B08", "B8A", "B11", "B12"]

# Local folder that receives one sub-folder per downloaded scene.
sentinel_dir = "..\\Sentinel2Data"
os.makedirs(sentinel_dir, exist_ok=True)

# Study region: the tile index is reprojected to EPSG:4326 and reduced to its
# bounding box, which is what the scene search uses.
region = gpd.read_file("../EPCExtent_30cm/Ortho_5kSubIndex.gpkg")
region_4326 = region.to_crs("EPSG:4326").unary_union
footprint = box(*region_4326.bounds)
footprint_wkt = footprint.wkt

# Use the fully qualified option name: the abbreviated "max_columns" matches
# more than one option on newer pandas releases and raises an OptionError.
pd.set_option("display.max_columns", None)

#native resolution is meters. Set buffer to 1/2 mile
buffer = 1609/2

In [14]:
def pullWindowedRaster(inFile, outFile, window):
    """Copy a windowed subset of band 1 of ``inFile`` into ``outFile``.

    Parameters
    ----------
    inFile : str
        Path/URL of the source raster. Whether the name contains "SCL"
        decides how ``window`` is interpreted.
    outFile : str
        Destination path. Nothing is written if it already exists.
    window : rasterio.windows.Window or geopandas.GeoDataFrame
        For an SCL file: a GeoDataFrame whose union, clipped to the raster
        bounds, defines the window. For any other file: a pixel Window that is
        used as-is when the source resolution is 20 and doubled (offsets and
        size) when it is 10.

    Returns
    -------
    rasterio.windows.Window
        The window that was (or would have been) read from ``inFile``.

    Raises
    ------
    ValueError
        If the combination of window type, file name and resolution is not one
        of the supported cases above.
    """
    print(inFile)
    is_scl = "SCL" in inFile
    with rio.open(inFile) as src:
        res = src.res[0]
        kwargs = src.profile
        window_type = type(window)

        if window_type == rio.windows.Window and not is_scl and res in (20, 10):
            # The incoming window is on the 20 m grid; 10 m rasters need it doubled.
            factor = 1 if res == 20 else 2
            read_window = Window(col_off=window.col_off*factor, row_off=window.row_off*factor,
                                 width=window.width*factor, height=window.height*factor)
        elif window_type == gpd.GeoDataFrame and is_scl:
            # Clip the region to the raster footprint, then snap to whole pixels.
            raster_extent = box(*src.bounds)
            region_shape = window.to_crs(src.crs).unary_union
            crs_bounds = region_shape.intersection(raster_extent).bounds
            pixel_window = from_bounds(*crs_bounds, transform=src.transform)
            read_window = pixel_window.round_offsets().round_lengths()
        else:
            print(type(window), "SCL" in inFile, res)
            raise ValueError("Unknown window type. Should be geodataframe for SCL or rio Window for other bands")

        already_written = os.path.exists(outFile)
        if not already_written:
            win_data = src.read(1, window=read_window)
            kwargs.update(
                transform = src.window_transform(read_window),
                height = read_window.height,
                width = read_window.width)

            with rio.open(outFile, "w", **kwargs) as dst:
                dst.write(win_data, 1)

    return read_window


def downloadFilesForScene(row, target_region, bands = ["B02", "B03", "B04", "B08", "B11"]):
    """Download the windowed SCL raster and requested bands for one scene.

    Parameters
    ----------
    row : object with ``id`` and ``assets`` attributes
        One search result; ``row.assets[<band>]["href"]`` is the source raster.
    target_region : geopandas.GeoDataFrame
        Region used to derive the SCL read window.
    bands : list of str
        Asset keys to download in addition to "SCL". The default list is only
        iterated, never modified.

    Returns
    -------
    list of str
        Local paths (SCL first, then one per band). An empty list is returned
        when the scene could not be collected, so callers can drop the scene by
        checking for an empty result.
    """
    outDir = os.path.join(sentinel_dir, row.id)
    os.makedirs(outDir, exist_ok=True)
    files = []
    try:
        scl_loc = row.assets["SCL"]["href"]
        outSCL = os.path.join(outDir, row.id + "_" + os.path.basename(scl_loc))

        # The SCL window drives the windows of every other band.
        scl_window = pullWindowedRaster(scl_loc, outSCL, target_region)
        files.append(outSCL)

        for band in bands:
            file_loc = row.assets[band]["href"]
            outfile = os.path.join(outDir, row.id + "_" + os.path.basename(file_loc))
            # Skip the (remote) open entirely when the band is already on disk.
            if not os.path.exists(outfile):
                pullWindowedRaster(file_loc, outfile, scl_window)

            files.append(outfile)

        print(f"Got Sentinel files for {row.id}")
    except TypeError as e:
        print(f"Problem collecting files for {row.id}\n\t{e}")
        shutil.rmtree(outDir)
        # The scene folder was just removed, so none of the paths gathered so
        # far exist any more; report the scene as empty rather than returning
        # a partial list of dangling paths.
        return []

    return files

def getDatasetValuesInTargetArea(downloads, target_area):
    """Summarise how much of the target area is obscured in a scene's SCL raster.

    Parameters
    ----------
    downloads : list of str
        Files of one scene; the first whose basename contains "SCL" is used.
    target_area : geopandas.GeoDataFrame
        Area of interest; reprojected to the raster CRS when they differ.

    Returns
    -------
    pandas.Series
        ``[num_potential, num_obscured, num_clear, percentObscured]``. When the
        area cannot be masked out of the raster (rasterio raises ValueError),
        the counts are NaN and the scene is reported as 100 percent obscured.
    """
    scl_file = [file for file in downloads if "SCL" in os.path.basename(file)][0]
    with rio.open(scl_file) as src:
        if target_area.crs != src.crs:
            target_area = target_area.to_crs(src.crs)
        polygons = [target_area.unary_union]
        try:
            scl_targetArea, reg_transform = mask(src, polygons, all_touched=True, crop=True, indexes=1)
        except ValueError as e:
            # Report the file that failed (the previous message referenced an
            # undefined name) and keep the return type consistent with the
            # normal path so the caller can expand it into four columns.
            print(f"{e} - {scl_file}")
            return pd.Series([np.nan, np.nan, np.nan, 100])

    cloudValues = [3,8,9,10]
    # 0 are values created from the rasterio mask and correspond to sentinel SCL 0 values for nodata.
    # Values of 1 for SCL are "Saturated or Defective"
    invalidValues_mask = np.isin(scl_targetArea, [0,1])
    cloudValues_mask = np.isin(scl_targetArea, cloudValues)

    num_clear = int(np.count_nonzero(~cloudValues_mask & ~invalidValues_mask))
    num_potential = int(np.count_nonzero(~invalidValues_mask))
    num_obscured = int(np.count_nonzero(~invalidValues_mask & cloudValues_mask))

    # num_clear > 0 implies num_potential > 0, so the division is safe.
    if num_clear != 0:
        percentObscured = round(100*(num_obscured/num_potential),2)
    else:
        percentObscured = 100

    return pd.Series([num_potential, num_obscured, num_clear, percentObscured])


def calcCloudAllTime(a):
    """Flag pixels that are never equal to 2 anywhere along the first axis.

    ``a`` is a stack whose first axis is reduced. A pixel is True when none of
    its values equals 2, except that pixels which are 0 in every layer are
    forced to False.
    """
    never_two = (a != 2).all(axis=0)
    always_zero = (a == 0).all(axis=0)

    return never_two & ~always_zero


def getSizes(binary_array, selem=square(3)):
    """Return, for every pixel, the size of the connected region it belongs to.

    The array is first morphologically opened with ``selem``; the opened array
    is then labelled using ``selem`` as the connectivity structure.

    Parameters
    ----------
    binary_array : array of bool/0-1
        Input mask.
    selem : ndarray
        Footprint used both for the opening and for the labelling.

    Returns
    -------
    ndarray
        Same shape as the input; each pixel holds the pixel count of its
        labelled region (0 for background).
    """
    # Imported here because the notebook's import cell does not provide it.
    from skimage.morphology import binary_opening

    #https://stackoverflow.com/questions/53576830/how-to-find-the-largest-connected-region-using-scipy-ndimage
    # The footprint is passed positionally so the call does not depend on the
    # keyword name, which differs between scikit-image releases.
    binary_array = binary_opening(binary_array, selem)
    label_im, nb_labels = ndimage.label(binary_array, selem)
    # Sum of the mask inside each label == number of pixels in that region.
    sizes = ndimage.sum(binary_array, label_im, range(nb_labels + 1))
    size_img = sizes[label_im]

    return size_img


def maskFiles(downloads, buffer, notClouds, overwrite=False):
    """Write ``*_masked.tif`` copies of a scene's bands with obscured pixels set to nodata.

    Parameters
    ----------
    downloads : list of str
        Files of one scene. The first path containing "SCL" locates the
        matching ``*_cloudBuff.tif`` mask; all other paths are treated as bands.
    buffer : number
        Not used by this function.
    notClouds : str
        Raster in which a value of 1 marks pixels whose obscured flag should be
        overridden to clear.
    overwrite : bool
        Rewrite masked files that already exist.

    Returns
    -------
    list of str
        Paths of the masked band files (existing or newly written).
    """
    scl_inputs = [path for path in downloads if "SCL" in path]
    scl_file = scl_inputs[0].replace(".tif", "_cloudBuff.tif")
    band_files = [path for path in downloads if "SCL" not in path]
    files = []
    with rio.open(scl_file) as sclSrc:
        scl_bad = sclSrc.read(1)

        with rio.open(notClouds) as ncSrc:
            win = from_bounds(*sclSrc.bounds, transform=ncSrc.transform)
            acat = ncSrc.read(1, window=win)
        # Where identified as always obscured (incorrect classification), set to clear (2)
        scl_bad[acat==1] = 2
        # In the cloud-buffer raster a value of 1 means "obscured".
        obscured = scl_bad==1

        for band_file in band_files:
            bandOut_file = band_file.replace(".tif","_masked.tif")
            files.append(bandOut_file)
            keep_existing = os.path.exists(bandOut_file) and not overwrite
            if keep_existing:
                print(f"{os.path.basename(bandOut_file)} already exists")
                continue

            with rio.open(band_file) as bSrc:
                # Read the band resampled onto the mask grid.
                band = bSrc.read(1, out_shape=scl_bad.shape)
                kwargs = bSrc.profile
                # Where identified as obscured, set to no data
                band[obscured] = bSrc.nodata
                with rio.open(bandOut_file, "w", **kwargs) as dst:
                    dst.write(band, 1)
                print(f"Wrote out {os.path.basename(bandOut_file)}")

    return files


def reject_outliers(data, nodata=0, m=2.5):
    """Replace nodata values and outliers along axis 0 with NaN.

    Values equal to ``nodata`` become NaN. Each remaining value is compared to
    the median along axis 0: its absolute deviation, divided by the median
    absolute deviation, must not exceed ``m`` or it is replaced by NaN.

    Returns a new float array; the input is not modified.
    """
    values = data.astype(float)
    values[values == nodata] = np.nan

    abs_dev = np.abs(values - np.nanmedian(values, axis=0))
    scaled_dev = abs_dev / np.nanmedian(abs_dev, axis=0)

    return np.where(scaled_dev > m, np.nan, values)


def buildVRT(files, band, overwrite=False):
    """Build a VRT with each input file as a separate band and return its path.

    The VRT is written to ``../Sentinel2Data/{band}Stack_2019.vrt``. An
    existing VRT is reused unless ``overwrite`` is true.
    """
    stack_VRTfile = f"../Sentinel2Data/{band}Stack_2019.vrt"
    vrt_opts = gdal.BuildVRTOptions(separate=True)

    reuse_existing = os.path.exists(stack_VRTfile) and not overwrite
    if reuse_existing:
        return stack_VRTfile

    vrt = gdal.BuildVRT(stack_VRTfile, files, options=vrt_opts)
    # Drop the reference to the GDAL dataset handle.
    vrt = None
    print(f"Built VRT file {stack_VRTfile}")

    return stack_VRTfile


def createCloudMaskFiles(file, buffer, overwrite=False):
    """Write a buffered, upsampled obscured/clear mask next to an SCL raster.

    The output (``*_cloudBuff.tif``) is coded 1 = obscured (after buffering),
    2 = not obscured, 0 = pixels that were 0 in the source.

    NOTE: relies on a module-level ``scale`` variable that is not a parameter
    of this function; it must be defined before the function is called.

    Parameters
    ----------
    file : str
        Path of the source SCL ``.tif``.
    buffer : number
        Buffer distance, in the raster's resolution units, by which obscured
        regions are dilated.
    overwrite : bool
        Rebuild the output even if it already exists.

    Returns
    -------
    str
        Path of the ``*_cloudBuff.tif`` file.
    """
    ofile = file.replace(".tif", "_cloudBuff.tif")
    if not os.path.exists(ofile) or overwrite:
        print(f"Starting {ofile} - {dt.datetime.now()}")
        with rio.open(file) as src:
            kwargs = src.profile
            trans = src.transform
            # Same origin, pixel size divided by `scale`.
            trans_resamp = rio.Affine(trans.a/scale, trans.b, trans.c,
                                      trans.d, trans.e/scale, trans.f)
            kwargs.update(driver="GTIFF",
                          width = src.width*scale,
                          height = src.height*scale,
                          transform = trans_resamp)
            # Read band 1 resampled to the enlarged grid.
            scl = src.read(1, out_shape=(src.height*scale, src.width*scale))
            # Remember the zero pixels so they can be restored at the end.
            mask = scl == 0
            # Classes treated as obscured.
            clouds = np.isin(scl, [1,2,3,8,9,10])
            # Drop obscured regions of at most 3 source pixels (3*scale**2 upsampled pixels).
            cloudSizes = getSizes(clouds, selem=disk(1))
            clouds[cloudSizes <= 3*(scale**2)] = 0
            # Buffer distance converted to pixels of the upsampled grid.
            bufferSize = int(buffer/(src.res[0]/scale))
            cloud_buff = binary_dilation(clouds, disk(bufferSize))
            # Order matters: obscured, then clear, then restore the zero pixels.
            scl[cloud_buff] = 1
            scl[~cloud_buff] = 2
            scl[mask] = 0
            with rio.open(ofile, "w", **kwargs) as dst:
                dst.write(scl,1)

    return ofile

In [15]:
# Cached table of the scenes that were searched, downloaded and summarised.
sentSearch_file = os.path.join(sentinel_dir, f"SentinelCOGS_EPC2019_{startDate.replace('-','')}to{endDate.replace('-','')}.feather")

# Rebuild when the cache is missing OR a rebuild was requested. (The previous
# "and not overwrite" condition sent overwrite=True to the read branch.)
if not os.path.exists(sentSearch_file) or overwrite:
    search = Search(bbox=[*region_4326.bounds],
                    url='https://earth-search.aws.element84.com/v0',
                    datetime=f'{startDate}/{endDate}',
                    collections=['sentinel-s2-l2a-cogs']
                   )

    print('bbox search: %s items' % search.found())

    items = search.items()

    items_df = gpd.GeoDataFrame(items.geojson()["features"])

    # Keep only scenes whose reported cloud cover is below 20.
    items_df["CloudPerc"] = items_df.properties.apply(lambda p: p["eo:cloud_cover"])
    items_df = items_df[items_df["CloudPerc"] < 20]
    print(items_df.shape)
    # Download the windowed rasters; scenes that returned no files are dropped.
    items_df["Downloads"] = items_df.apply(lambda r: downloadFilesForScene(r, region, target_bands), axis=1)
    items_df = items_df[items_df["Downloads"].str.len() != 0].copy()
    items_df[["NUM_POTENT", "NUM_OBSC", "NUM_CLEAR", "PERCENT_OBSC"]] = items_df.Downloads.apply(lambda d: getDatasetValuesInTargetArea(d, region))
    print(len(items_df))
    items_df["Day"] = items_df.properties.apply(lambda p: p["datetime"].split("T")[0])
    items_df["EPSG"] = items_df.properties.apply(lambda p: p["proj:epsg"])
    items_df.reset_index(drop=True, inplace=True)
    items_df.to_feather(sentSearch_file)
else:
    print(f"Reading in {sentSearch_file}")
    items_df = pd.read_feather(sentSearch_file)

items_df.head(3)

Reading in ..\Sentinel2Data\SentinelCOGS_EPC2019_20190601to20190901.feather


Unnamed: 0,type,stac_version,stac_extensions,id,bbox,geometry,properties,collection,assets,links,CloudPerc,Downloads,NUM_POTENT,NUM_OBSC,NUM_CLEAR,PERCENT_OBSC,Day,EPSG
0,Feature,1.0.0-beta.2,"[eo, view, proj]",S2B_12SVB_20190828_0_L2A,"[-112.0761856373385, 32.4446409087737, -110.89...",{'coordinates': [[[-112.06425617 32.44464091...,"{'constellation': 'sentinel-2', 'created': '20...",sentinel-s2-l2a-cogs,{'AOT': {'href': 'https://sentinel-cogs.s3.us-...,[{'href': 'https://earth-search.aws.element84....,1.65,[..\Sentinel2Data\S2B_12SVB_20190828_0_L2A\S2B...,858556.0,799.0,857757.0,0.09,2019-08-28,32612
1,Feature,1.0.0-beta.2,"[eo, view, proj]",S2B_12SWB_20190828_0_L2A,"[-111.00020441126554, 32.44372276568956, -109....",{'coordinates': [[[-109.83214339 32.44372277...,"{'constellation': 'sentinel-2', 'created': '20...",sentinel-s2-l2a-cogs,{'AOT': {'href': 'https://sentinel-cogs.s3.us-...,[{'href': 'https://earth-search.aws.element84....,1.43,[..\Sentinel2Data\S2B_12SWB_20190828_0_L2A\S2B...,1029533.0,127861.0,901672.0,12.42,2019-08-28,32612
2,Feature,1.0.0-beta.2,"[eo, view, proj]",S2A_12RVV_20190823_0_L2A,"[-112.05488634092909, 30.640635895864275, -110...",{'coordinates': [[[-112.04397869 30.6406359 ...,"{'constellation': 'sentinel-2', 'created': '20...",sentinel-s2-l2a-cogs,{'AOT': {'href': 'https://sentinel-cogs.s3.us-...,[{'href': 'https://earth-search.aws.element84....,2.86,[..\Sentinel2Data\S2A_12RVV_20190823_0_L2A\S2A...,1921150.0,257.0,1920893.0,0.01,2019-08-23,32612


# Create Cloud Mask Files
and identify regions always classified as obscured (cloud, cloud shadow, etc.)

In [16]:
scl_files = glob("../Sentinel2Data/*/*SCL.tif")
# Upsampling factor; read as a global by createCloudMaskFiles.
scale = 2
# Build one buffered cloud mask per SCL file (existing masks are reused).
cloud_files = Parallel(backend="threading", n_jobs=6)(delayed(createCloudMaskFiles)(infile, buffer, overwrite=False) for infile in scl_files)

# Stack all cloud masks as separate bands of one VRT.
cloudBuff_vrt = buildVRT(cloud_files, "CloudBuff")
allCloudAllTime = "../Sentinel2Data/AllCloudStackResamp.tif"

if not os.path.exists(allCloudAllTime) or overwrite:
    with rio.open(cloudBuff_vrt) as src:
        kwargs = src.profile
        kwargs.update(count = 1,
                      driver="GTIFF")

        with rio.open(allCloudAllTime, "w", nbits=1, **kwargs) as dst:
            # Process block by block; each read returns every band of the stack.
            for (i,j), window in src.block_windows():
                cloud_w = src.read(window=window)
                cloud_at = calcCloudAllTime(cloud_w)
                dst.write(cloud_at.astype(np.uint8), 1, window=window)

    # Only report a write when one actually happened.
    print(f"Wrote out to {allCloudAllTime}")

# Using Cloud Masks, Create New Masked Files

In [17]:
# Mask every scene's bands with its buffered cloud mask, four scenes at a time.
masked_files = Parallel(prefer="threads", n_jobs=4)(
    delayed(maskFiles)(scene_files, buffer, notClouds=allCloudAllTime, overwrite=overwrite)
    for scene_files in items_df.Downloads.values
)

S2B_12SVB_20190828_0_L2A_B02_masked.tif already exists
S2B_12SVB_20190828_0_L2A_B03_masked.tif already exists
S2B_12SVB_20190828_0_L2A_B04_masked.tif already exists
S2B_12SVB_20190828_0_L2A_B05_masked.tif already exists
S2B_12SVB_20190828_0_L2A_B06_masked.tif already exists
S2B_12SVB_20190828_0_L2A_B07_masked.tif already exists
S2B_12SVB_20190828_0_L2A_B08_masked.tif already exists
S2B_12SVB_20190828_0_L2A_B8A_masked.tif already exists
S2B_12SVB_20190828_0_L2A_B11_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B02_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B03_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B04_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B05_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B06_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B07_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B08_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B8A_masked.tif already exists
S2B_12SWB_20190828_0_L2A_B11_masked.tif already exists
S2A_12RVV_

## Create VRTs of Masked and Unmasked Bands

In [20]:
def tifsToVRT(band, numExpected):
    """Stack every ``../Sentinel2Data/*/*{band}.tif`` into one VRT.

    Parameters
    ----------
    band : str
        File-name suffix to match (e.g. "B04" or "B04_masked").
    numExpected : int
        Number of files that must be found.

    Returns
    -------
    str
        Path of the VRT returned by ``buildVRT``.

    Raises
    ------
    ValueError
        If the number of matching files differs from ``numExpected``.
    """
    # Sorted so the band order inside the VRT does not depend on the order in
    # which the file system lists the scene folders.
    allFiles = sorted(glob(f"../Sentinel2Data/*/*{band}.tif"))
    if len(allFiles) != numExpected:
        raise ValueError(
            f"Wrong number of {band} files: found {len(allFiles)}, expected {numExpected}"
        )
    stack_VRTfile = buildVRT(allFiles, band)

    return stack_VRTfile

# Map each band to its stacked VRT(s): always "unmasked", plus "masked" for
# every band except SCL.
vrts = {}
num_scenes = len(items_df)

for band in target_bands:
    band_vrts = {"unmasked": tifsToVRT(band, num_scenes)}
    vrts[band] = band_vrts
    if band == "SCL":
        continue
    band_vrts["masked"] = tifsToVRT(band + "_masked", num_scenes)

vrts

## Using VRTs and the all CloudAllTime file, create Mean of Band Across Dates

In [30]:
# Spot check: sample every band of the masked B04 stack at one coordinate.
sample_xy = [[491933.9, 3547027.5]]
with rio.open("../Sentinel2Data/B04_maskedStack_2019.vrt") as src:
    files = src.files
    values = list(src.sample(sample_xy))
values

[array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 2438,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0], dtype=uint16)]

In [23]:
import multiprocessing
import threading
import concurrent.futures
from rasterio._example import compute

In [45]:
%%time
# For every band, write a per-pixel mean across all dates:
#   * normally the mean of the cloud-masked stack;
#   * where the pixel is flagged in the all-cloud-all-time raster, the mean of
#     the unmasked stack after outlier rejection.
bandMeanFiles = []
overwrite=False
for band, files in vrts.items():
    if band == "SCL":
        continue

    band_MeanFile = f"../Sentinel2Data/{band}Mean_{startDate.replace('-','')}to{endDate.replace('-','')}.tif"
    bandMeanFiles.append(band_MeanFile)
    if os.path.exists(band_MeanFile) and not overwrite:
        print(f"Skipping {band}")
        continue

    unmaskedVRT = files["unmasked"]
    maskedVRT = files["masked"]

    print(f"Starting {band}")
    with rio.open(maskedVRT) as bmaskSrc, rio.open(unmaskedVRT) as bSrc, rio.open(allCloudAllTime) as acSrc:

        # The output takes the all-cloud raster's profile, with a uint16 dtype.
        kwargs = acSrc.profile
        kwargs.update(dtype=np.uint16)

        # Ratio between the band resolution and the all-cloud raster resolution.
        scaler = bmaskSrc.res[0]/acSrc.res[0]

        with rio.open(band_MeanFile, "w", **kwargs) as dst:
            # A thread-pool version of this loop was tried previously; the
            # windows are processed sequentially here.
            for _, window in bmaskSrc.block_windows(1):
                outshape = (int(round(window.height*scaler,0)),
                            int(round(window.width*scaler,0)))

                # Same block expressed on the all-cloud raster's pixel grid.
                if scaler != 1:
                    cloudWindow = Window(col_off=window.col_off*scaler, row_off=window.row_off*scaler,
                                         width=window.width*scaler, height=window.height*scaler)
                else:
                    cloudWindow=window

                b_ClMskd = bmaskSrc.read(window=window, out_shape=outshape).astype(float)
                b_Orig = bSrc.read(window=window, out_shape=outshape)
                allClouds = acSrc.read(1, window=cloudWindow, out_shape=outshape)

                # set nodata value (should be zero) to nan
                b_ClMskd[b_ClMskd==bmaskSrc.nodata] = np.nan
                # Mean at each pixel over the masked stack. Pixels with no valid
                # value yield NaN; map that to 0 explicitly before the integer
                # cast, because casting NaN to uint16 is undefined.
                b_ClMskd_mean = np.nan_to_num(np.nanmean(b_ClMskd, axis=0)).astype(np.uint16)

                # where there are clouds all the time, use rej_outliers to filter out real possible bad values
                b_Orig_rej = reject_outliers(b_Orig, nodata=0, m=2.5)
                b_Orig_rej_mean = np.nan_to_num(np.nanmean(b_Orig_rej, axis=0)).astype(np.uint16)

                alwaysCloud = allClouds==1
                b_ClMskd_mean[alwaysCloud] = b_Orig_rej_mean[alwaysCloud]

                dst.write(b_ClMskd_mean, 1, window=cloudWindow)

    print(f"FINISHED {band_MeanFile}")

Skipping B02
Skipping B03
Skipping B04
Skipping B05
Skipping B06
Skipping B07
Skipping B08
Skipping B8A
Skipping B11
Skipping B12
Wall time: 37.5 ms


* 6 minutes 8 seconds with 4 workers
* failed with 12 workers
* 6 minutes 0 seconds with 5 workers

In [9]:
# Count the zero-valued pixels in the first band-mean raster.
with rio.open(bandMeanFiles[0]) as src:
    data = src.read(1)
print(np.count_nonzero(data == 0))

17901334


In [13]:
# Fill holes of zero pixels up to the area threshold, then tally False/True pixels.
nonzero_pixels = data != 0
filled = remove_small_holes(nonzero_pixels, area_threshold=120, connectivity=3)
np.unique(filled, return_counts=True)

(array([False,  True]), array([ 17901180, 123754404], dtype=int64))

In [14]:
# Same check with a larger area threshold.
nonzero_pixels = data != 0
filled = remove_small_holes(nonzero_pixels, area_threshold=1120, connectivity=3)
np.unique(filled, return_counts=True)

(array([False,  True]), array([ 17901180, 123754404], dtype=int64))

In [11]:
# Fill small holes of zero pixels, then give the filled pixels the value 1.
filled = remove_small_holes(data!=0, area_threshold=20, connectivity=3)
# Parentheses are required: `&` binds tighter than `==`, so the unparenthesised
# form compared `data` against `0 & filled` instead of combining the two masks.
data[(data==0) & filled] = 1
data[~filled] = 0
print(len(data[data==0]))

17901334


In [3]:
## Some zero (no-data) values scattered around ponds were not filled/filtered
## in the initial process, so this one-time pass goes back and fixes them.
# NOTE(review): absolute, machine-specific path; consider pointing this at the
# notebook's Sentinel2Data folder instead.
bandMeanFiles = [r"G:\OrthophotosAndElevation\2019\Sentinel2Data\B8AMean_20190601to20190901.tif"]
with rio.open(bandMeanFiles[0]) as src:
    # read(..., masked=True) returns ONE masked array; it must not be unpacked
    # into two names (doing so also shadowed the imported rasterio `mask`).
    data = src.read(1, masked=True)
    # Number of masked pixels. A NaN count is always 0 for an integer raster.
    print(np.ma.count_masked(data))
for file in bandMeanFiles:
    with rio.open(file) as src:
        data = src.read(1)
        kwargs = src.profile

    filled = remove_small_holes(data!=0, area_threshold=20, connectivity=3)
    # Parentheses are required: `&` binds tighter than `==`.
    data[(data==0) & filled] = 1
    data[~filled] = 0

    # Write-back intentionally left disabled after the one-time fix was applied.
    #kwargs.update(dtype=np.uint16, nodata=0)
    #with rio.open(file, "w", **kwargs) as dst:
    #    dst.write(data, 1)

0
