# Landsat Processing
Created by: Oriana Chegwidden

In [2]:
%load_ext autoreload
%autoreload 2
import boto3
from rasterio.session import AWSSession
aws_session = AWSSession(boto3.Session(), requester_pays=True)
from osgeo.gdal import VSICurlClearCache
VSICurlClearCache() 
import rasterio as rio
import xarray as xr
import dask
import os
import fsspec
from satsearch import Search
from matplotlib.pyplot import imshow
from intake import open_stac_item_collection
import osgeo
import numcodecs
import numpy as np

In [3]:
from dask_gateway import Gateway

gateway = Gateway()
options = gateway.cluster_options()
options.worker_cores = 2
options.worker_memory = 32
options.environment = {'AWS_REQUEST_PAYER': 'requester'}
cluster = gateway.new_cluster(cluster_options=options)
cluster.adapt(minimum=1, maximum=10)
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [4]:
client = cluster.get_client()

Each Landsat scene is stored in cloud optimized geotiff (COG) according to a verbose (but once you understand it, human readable!) naming convention. Landsat Collection 2 uses the same naming convention as Collection 1 which is as follows (lifted from their docs at `https://prd-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/atoms/files/LSDS-1656_%20Landsat_Collection1_L1_Product_Definition-v2.pdf`

```LXSS_LLLL_PPPRRR_YYYYMMDD_yyyymmdd_CC_TX```
where
```
L = Landsat  (constant)
X = Sensor  (C = OLI / TIRS, O = OLI-only, T= TIRS-only, E = ETM+, T = TM, M= MSS)
SS = Satellite  (e.g., 04 for Landsat 4, 05 for Landsat 5, 07 for Landsat 7, etc.) 
LLLL = Processing  level  (L1TP, L1GT, L1GS)
PPP  = WRS path
RRR  = WRS row
YYYYMMDD = Acquisition  Year (YYYY) / Month  (MM) / Day  (DD) 
yyyymmdd  = Processing  Year (yyyy) / Month  (mm) / Day (dd)
CC = Collection  number  (e.g., 01, 02, etc.) 
TX= RT for Real-Time, T1 for Tier 1 (highest quality), and T2 for Tier 2

```

Thus, we're looking for scenes coded in the following way:
`LE07_????_PPP_RRR_YYYMMDD_yyyymmdd_02_T1` for Landsat 7 and
`LT05_????_PPP_RRR_YYYMMDD_yyyymmdd_02_T1` for Landsat 5
(but T1 might be wrong there)


We are re-implementing (to the best of our abilities) the methods from Wang et al (in review). Jon Wang's paper said:

```To extend our AGB predictions through space and time, we used time series (1984 – 2014) of 30 m surface reflectance data from the Thematic Mapper onboard Landsat 5 and the Enhanced Thematic Mapper Plus onboard Landsat 7. We used the GLAS-derived estimates of AGB as a response variable and the mean growing season (June, July, August) and non-growing season values for each of Landsat’s six spectral reflectance bands as the predictors in an ensemble machine learning model```

So we'll be looking for:
* Landsat 5 (Thematic mapper) and 7 (Enhanced Thematic Mapper Plus)
* Growing season (June-August) and non-growing season (Sept-May) averages at an annual timestep. <--- will need to figure out around the calendar whether we want consecutive
* All six spectral reflectance bands
* We'll do a quality thresholding of cloudless cover for now based upon their thresholding

In orienting myeslf, these are the potential collection options I've figured out (by poking around here on the [sat-api catalog](https://landsatlook.usgs.gov/sat-api/collections):
* `landsat-c2l2-sr` Landsat Collection 2 Level-2 UTM Surface Reflectance (SR) Product
* `landsat-c2l2alb-sr` Landsat Collection 2 Level-2 Albers Surface Reflectance (SR) Product
* `landsat-c1l2alb-sr` Landsat Collection 1 Level-2 Albers Surface Reflectance (SR) Product <-- we don't want this one (b/c we'll go with collection 2)
* `landsat-c2l1` Landsat Collection 2 Level-1 Product <-- don't think we want this because we want surface reflectance


Run this once to apply the aws session to the rasterio environment

In [5]:
def test_worker_credentials(aws_session):    
#     VSICurlClearCache()
    # this file is the canary in the coal mine
    # if you can't open this one you've got *issues* because it exists!
    # also the instantiation of the environment here
    # might help you turn on the switch of the credentials
    # but maybe that's just anecdotal i hate credential stuff SO MUCH
    # if anyone is reading this message i hope you're enjoying my typing
    # as i wait for my cluster to start up.... hmm....
    canary_file = 's3://usgs-landsat/collection02/level-2/standard/tm/2003/044/029/LT05_L2SP_044029_20030827_20200904_02_T1/LT05_L2SP_044029_20030827_20200904_02_T1_SR_B2.TIF'

    with rio.Env(aws_session):
        with rio.open(canary_file) as src:
            profile = src.profile
            arr = src.read(1)
        

In [6]:
def create_catalog(bbox, time_slice):
    url = 'https://landsatlook.usgs.gov/sat-api/stac/'
    search = Search.search(url=url,
                           bbox=bbox, 
                           time=time_slice,
                           # for some reason the collection subsetting isn't really working and 
                           # I'm still getting landsat l1 stuff- shrug. we can filter elsewhere
                           collection=['landsat-c2l2-sr'])
    items = search.items(limit=7000)
    summary = items.summary()
    catalog = open_stac_item_collection(items)
    return catalog

Question: what is difference between Item and Collection?

Taking a single one of those files and loading it looks like this.

In [7]:
def fix_link(url):
    return url.replace('https://landsatlook.usgs.gov/data', 's3://usgs-landsat')

There are different kinds of QA/QC bands contained in L2SP:
* SR_CLOUD_QA - I think we want this one because anything less than 2 is either just dark dense vegetation or no flags. everything above is stuff like water, snow, cloud (different levels of obscurity). This is the result of the fmask algorithm from Zhu et al.
* QA_PIXEL - this gets a little more specific and goes intot different kinds of clouds. Super interesting but I don't think we want to use it.

Pull in the SR_CLOUD_QA and use as a mask - see Table 5-3 in https://prd-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/atoms/files/LSDS-1370_L4-7_C1-SurfaceReflectance-LEDAPS_ProductGuide-v3.pdf for description of cloud integer values to select which ones to use as drop. For now I'll drop anything greater than 1 (0= no QA concerns and 1 is Dark dense vegetation (DDV)).

In [8]:
def cloud_qa(item):
    import numpy as np
    qa_path = fix_link(item._stac_obj.assets['SR_CLOUD_QA.TIF']['href'])
    cog_mask = xr.open_rasterio(qa_path).squeeze().drop('band')
    return cog_mask

In [9]:
def grab_ds(item, bands_of_interest, cog_mask):
    url_list = [fix_link(item._stac_obj.assets['{}.TIF'.format(band)]['href']) for band in bands_of_interest]
    da_list = []
    for url in url_list:
        da_list.append(xr.open_rasterio(url, chunks={'x': 1280,
                                                    'y': 1280}))
    # combine into one dataset
    ds = xr.concat(da_list, dim='band').to_dataset(dim='band').rename({1: 'reflectance'})
    ds = ds.assign_coords({'band': bands_of_interest})
    ds = ds.where(cog_mask<2).compute()
    return ds

First we make the query using sat-search to find every file in the STAC catalog that we want. We'll store that list of files. We'll do this first for a single tile (in this first exmaple just covering Washington State) but then we'll loop through in 1-degree by 1-degree tiles. 

In [10]:
def create_dataset_list(catalog, aws_session, bands_of_interest='all'):
    if bands_of_interest=='all':
        bands_of_interest = ['SR_B1', 'SR_B2', 'SR_B3', 
                                 'SR_B4', 'SR_B5', 'SR_B7']
    ds_list = []
    for scene_id in list(catalog):
        # only grab the l2sp and landsat5 for now. for some reason
        # satsearch is having difficulty filtering out these.)
        if ('L2SP' in scene_id) and ('LT05' in scene_id):
            item = catalog[scene_id]
            cog_mask = cloud_qa(item)
            ds = grab_ds(item, bands_of_interest, cog_mask)
            ds_list.append(ds)
    return ds_list

In [11]:
def combine(ds_list):
    full_ds = xr.concat(ds_list, dim='scene').mean(dim='scene').compute()
    full_ds = full_ds.chunk({'band': 1, 'x': 1280, 'y': 1280})
    return full_ds

In [12]:
def write_out(ds, mapper, aws_session):
    encoding = {'reflectance': {'compressor': numcodecs.Blosc()}}
    with rio.Env(aws_session):
#         with dask.config.set(scheduler='threads'): # this?
        ds.to_zarr(store=mapper,
                        encoding=encoding, 
                         mode='w')

In [13]:
# @dask.delayed # this?
def process_tile(bbox, time_slice, mapper, aws_session, bands_of_interest='all'):
    # This is the workflow we'll actually pass off to the worker.
    with rio.Env(aws_session):
        test_worker_credentials(aws_session)
        catalog = create_catalog(bbox, time_slice)
        ds_list = create_dataset_list(catalog, aws_session, bands_of_interest=bands_of_interest)
        full_ds = combine(ds_list)
        write_out(full_ds, mapper, aws_session)
    return 'Completed processing of {} for region bounded by '\
            '{} over time period {}'.format(bands_of_interest, bbox, time_slice)

In [4]:
import s3fs
fs = s3fs.S3FileSystem()
fs.ls(PANGEO_SCRATCH)

NameError: name 'PANGEO_SCRATCH' is not defined

In [14]:
years = np.arange(2003,2009)
bands = ['SR_B1', 'SR_B2', 'SR_B3', 
        'SR_B4', 'SR_B5', 'SR_B7']
tile_bboxes = []
lat_box_length = 2
lon_box_length = 1
for lat1 in np.arange(45,49,lat_box_length):
    for lon1 in np.arange(-125,-117,lon_box_length):
        tile_bboxes.append([lon1, lat1, lon1+lon_box_length, lat1+lat_box_length])

In [1]:
#dask.config.set({"array.slicing.split_large_chunks": True})

In [None]:
out = []
bbox = tile_bboxes[0]
lon1, lat1 = bbox[0], bbox[1] 
year = 2003
PANGEO_SCRATCH=os.environ['PANGEO_SCRATCH']

timeslice =f"{year}-06-01T00:00:00Z/{year}-08-31T23:59:59Z"
# file_save_template = 'landsat5/{}/{}/'
mapper = fsspec.get_mapper(f'{PANGEO_SCRATCH}test810.zarr')
band = ['SR_B1']
url = f'{PANGEO_SCRATCH}{lat1}_{lon1}/{year}/{band[0]}.zarr'
mapper = fsspec.get_mapper(url)

# years = np.arange(2003,2009)
# i=0
# for [lon1, lat1, lon2, lat2] in tile_bboxes:
#     for year in years:        
#         for band in bands:
#             time_slice=
#             i+=1
task = process_tile([-125, 45, -124, 46], #bbox
            "2003-06-01T00:00:00Z/2003-08-31T23:59:59Z", #timeslice, 
             mapper, 
            aws_session,
            bands_of_interest=band)
# # dask.compute(task, retries=1)
# out.append(dask.compute(task, retries=1))

Then we take the list of files for a given year to average across growing season and non-growing season.

In [6]:
def read_cog(filepath):
    '''
    Function for hard restart read. If you have trouble reading things
    try this approach to ensure nothing else is funky.
    '''
    # this fixes it - but how do we implement in rasterio?
    VSICurlClearCache()
# does applying the aws_session in the env once just set it permannently?
    with rio.Env(aws_session):
        with rio.open(filepath) as src:
            profile = src.profile
            arr = src.read(1)
    return arr

In [7]:
mean_file = 's3://usgs-landsat/collection02/level-2/standard/tm/2003/047/027/LT05_L2SP_047027_20030629_20200904_02_T1/LT05_L2SP_047027_20030629_20200904_02_T1_SR_B1.TIF'

In [4]:
existing_file = 's3://usgs-landsat/collection02/level-2/standard/tm/2003/044/029/LT05_L2SP_044029_20030827_20200904_02_T1/LT05_L2SP_044029_20030827_20200904_02_T1_SR_B2.TIF'
broken_file = 's3://usgs-landsat/collection02/level-2/standard/tm/2003/044/029/LT05_L2SP_044029_20030827_20200904_02_T1/LT05_L2SP_044029_20030827_20200904_02_T1_SR_B8.TIF'

In [5]:
environment = rio.Env(aws_session)

In [6]:
read_cog(mean_file)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint16)