# Prep datasets for training

Created by: Oriana Chegwidden


In [1]:
%load_ext autoreload
%autoreload 2

from pyproj import CRS
import boto3
from rasterio.session import AWSSession
from s3fs import S3FileSystem
# Requester-pays AWS session; handed to rasterio (`rio.Env`) in a later cell.
# Uses the default boto3 profile -- pass profile_name=... to boto3.Session to override.
_boto_session = boto3.Session()
aws_session = AWSSession(_boto_session, requester_pays=True)
# Requester-pays S3 filesystem used for the bucket listings in later cells.
fs = S3FileSystem(requester_pays=True)
import xgboost as xgb

from osgeo.gdal import VSICurlClearCache
import rasterio as rio
import numpy as np
import xarray as xr
import dask
import os
import fsspec
import geopandas as gpd
import rioxarray # for the extension to load
import matplotlib.pyplot as plt
import utm
import pandas as pd
from datetime import datetime
import json
import zarr
import awswrangler as wr
from dask_gateway import Gateway
from carbonplan_trace.v1.landsat_preprocess import access_credentials, test_credentials
from carbonplan_trace.v1.inference import predict, predict_delayed
from carbonplan_trace.v1 import utils
from carbonplan_trace.v1.training_prep import prep_training_dataset, prep_training_dataset_delayed, add_parquet_urls

  from distributed.utils import LoopRunner, format_bytes


In [2]:
# Choose where the Dask workers run: "local" (this machine) or "remote"
# (a cluster requested through Dask Gateway).
# kind_of_cluster = "local"
kind_of_cluster = "remote"
if kind_of_cluster == "local":
    # Spin up a local cluster; must be on a big enough machine.
    from dask.distributed import Client

    client = Client(n_workers=1, threads_per_worker=1)  # _per_worker=4
    # Expose the underlying cluster under the same name the remote branch
    # binds, so later cells that reference `cluster` do not raise NameError.
    cluster = client.cluster
else:
    gateway = Gateway()
    options = gateway.cluster_options()
    # Environment variables set through the gateway cluster options.
    options.environment = {
        "AWS_REQUEST_PAYER": "requester",
        "AWS_REGION_NAME": "us-west-2",
    }
    options.worker_cores = 1
    options.worker_memory = 200
    options.image = "carbonplan/trace-python-notebook:latest"
    cluster = gateway.new_cluster(cluster_options=options)
    # Alternative to the fixed size below: cluster.adapt(minimum=2, maximum=100)
    cluster.scale(100)
    

In [3]:
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [4]:
# Obtain a Dask client from the cluster object; the bare expression on the
# last line displays the client's summary as the cell output.
client = cluster.get_client()
client

0,1
Connection method: Cluster object,Cluster type: GatewayCluster
Dashboard: /services/dask-gateway/clusters/prod.d6dafd1247944fc9a1a43c574fb10d33/status,


Each Landsat scene is stored in cloud optimized geotiff (COG) according to a
verbose (but once you understand it, human readable!) naming convention. Landsat
Collection 2 uses the same naming convention as Collection 1 which is as follows
(lifted from their docs at
`https://prd-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/atoms/files/LSDS-1656_%20Landsat_Collection1_L1_Product_Definition-v2.pdf`):

`LXSS_LLLL_PPPRRR_YYYYMMDD_yyyymmdd_CC_TX` where

```
L = Landsat  (constant)
X = Sensor  (C = OLI / TIRS, O = OLI-only, T= TIRS-only, E = ETM+, T = TM, M= MSS)
SS = Satellite  (e.g., 04 for Landsat 4, 05 for Landsat 5, 07 for Landsat 7, etc.)
LLLL = Processing  level  (L1TP, L1GT, L1GS)
PPP  = WRS path
RRR  = WRS row
YYYYMMDD = Acquisition  Year (YYYY) / Month  (MM) / Day  (DD)
yyyymmdd  = Processing  Year (yyyy) / Month  (mm) / Day (dd)
CC = Collection  number  (e.g., 01, 02, etc.)
TX= RT for Real-Time, T1 for Tier 1 (highest quality), and T2 for Tier 2

```

Thus, we're looking for scenes coded in the following way:
`LE07_????_PPPRRR_YYYYMMDD_yyyymmdd_02_T1` for Landsat 7 and
`LT05_????_PPPRRR_YYYYMMDD_yyyymmdd_02_T1` for Landsat 5 (but T1 might be wrong
there)


We are re-implementing (to the best of our abilities) the methods from Wang et
al (in review). Jon Wang's paper said:

`To extend our AGB predictions through space and time, we used time series (1984 – 2014) of 30 m surface reflectance data from the Thematic Mapper onboard Landsat 5 and the Enhanced Thematic Mapper Plus onboard Landsat 7. We used the GLAS-derived estimates of AGB as a response variable and the mean growing season (June, July, August) and non-growing season values for each of Landsat’s six spectral reflectance bands as the predictors in an ensemble machine learning model`

So we'll be looking for:

- Landsat 5 (Thematic mapper) and 7 (Enhanced Thematic Mapper Plus)
- Growing season (June-August) and non-growing season (Sept-May) averages at an
  annual timestep. <--- TODO: decide whether the non-growing season should be
  built from consecutive months that span the calendar-year boundary
- All six spectral reflectance bands
- We'll do a quality thresholding of cloudless cover for now based upon their
  thresholding


In orienting myself, these are the potential collection options I've figured out
(by poking around on the
[sat-api catalog](https://landsatlook.usgs.gov/sat-api/collections)):

- `landsat-c2l2-sr` Landsat Collection 2 Level-2 UTM Surface Reflectance (SR)
  Product
- `landsat-c2l2alb-sr` Landsat Collection 2 Level-2 Albers Surface Reflectance
  (SR) Product
- `landsat-c1l2alb-sr` Landsat Collection 1 Level-2 Albers Surface Reflectance
  (SR) Product <-- we don't want this one (b/c we'll go with collection 2)
- `landsat-c2l1` Landsat Collection 2 Level-1 Product <-- don't think we want
  this because we want surface reflectance


Run this once to apply the aws session to the rasterio environment


There are different kinds of QA/QC bands contained in L2SP:

- SR_CLOUD_QA - I think we want this one because anything less than 2 is either
  just dark dense vegetation or no flags. Everything above is stuff like water,
  snow, cloud (different levels of obscurity). This is the result of the fmask
  algorithm from Zhu et al.
- QA_PIXEL - this gets a little more specific and goes into different kinds of
  clouds. Super interesting but I don't think we want to use it.


Pull in the SR_CLOUD_QA band and use it as a mask - see Table 5-3 in
https://prd-wret.s3.us-west-2.amazonaws.com/assets/palladium/production/atoms/files/LSDS-1370_L4-7_C1-SurfaceReflectance-LEDAPS_ProductGuide-v3.pdf
for a description of the cloud integer values, to choose which ones to drop. For
now I'll drop anything greater than 1 (0 = no QA concerns and 1 = dark dense
vegetation (DDV)).


First we make the query using sat-search to find every file in the STAC catalog
that we want. We'll store that list of files. We'll do this first for a single
tile (in this first example just covering Washington State) but then we'll loop
through in 1-degree by 1-degree tiles.


In [5]:
def shutdown_cluster(kind_of_cluster):
    """Shut down the Dask resources that match the kind of cluster in use.

    Parameters
    ----------
    kind_of_cluster : str
        "local" shuts down the notebook-level `client`; "remote" shuts down
        the notebook-level `cluster`. Any other value does nothing.

    Returns
    -------
    None
    """
    # The globals are looked up only inside the branch that needs them, so an
    # undefined `cluster` does not matter when running in local mode.
    if kind_of_cluster == "remote":
        cluster.shutdown()
        return
    if kind_of_cluster == "local":
        client.shutdown()

In [6]:
# shutdown_cluster(kind_of_cluster)

Due to memory constraints we'll average repeated captures of the same scene.
Then we'll average all of those averaged scenes together to create the full
mesh. As of now we're just doing a straight average but ideally we would carry
the weights of the number of repeats of each scene and do a weighted average
when quilting the scenes together.


Then we take the list of files for a given year to average across growing season
for each of the tiles and write it out to a mapper with those specifications.


In [7]:
# AWS key pair returned by the project helper; both values are forwarded to
# the prep_training_dataset calls in later cells.
access_key_id, secret_access_key = access_credentials()

In [8]:
# NOTE(review): presumably verifies that `aws_session` can read the
# requester-pays data -- confirm in carbonplan_trace.v1.landsat_preprocess.
test_credentials(aws_session)

In [9]:
# Dask settings for this session, applied in a single call:
#   - array.slicing.split_large_chunks: enabled
#   - distributed.comm.timeouts.tcp / .connect: both set to 50 seconds
dask.config.set(
    {
        "array.slicing.split_large_chunks": True,
        "distributed.comm.timeouts.tcp": "50s",
        "distributed.comm.timeouts.connect": "50s",
    }
)

<dask.config.set at 0x7f45f0df2850>

In [10]:
# WRS2 (descending) shapefile: one row per Landsat scene footprint, with the
# PATH and ROW columns that later cells select.
wrs2_url = (
    "https://prd-wret.s3-us-west-2.amazonaws.com/assets/"
    "palladium/production/s3fs-public/atoms/files/"
    "WRS2_descending_0.zip"
)
gdf = gpd.read_file(wrs2_url)

# Project bucket; training outputs are written underneath it in later cells.
bucket = "s3://carbonplan-climatetrace/v1"

# Every biomass tile in the bucket defines one bounding box to process.
biomass_folder = f"{bucket}/biomass/"
biomass_files = fs.ls(biomass_folder)

# One (lat, lon) tag pair per tile path.
lat_lon_tags = list(map(utils.get_lat_lon_tags_from_tile_path, biomass_files))

# One bounding box per tag pair; a later cell unpacks each box as
# (min_lat, max_lat, min_lon, max_lon).
bounding_boxes = []
for lat_tag, lon_tag in lat_lon_tags:
    bounding_boxes.append(
        utils.parse_bounding_box_from_lat_lon_tags(lat_tag, lon_tag)
    )

In [None]:
# Manually (re)run a hand-picked list of scenes by calling
# prep_training_dataset directly from this kernel.
# Each entry is (WRS path, WRS row, year).
scenes_to_rerun = [
    (66, 45, 2003),
    (66, 45, 2004),
    (66, 45, 2005),
    (66, 45, 2006),
    (66, 45, 2007),
    (66, 45, 2008),
    (66, 45, 2009),
    (6, 32, 2004),
]

# The values are passed straight from the list above. The previous version
# indexed `task_id` with the row itself, which is neither defined at this
# point in the notebook nor a valid index.
for path, row, year in scenes_to_rerun:
    print(path, row, year)
    prep_training_dataset(
        path=path,
        row=row,
        year=year,
        access_key_id=access_key_id,
        secret_access_key=secret_access_key,
        training_write_bucket=f"{bucket}/training",
    )

In [11]:
from carbonplan_trace.v1.glas_allometric_eq import REALM_GROUPINGS

realms = list(REALM_GROUPINGS.keys())

# List everything already written under training/<realm>/<year>, including
# the extra "no_data" folder that sits alongside the realms.
processed_scenes = []
for realm in [*realms, "no_data"]:
    print(realm)
    for year in np.arange(2003, 2010):
        year_prefix = f"{bucket}/training/{realm}/{year}"
        processed_scenes += fs.ls(year_prefix, recursive=True)

# Reduce each listed path to the 11 characters that precede its last 8.
# Assumes every path ends in "<year>/<PPPRRR>.parquet", so the result is the
# "<year>/<PPPRRR>" key that the task-building cell compares against.
processed_scenes = [listed_path[-19:-8] for listed_path in processed_scenes]

afrotropic
australia
nearctic
neotropic
palearctic
tropical_asia
no_data


In [12]:
len(processed_scenes)

66151

In [13]:
# 66151 is the length displayed by the previous cell; presumably this is
# re-evaluated after re-listing to see how many new outputs have appeared.
len(processed_scenes) - 66151

6

In [None]:
# Prefix template for the scene listings: filled with (year, path, row).
landsat_bucket = (
    "s3://usgs-landsat/collection02/level-2/standard/etm/{}/{:03d}/{:03d}/"
)

# `processed_scenes` is a long list; a set makes each membership test O(1)
# instead of a linear scan inside the triple loop below.
already_processed = set(processed_scenes)
# Scene/year keys already examined in this run. A scene that straddles tile
# borders is selected by several bounding boxes; without this it would be
# listed again and submitted more than once.
examined = set()

with rio.Env(aws_session):
    # `tasks[k]` is the future for the scene described by `task_id[k]`
    # (stored as [path, row, year]).
    tasks = []
    task_id = []
    for bounding_box in bounding_boxes:
        print(bounding_box)
        min_lat, max_lat, min_lon, max_lon = bounding_box
        # Scenes whose footprint falls in this tile's lon/lat window.
        scenes_in_tile = gdf.cx[min_lon:max_lon, min_lat:max_lat][
            ["PATH", "ROW"]
        ].values
        for year in np.arange(2003, 2010):
            for path, row in scenes_in_tile:
                output_name = f"{year}/{path:03d}{row:03d}"
                # Cheap local checks first, so the S3 listing below only runs
                # for scenes that might actually need processing.
                if output_name in already_processed or output_name in examined:
                    continue
                examined.add(output_name)
                scene_stores = fs.ls(landsat_bucket.format(year, path, row))
                if len(scene_stores) == 0:
                    # Nothing listed for this path/row/year.
                    continue
                tasks.append(
                    client.compute(
                        prep_training_dataset_delayed(
                            path=path,
                            row=row,
                            year=year,
                            access_key_id=access_key_id,
                            secret_access_key=secret_access_key,
                            training_write_bucket=f"{bucket}/training",
                            error="raise",
                        )
                    )
                )
                task_id.append([path, row, year])
    print(len(tasks))

(-10.0, 0.0, 0.0, 10.0)
(-10.0, 0.0, 10.0, 20.0)
(-10.0, 0.0, 20.0, 30.0)
(-10.0, 0.0, 30.0, 40.0)
(-10.0, 0.0, 40.0, 50.0)
(-10.0, 0.0, -40.0, -30.0)
(-10.0, 0.0, -50.0, -40.0)
(-10.0, 0.0, -60.0, -50.0)
(-10.0, 0.0, -70.0, -60.0)
(-10.0, 0.0, -80.0, -70.0)
(-10.0, 0.0, 90.0, 100.0)
(-10.0, 0.0, -90.0, -80.0)
(-10.0, 0.0, 100.0, 110.0)
(-10.0, 0.0, -100.0, -90.0)
(-10.0, 0.0, 110.0, 120.0)
(-10.0, 0.0, 120.0, 130.0)
(-10.0, 0.0, 130.0, 140.0)
(-10.0, 0.0, 140.0, 150.0)
(-10.0, 0.0, 150.0, 160.0)
(-10.0, 0.0, 160.0, 170.0)
(0.0, 10.0, 0.0, 10.0)
(0.0, 10.0, 10.0, 20.0)
(0.0, 10.0, -10.0, 0.0)
(0.0, 10.0, 20.0, 30.0)
(0.0, 10.0, -20.0, -10.0)
(0.0, 10.0, 30.0, 40.0)
(0.0, 10.0, 40.0, 50.0)
(0.0, 10.0, 50.0, 60.0)
(0.0, 10.0, -50.0, -40.0)
(0.0, 10.0, -60.0, -50.0)
(0.0, 10.0, 70.0, 80.0)
(0.0, 10.0, -70.0, -60.0)
(0.0, 10.0, 80.0, 90.0)
(0.0, 10.0, -80.0, -70.0)
(0.0, 10.0, 90.0, 100.0)
(0.0, 10.0, -90.0, -80.0)
(0.0, 10.0, 100.0, 110.0)
(0.0, 10.0, -100.0, -90.0)
(0.0, 10.0, 110.0, 120

In [None]:
len(tasks)

In [19]:
# results = dask.compute(tasks, retries=1)[0]
# results

In [None]:
# for i, task in enumerate(tasks):
#     try:
#         task.cancel()
#     except:
#         print(i)

In [39]:
# Indices (into `tasks` / `task_id`) of tasks that failed with a
# PermissionError; the rerun cell below iterates over this list.
access_denied = []
# Indices to leave out while triaging; fill in by hand as needed.
skip_indices = []

# Types of error seen so far:
#   KilledWorker   -> memory usage
#   AssertionError -> need to figure out what we want to do for inference,
#                     can throw away for now
#   .zmetadata
#   AccessDenied   -> should have been fixed

for i, task in enumerate(tasks):
    if task.status != "error" or i in skip_indices:
        continue
    print(i)
    try:
        # Calling result() on a failed future re-raises the task's exception.
        print(task.result())
    except Exception as e:
        # Print the exception class as well: str(e) alone can be an
        # uninformative tuple that does not say which kind of error occurred.
        print(f"{type(e).__name__}: {e}")
        if isinstance(e, PermissionError):
            access_denied.append(i)

0
('prep_training_dataset-e3d4349f-2337-46a0-a1a9-2086ff318fa2', <WorkerState 'tls://192.168.19.240:44985', name: dask-worker-691a3565fcf6464596e2fc7ce18c42bc-thgm9, memory: 0, processing: 1>)
1
('prep_training_dataset-0689d6f5-a10c-4255-9de1-e2c1524cf1bc', <WorkerState 'tls://192.168.21.120:42417', name: dask-worker-691a3565fcf6464596e2fc7ce18c42bc-xh5gm, memory: 0, processing: 1>)
2
('prep_training_dataset-ddf2d023-eabc-433c-aeec-212b9bdd6bf5', <WorkerState 'tls://192.168.21.93:34553', name: dask-worker-691a3565fcf6464596e2fc7ce18c42bc-lf2db, memory: 0, processing: 1>)
3
('prep_training_dataset-3d01e233-677f-444e-b730-3acd316b9710', <WorkerState 'tls://192.168.20.88:42657', name: dask-worker-691a3565fcf6464596e2fc7ce18c42bc-bn6b5, memory: 0, processing: 1>)
4
('prep_training_dataset-8ad4c023-2049-4b22-a5f0-b4d3373c16c5', <WorkerState 'tls://192.168.15.231:33181', name: dask-worker-691a3565fcf6464596e2fc7ce18c42bc-qfs2m, memory: 0, processing: 1>)
5
('prep_training_dataset-79d455b9-58

In [30]:
task_id[120]

[202, 46, 2004]

In [33]:
# Re-run, one at a time in this kernel, every task recorded in
# `access_denied`; `task_id` holds [path, row, year] for each task index.
for task_index in access_denied:
    print(task_index)
    scene_path, scene_row, scene_year = task_id[task_index]
    prep_training_dataset(
        path=scene_path,
        row=scene_row,
        year=scene_year,
        access_key_id=access_key_id,
        secret_access_key=secret_access_key,
        training_write_bucket=f"{bucket}/training",
    )

120




19.260353299802112 21.200790184663454 -11.879491657490442 -9.621437072364555
['20N_020W', '20N_010W', '30N_020W', '30N_010W']
length of data 3635
s3://carbonplan-climatetrace/v1/training/afrotropic/2004/202046.parquet
s3://carbonplan-climatetrace/v1/training/palearctic/2004/202046.parquet
121




19.257684378406406 21.20623254620289 -11.882483032838392 -9.615758823843576
['20N_020W', '20N_010W', '30N_020W', '30N_010W']
length of data 2000
s3://carbonplan-climatetrace/v1/training/afrotropic/2005/202046.parquet
s3://carbonplan-climatetrace/v1/training/palearctic/2005/202046.parquet
122




19.269921290073956 21.188407970714227 -10.313967859982627 -8.097336087429051
['20N_020W', '20N_010W', '30N_020W', '30N_010W']
length of data 1543
s3://carbonplan-climatetrace/v1/training/afrotropic/2006/201046.parquet
s3://carbonplan-climatetrace/v1/training/palearctic/2006/201046.parquet
123




19.257663948753287 21.20886716276355 -11.888308578973694 -9.635800161422235
['20N_020W', '20N_010W', '30N_020W', '30N_010W']


KeyboardInterrupt: 

In [None]:
print('done')

In [21]:
# import random

# for _ in range(100):
#     check = random.choice(processed_scenes)
#     print(check)
#     if 'no_data' in check:
#         continue
#     df = pd.read_parquet(f's3://{check}')
#     for v in ['SR_B1', 'SR_B2', 'SR_B3', 'SR_B4', 'SR_B5', 'SR_B7']:
#         print(v, df[v].unique())

### In case you want to aggregate the parquet files the following snippets could be useful. They're written currently to aggregate back to make a file for a 10x10 tile.


In [22]:
# combine_parquet_files_full_tile(
#     ul_lat,
#     ul_lon,
#     write=True,
#     access_key_id=access_key_id,
#     secret_access_key=secret_access_key,
# )