# Inference pipeline

Created by: Oriana Chegwidden


In [None]:
%load_ext autoreload
%autoreload 2

from pyproj import CRS
import boto3
from rasterio.session import AWSSession
from s3fs import S3FileSystem
aws_session = AWSSession(boto3.Session(),#profile_name='default'), 
                         requester_pays=True)
fs = S3FileSystem(requester_pays=True) #profile='default', 
import xgboost as xgb

from osgeo.gdal import VSICurlClearCache
import rasterio as rio
import numpy as np
import xarray as xr
import dask
import os
import fsspec
import geopandas as gpd
import rioxarray # for the extension to load
import matplotlib.pyplot as plt
import utm
import pandas as pd
from datetime import datetime
import time
import json
import zarr
import awswrangler as wr
from dask_gateway import Gateway
from carbonplan_trace.v1.landsat_preprocess import access_credentials, test_credentials
from carbonplan_trace.v1.inference import predict, predict_delayed
from carbonplan_trace.v1 import utils


In [None]:
import pyproj

pyproj.__version__

In [None]:
from carbonplan_trace import version

print(version)

In [None]:
# Apply every Dask setting used by this notebook in a single call.
dask.config.set(
    {
        "array.slicing.split_large_chunks": True,
        # tcp is a transmission control protocol; both comm timeouts are set to 50s
        "distributed.comm.timeouts.tcp": "50s",
        "distributed.comm.timeouts.connect": "50s",
    }
)
# dask.config.set({"distributed.worker.resources.WORKERTOKEN": 1})

In [None]:
# Choose where the work runs: "local" starts a Dask cluster on this machine,
# anything else requests a cluster from Dask Gateway. Either way `client` is
# bound afterwards, because the submission cell calls `client.compute`.
kind_of_cluster = "local"
# kind_of_cluster = "remote"
if kind_of_cluster == "local":
    # spin up local cluster. must be on big enough machine
    from dask.distributed import Client

    # when very very huge use 8,8
    # Each worker advertises one "workertoken" resource; tasks are submitted
    # with resources={"workertoken": 1}.
    client = Client(n_workers=8, threads_per_worker=8, resources={"workertoken": 1})
else:
    gateway = Gateway()
    options = gateway.cluster_options()
    options.environment = {
        "AWS_REQUEST_PAYER": "requester",
        "AWS_REGION_NAME": "us-west-2",
        "DASK_DISTRIBUTED__WORKER__RESOURCES__WORKERTOKEN": "1",
    }
    options.worker_cores = 8
    options.worker_memory = 100

    options.image = "carbonplan/trace-python-notebook:latest"
    cluster = gateway.new_cluster(cluster_options=options)
    cluster.adapt(minimum=1, maximum=10)
    #     cluster.scale(100)
    # Previously `client` was never created on this branch, so every later
    # `client.*` call raised NameError when running remotely.
    client = cluster.get_client()

In [None]:
# client = cluster.get_client()
client

# check this link first
# possible scenarios:
# 1) everything is succeeding and the cluster is still running -> no need to do anything
# 2) most things are failing but the cluster is still running -> restart, increase memory, decrease the number of workers, then run all
# 3) 404 error -> the cluster died -> restart and run all

In [None]:
def shutdown_cluster(kind_of_cluster):
    """Shut down the notebook's Dask resources for the given cluster kind.

    Parameters
    ----------
    kind_of_cluster : str
        "local" shuts down the module-level `client`; "remote" shuts down the
        module-level `cluster`. Any other value does nothing.
    """
    if kind_of_cluster == "remote":
        cluster.shutdown()
        return
    if kind_of_cluster == "local":
        client.shutdown()

In [None]:
# NOTE(review): this shuts down the local `client`. The submission cell further
# down calls `client.compute`, so run this only when finished (or re-create the
# client before submitting work).
shutdown_cluster("local")

In [None]:
access_key_id, secret_access_key = access_credentials()

In [None]:
test_credentials(aws_session)

Then, for each tile, we take the list of files for a given year, average them across the
growing season, and write the result out to a mapper with those specifications.


In [None]:
# Upper-left corner tags of the tiles to process: three latitude bands
# crossed with longitudes 110E..150E in 10-degree steps.
ul_lats = ["10S", "20S", "30S"]
ul_lons = [f"{lon}E" for lon in np.arange(110, 151, 10)]
# Latitude-major ordering: every longitude for "10S", then "20S", then "30S".
lat_lon_tags = [(lat_tag, lon_tag) for lat_tag in ul_lats for lon_tag in ul_lons]

In [None]:
# Scene footprint archive (named WRS2_descending_0.zip); `gdf` is later sliced
# by lon/lat and read for its PATH and ROW columns.
wrs2_archive_url = (
    "https://prd-wret.s3-us-west-2.amazonaws.com/assets/"
    "palladium/production/s3fs-public/atoms/files/"
    "WRS2_descending_0.zip"
)
gdf = gpd.read_file(wrs2_archive_url)

# Root of the bucket that models are read from and inference output is written to.
bucket = "s3://carbonplan-climatetrace/v2.1"

# biomass_folder = "s3://carbonplan-climatetrace/intermediate/ecoregions_mask/"
# biomass_files = fs.ls(biomass_folder) # just to get list of lat_lon tiles we want
# lat_lon_tags = [utils.get_lat_lon_tags_from_tile_path(fp) for fp in biomass_files]
# lat_lon_tags = [('60N', '130W')]#, ('40N', '130W')]#, ('00N', '060W')] #('50N', '130W'),

# One bounding box per (lat, lon) tile tag, in the same order as `lat_lon_tags`.
bounding_boxes = [
    utils.parse_bounding_box_from_lat_lon_tags(lat_tag, lon_tag)
    for lat_tag, lon_tag in lat_lon_tags
]

In [None]:
from carbonplan_trace.v1.glas_allometric_eq import REALM_GROUPINGS

# List the existing xg inference outputs for 2011-2021 and reduce each listed
# path to the slice [-19:-8]. For a path ending in "<yyyy>/<pppRRR>.parquet"
# that slice is "<yyyy>/<pppRRR>", the same form as the submission loop's
# `output_name`.
processed_scenes = [
    listed_path[-19:-8]
    for year in np.arange(2011, 2022)
    for listed_path in fs.ls(f"{bucket}/inference/xg/{year}", recursive=True)
]
len(processed_scenes)

In [None]:
import carbonplan_trace

We'll loop through every scene and every year and calculate biomass for that scene. This will
produce a table of values `[x, y, lat, lon, biomass]`, where `x` and `y` are specific to the
scene's UTM projection.


In [None]:
# Gather the (PATH, ROW) pairs of every scene that falls inside any tile.
# The previous loop reassigned `valid_scenes` on each iteration, so only the
# scenes of the last bounding box were kept.
scenes_per_tile = []
for bounding_box in bounding_boxes:
    min_lat, max_lat, min_lon, max_lon = bounding_box
    scenes_per_tile.append(gdf.cx[min_lon:max_lon, min_lat:max_lat][["PATH", "ROW"]])

if scenes_per_tile:
    # The same scene can be selected by more than one bounding box, so
    # drop repeats while keeping first-seen order.
    valid_scenes = pd.concat(scenes_per_tile).drop_duplicates().values
else:
    # No tiles requested: an empty (0, 2) array keeps downstream loops valid.
    valid_scenes = gdf.iloc[:0][["PATH", "ROW"]].values

In [None]:
# Empty bookkeeping table: one row per scene ("<path>_<row>"), one column per
# version/model combination.
scene_labels = [f"{path}_{row}" for path, row in valid_scenes]
file_lengths = pd.DataFrame(index=scene_labels, columns=["v1-rf", "v2-rf", "v2-xg"])

In [None]:
# rerun_scenes = {'2010':[], '2014':[]}
# setups = [('v2', 'rf')]#, ('v2', 'xg')] #('v1', 'rf'),
# for year in ['2010', '2014']:
#     for (version, model) in setups:
#         for [path, row] in valid_scenes:
#             output_name = f"{year}/{path:03d}{row:03d}.parquet"
#             print(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}')
#             if len(fs.ls(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}')) == 0:
#                 if [path, row] not in rerun_scenes[year]:
#                     rerun_scenes[year].append([path, row])
#         i+=1
#             file_length = len(pd.read_parquet(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}'))
#         except FileNotFoundError:
#             file_length = np.nan

#         file_lengths.loc[f'{path}_{row}', f'{version}-{model}'] = file_length

In [None]:
# file_lengths.to_csv('files_to_repeat.csv')

In [None]:
# remove each entry in index

In [None]:
# Submit one `predict_delayed` task per (scene, year) that has Landsat input
# and has not been processed or queued yet.
landsat_bucket = "s3://usgs-landsat/collection02/level-2/standard/etm/{}/{:03d}/{:03d}/"

# Set copies make the "already done?" checks O(1) per scene.
finished_scenes = set(processed_scenes)
queued_scenes = set()

with rio.Env(aws_session):
    tasks = []
    # `task_id[i]` is the "<year>/<path><row>" key of `tasks[i]`. It was
    # previously initialised as `task_ids` but used as `task_id`, which raised
    # NameError on a fresh kernel. Later cells index `task_id`, so that name is kept.
    task_id = []
    for bounding_box in bounding_boxes:
        print(bounding_box)
        min_lat, max_lat, min_lon, max_lon = bounding_box
        scenes_in_tile = gdf.cx[min_lon:max_lon, min_lat:max_lat][["PATH", "ROW"]].values
        for year in np.arange(2011, 2022):
            for [path, row] in scenes_in_tile:
                output_name = f"{year}/{path:03d}{row:03d}"
                # Cheap in-memory checks first, so no S3 listing is issued for
                # scenes that will be skipped anyway.
                if output_name in finished_scenes or output_name in queued_scenes:
                    continue
                scene_stores = fs.ls(landsat_bucket.format(year, path, row))
                if not scene_stores:
                    # No Landsat input for this scene/year.
                    continue
                tasks.append(
                    #                         predict(
                    client.compute(
                        predict_delayed(
                            model_folder=f"{bucket}/models/",
                            path=path,
                            row=row,
                            year=year,
                            access_key_id=access_key_id,
                            secret_access_key=secret_access_key,
                            output_write_bucket=f"{bucket}/inference",
                        ),
                        resources={"workertoken": 1},
                    )
                )
                task_id.append(output_name)
                queued_scenes.add(output_name)

In [None]:
# NOTE(review): `rerun_scenes` is only assigned inside the commented-out cell
# above, so this raises NameError unless it was defined earlier in the kernel
# session.
len(rerun_scenes["2014"])

In [None]:
# `dask.compute` returns a tuple with one entry per positional argument; only
# `tasks` is passed, so unpack that single entry.
(results,) = dask.compute(tasks, retries=1, resources={"workertoken": 1})
results

In [None]:
results

In [None]:
# i = 0
# path = task_id[i][0]
# row = task_id[i][1]
# year = task_id[i][2]

# Hard-coded scene: calls `predict` directly instead of submitting
# `predict_delayed` through the client.
path, row, year = 48, 22, 2014
print(path, row, year)

single_scene_kwargs = dict(
    model_folder=f"{bucket}/models/",
    path=path,
    row=row,
    year=year,
    access_key_id=access_key_id,
    secret_access_key=secret_access_key,
    output_write_bucket=f"{bucket}/inference",
)
predict(**single_scene_kwargs)

In [None]:
fs.ls("s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet")

In [None]:
fs.ls("s3://carbonplan-climatetrace/v2/inference/xg/2014/054018.parquet")

In [None]:
fs.ls("s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet")

In [None]:
# i = 0
# path = task_id[i][0]
# row = task_id[i][1]
# year = task_id[i][2]

# Hard-coded scene: calls `predict` directly instead of submitting
# `predict_delayed` through the client.
path, row, year = 54, 18, 2010
print(path, row, year)

single_scene_kwargs = dict(
    model_folder=f"{bucket}/models/",
    path=path,
    row=row,
    year=year,
    access_key_id=access_key_id,
    secret_access_key=secret_access_key,
    output_write_bucket=f"{bucket}/inference",
)
predict(**single_scene_kwargs)

In [None]:
pd.read_parquet("s3://carbonplan-climatetrace/v2/inference/rf/2010/054018.parquet")

In [None]:
# Print the index and result of every task whose status is "error".
for i, task in enumerate(tasks):
    if task.status != "error":
        continue
    print(i)
    print(task.result())

In [None]:
def _task_id_to_record(entry):
    """Convert one `task_id` entry into a `[path, row, year]` record.

    The submission loop stores entries as "<year>/<path:03d><row:03d>" strings.
    Calling `list()` on such a string yields its individual characters, which
    does not fit the three-column table written below, so the string is parsed
    instead. Any non-string entry is assumed to already be a
    (path, row, year) sequence and is converted with `list()`.
    """
    if isinstance(entry, str):
        year_part, scene_part = entry.split("/")
        return [int(scene_part[:3]), int(scene_part[3:]), int(year_part)]
    return list(entry)


exclude_list = []
errors = []

# Record every task that ended in the "error" state and write them to CSV.
for i, task in enumerate(tasks):
    if task.status != "error":
        continue
    print(i)
    #         print(task.result())
    try:
        print(task.result())
    except Exception as e:
        print(e)
        exclude_list.append(_task_id_to_record(task_id[i]))

pd.DataFrame(exclude_list, columns=["path", "row", "year"]).to_csv("inference_failed_tasks.csv")

In [None]:
exclude_list

In [None]:
# Best-effort cancellation of every submitted task; a failure on one task is
# reported and does not stop the loop.
for i, task in enumerate(tasks):
    try:
        task.cancel()
    except Exception as err:
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so the loop can still be interrupted.
        print(i, repr(err))