# Inference pipeline
Created by: Oriana Chegwidden

In [None]:
%load_ext autoreload
%autoreload 2
import boto3
from rasterio.session import AWSSession
from s3fs import S3FileSystem
# rasterio AWS session built from the 'default' boto3 profile with
# requester-pays enabled; it is handed to `rio.Env(...)` later in the notebook.
aws_session = AWSSession(boto3.Session(profile_name='default'), 
                         requester_pays=True)
# s3fs filesystem on the same profile, also requester-pays; used later to
# build the zarr store mapper.
fs = S3FileSystem(profile='default', requester_pays=True)

from osgeo.gdal import VSICurlClearCache
# Clear GDAL's /vsicurl cache up front (presumably so state cached by an
# earlier session is not reused -- confirm).
VSICurlClearCache() 
import rasterio as rio
import xarray as xr
import dask
import os
import fsspec
import geopandas as gpd
import regionmask as rm
from matplotlib.pyplot import imshow
from intake import open_stac_item_collection
import numcodecs
import numpy as np
import rioxarray # for the extension to load
import matplotlib.pyplot as plt
import utm
import pandas as pd
from datetime import datetime
import json
import zarr
import awswrangler as wr
from dask.distributed import PipInstall
from dask_gateway import Gateway
import fsspec
import xgboost as xgb 
from carbonplan_trace.v1.landsat_preprocess import access_credentials
from carbonplan_trace.v1 import utils
from carbonplan_trace.v1.inference import predict

In [None]:
# Select where the Dask workers run: 'local' starts a dask.distributed Client
# with workers on this machine; any other value requests a cluster through
# Dask Gateway. Either way `client` is bound when the cell finishes.
cluster = 'local'
if cluster == 'local':
    # Spin up a local cluster. Must be on a big enough machine.
    from dask.distributed import Client
    client = Client(n_workers=3, threads_per_worker=1)
else:
    gateway = Gateway()
    options = gateway.cluster_options()
    # Export AWS_REQUEST_PAYER=requester into the worker environment.
    options.environment = {'AWS_REQUEST_PAYER': 'requester'}
    options.worker_cores = 2
    options.worker_memory = 48
    # NOTE: this rebinds `cluster` from the mode string to the cluster object.
    cluster = gateway.new_cluster(cluster_options=options)
    # Let the gateway scale the cluster between 1 and 10 workers.
    cluster.adapt(minimum=1, maximum=10)
    client = cluster.get_client()

# A notebook only renders a bare name when it is the last top-level
# expression of the cell; inside the if/else bodies it was silently ignored.
# Showing the client here gives the summary for both branches.
client

In [None]:
# Retrieve the access key id / secret pair from the project's
# `access_credentials` helper (presumably AWS credentials -- confirm against
# carbonplan_trace.v1.landsat_preprocess). Both are passed to `predict` below.
access_key_id, secret_access_key = access_credentials()

In [None]:
## Load your model

In [None]:
# S3 location of the model file given to `predict` as its first argument.
# NOTE(review): the file name suggests an XGBoost biomass model for the
# 50N/120W tile -- confirm.
sample_model_path = 's3://carbonplan-climatetrace/v1/models/xgb_biomass_50N_120W.bin'

Next, we take the list of files for a given year, average them across the growing season for each tile, and write the result out to a mapper with those specifications.

In [None]:
# Enable dask's `array.slicing.split_large_chunks` option so that slicing
# operations which would create very large chunks split them instead
# (see the dask configuration reference for the exact semantics).
dask.config.set({"array.slicing.split_large_chunks": True})

In [None]:
# Read the zipped WRS-2 descending shapefile straight from its public URL
# into a GeoDataFrame. Its PATH and ROW columns are used further down to pick
# the scenes to process.
gdf = gpd.read_file('https://prd-wret.s3-us-west-2.amazonaws.com/assets/'
                   'palladium/production/s3fs-public/atoms/files/'
                   'WRS2_descending_0.zip')

In [None]:
# Coordinate-based selection: keep the rows whose geometry intersects the box
# x in [-120, -110], y in [40, 50] (assumes the layer is in lon/lat degrees --
# confirm the CRS). Despite the singular name this can hold several footprints.
sample_tile = gdf.cx[-120:-110,40:50]

We'll loop through every scene and every year and calculate biomass for each scene. This will produce a table of values [x, y (both specific to the UTM projection), lat, lon, biomass].

In [None]:
bucket = 's3://carbonplan-climatetrace/v1/'
tasks = []
rerun = True
if rerun:
    # Run the predictions inside a rasterio environment that carries the
    # AWS session created at the top of the notebook.
    with rio.Env(aws_session):
        for year in np.arange(2003, 2004):
            # Only the first PATH/ROW pair of the selection is processed.
            for path, row in sample_tile[['PATH', 'ROW']].values[:1]:
                # NOTE(review): the result is bound to `ds` and is not
                # appended to `tasks`, so `tasks` stays empty after this loop.
                # Neither write bucket is supplied.
                ds = predict(
                    sample_model_path,
                    path,
                    row,
                    year,
                    access_key_id,
                    secret_access_key,
                    output_write_bucket=None,
                    input_write_bucket=None,
                )

In [None]:
# Compute whatever has been collected in `tasks`.
# NOTE(review): the loop in the previous cell never appends to `tasks`, so as
# the notebook stands this call receives an empty list.
dask.compute(tasks)

In [None]:
## Load a sample scene

In [None]:
# Open the JJA reflectance zarr store through the requester-pays S3 filesystem
# and load it fully into memory. This rebinds `ds`.
# NOTE(review): the key looks like <path>/<row>/<year> = 44/27/2004 -- confirm.
ds = xr.open_zarr(fs.get_mapper('carbonplan-climatetrace/v1/44/27/2004/JJA_reflectance.zarr')).load()