# Processing Example for AWS illustrating a simple Algorithm to Test Infrastructure

This is based on using an environment similar to the one that is created from the ASDI CMIP6 example: https://github.com/awslabs/amazon-asdi/tree/main/examples/cmip6 (this needs to run in us-east-1, I think, as the CMIP6 references failed from London)

If you want to run this, you may need to install some additional libraries using ```conda install``` from a terminal.

In [None]:
%matplotlib inline
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import intake
import boto3
import botocore
import datetime
import s3fs
import fsspec
import dask
#import sys
import lz4
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from dask.distributed import performance_report, Client, progress, LocalCluster

# Default font applied to all matplotlib output in this notebook.
font = dict(family='sans-serif', weight='normal', size=18)
matplotlib.rc('font', **font)

In [None]:
import json
import sys
import boto3
import os
from boto3.dynamodb.conditions import Key, Attr

In [None]:
# Regridding settings: interpolation method and the target 0.1-step axes.
# np.arange excludes the stop value, so 360.0 and 90.0 are not grid points.
regrid_method = 'slinear'
regrid_lat = np.arange(-90.0, 90.0, 0.1)
regrid_lon = np.arange(0.0, 360.0, 0.1)

# Connect to Dask cluster scheduler


In [None]:
import dask.array as da
from dask_worker_pools import pool, propagate_pools, visualize_pools


In [None]:
from dask.distributed import Client
import lz4
# Client.get_versions('self', check=True)
# Connect to the Dask scheduler at 'Dask-Scheduler.local-dask' on port 8786.
client = Client('Dask-Scheduler.local-dask:8786')
# Alternative kept for reference: the same connection with explicit
# serializer/deserializer lists.
# client = Client('Dask-Scheduler.local-dask:8786',serializers=['dask', 'pickle'],
#                deserializers=['dask', 'pickle']
#               )

In [None]:
# Show the 'workers' entry of the scheduler info (the workers known to the scheduler).
client.scheduler_info()['workers']

In [None]:
%store -r activity_id
%store -r variable_id
%store -r table_id
variable_ids = variable_id # tas is air temperature at 2m above surface
table_id = table_id # Monthly data from Atmosphere - would really like this to be daily, but run out of memeory in client ('day' is the id)
grid = 'gn' #

# Records for Institution, experiment, and source_id are stored in https://github.com/WCRP-CMIP/CMIP6_CVs
experiment_id = 'ssp245' #['ssp126', 'ssp245', 'ssp370', 'ssp585'] 
activity_ids = activity_id # Search Scenarios & CMIP activities only
institution_id = 'MOHC' #just looking at our data in this example

print(activity_id)
print(variable_id)
print(table_id)

In [None]:
# Read the AWS region from a default boto3 session; it is passed to
# AWSV4SignerAuth later in the notebook.
session = boto3.session.Session()
my_region = session.region_name

# Get the historic record

### Search ERA metadata from catalog

#### Now we use the store from the ux testing file to actually get the desired attribute

In [None]:
# Restore `host` (used below as the OpenSearch host) from the %store.
%store -r host
# Build the request signer from the default boto3 session credentials and my_region.
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, my_region)
# Index that query_nc searches; also the leading component of the search keys built later.
index_name = 'era5-pds'


In [None]:
# OpenSearch client talking to `host` on port 443 over verified SSL,
# authenticating with the `auth` signer.
opensearch_client = OpenSearch(
    hosts=[dict(host=host, port=443)],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [None]:


def query_nc(q, index_name):
    """Look up the catalogue entry whose ``fileName`` best matches *q*.

    Runs a ``multi_match`` query restricted to the ``fileName`` field against
    *index_name* through the module-level ``opensearch_client`` and requests
    a single hit.

    Args:
        q: Text to match against the ``fileName`` field.
        index_name: Name of the index to search.

    Returns:
        A ``(fileName, region, dask_pool)`` tuple read from the ``_source``
        of the returned hit.

    Raises:
        IndexError: If the search returns no hits.
        KeyError: If the hit's ``_source`` lacks one of the expected fields.
    """
    search_body = {
        'size': 1,  # only the top hit is used
        'query': {
            'multi_match': {
                'query': q,
                'fields': ['fileName'],
            }
        },
    }
    response = opensearch_client.search(body=search_body, index=index_name)

    hits = response['hits']['hits']
    if not hits:
        # Same exception type as the bare [0] lookup would raise, but with a
        # message that says which query found nothing.
        raise IndexError(
            f'no document in index {index_name!r} matches fileName query {q!r}'
        )

    # With 'size': 1 there is exactly one hit, so read its fields directly.
    source = hits[0]['_source']
    return source['fileName'], source['region'], source['dask_pool']


In [None]:
import urllib

%store -r start_date end_date

start_year = int(start_date.year) #really want this to be 2010
end_year = int(end_date.year)

lustre_mount_point = "/fsx"
years = list(np.arange(start_year, end_year+1, 1))

# todo: update how we handle this
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] 


In [None]:
# Restore the attribute name saved with %store and display it.
%store -r desired_attribute
attr1 = desired_attribute  # used as the .nc file stem when the search keys are built
attr1

In [None]:
# Restore start_date from the %store again and display its month.
%store -r start_date
start_date.month

In [None]:
from dateutil.relativedelta import relativedelta

# Build one search key per month from start_date up to and including end_date,
# in the form '<index_name>/<year>/<MM>/data/<attr1>.nc'.
n_list = []
current_date = start_date
while current_date <= end_date:
    current_month = f'{current_date.month:02d}'  # zero-pad to two digits
    item = f'{index_name}/{current_date.year}/{current_month}/data/{attr1}.nc'
    n_list.append(item)
    current_date = current_date + relativedelta(months=1)

In [None]:
# Resolve every search key to its catalogue entry: (fileName, region, dask_pool).
lookups = [query_nc(search_key, index_name) for search_key in n_list]

nc_list = [file_name for file_name, _, _ in lookups]

# Collapse the per-file values to a single one each. If the files disagree,
# the value kept is whichever the set happens to yield first.
region = list(set([file_region for _, file_region, _ in lookups]))[0]
region_dask_pool = list(set([pool_name for _, _, pool_name in lookups]))[0]
print(nc_list)

In [None]:
# Name passed to pool(...) when the historical data is opened; saved with
# %store so it can be restored elsewhere.
# NOTE(review): hard-coded rather than taken from region / region_dask_pool
# looked up from the catalogue above — confirm this is intended.
historical_pool_region = 'us-east-1'
%store historical_pool_region

In [None]:
%%time
with pool(historical_pool_region):
    historical_data = xr.open_mfdataset(nc_list, engine='h5netcdf', concat_dim='time0', combine='nested', coords='minimal', compat='override', parallel=True, chunks={'lon':200,'lat':200,'time0':720})
    %store historical_data