# Example collocating a .csv dataset (krill) with Pangeo and AWS datasets

* Import libraries
* set krill filename
* define function to parse time in csv file
* define function to get cloud data

In [None]:
import warnings
# filter some warning messages
warnings.filterwarnings("ignore") 

import xarray as xr
import fsspec
from matplotlib import pyplot as plt
import numpy as np
import intake
import dask
import pandas as pd

# Notebook display configuration.
xr.set_options(display_style="html")  # show xarray objects with the HTML repr
%matplotlib inline
# default matplotlib figure size (width, height)
plt.rcParams['figure.figsize'] = 12, 6
%config InlineBackend.figure_format = 'retina' 

In [74]:

# NOTE(review): the next two calls repeat settings already applied in the first code cell.
warnings.simplefilter('ignore')  # suppress warning messages
xr.set_options(display_style="html")  # show xarray objects with the HTML repr

# Path of the krill CSV that is read with pd.read_csv further down.
# NOTE(review): absolute path on a local F: drive -- adjust before running on another machine.
krill_file = 'F:/data/NASA_biophysical/collocated_data/CorrectMasterKrill.csv'

def parse_time(tem, century=2000):
    """Convert a krill-file timestamp such as '5/10/02 2:47' to 'YYYY-MM-DDTHH:MM'.

    Month, day and hour may be written with one or two digits; the result is
    always zero-padded so it can be handed straight to ``np.datetime64``.

    Parameters
    ----------
    tem : str
        Timestamp laid out as 'month/day/year hour:minute'. Any fields after
        the minutes (e.g. seconds) are ignored.
    century : int, optional
        Added to years written with one or two digits. The default of 2000
        turns '02' into 2002. Years written with more digits are used as-is.

    Returns
    -------
    str
        The timestamp formatted as 'YYYY-MM-DDTHH:MM'.

    Raises
    ------
    ValueError
        If the string does not consist of a date part and a time part, or a
        field is missing or not an integer.
    """
    pieces = tem.split()
    if len(pieces) != 2:
        raise ValueError("expected 'M/D/YY H:MM', got {!r}".format(tem))

    date_fields = pieces[0].split('/')
    clock_fields = pieces[1].split(':')
    if len(date_fields) != 3 or len(clock_fields) < 2:
        raise ValueError("expected 'M/D/YY H:MM', got {!r}".format(tem))

    # int() raises ValueError on non-numeric fields, which is the error we want
    month, day, year = (int(field) for field in date_fields)
    if len(date_fields[2]) <= 2:
        # short year: place it in the requested century
        year += century
    hour, minute = int(clock_fields[0]), int(clock_fields[1])

    return '{:04d}-{:02d}-{:02d}T{:02d}:{:02d}'.format(year, month, day, hour, minute)

#define function to get all the data at once, use same years for climatology for all data
def _daily_climatology(ds, start, end):
    """Return the day-of-year mean of ``ds`` over the time slice ``start``..``end``."""
    subset = ds.sel(time=slice(start, end))
    return subset.groupby('time.dayofyear').mean('time', keep_attrs=True, skipna=False)


def get_data():
    """Open the gridded datasets and build their day-of-year climatologies.

    Two datasets are opened:

    * the sea-surface-height entry of the Pangeo intake catalog, with
      latitude/longitude renamed to ``lat``/``lon`` and longitudes wrapped
      into the -180..180 range,
    * the zarr store at ``s3://mur-sst/zarr``, opened anonymously.

    Every retained data variable gets a ``var_name`` attribute
    (``'aviso_<name>'`` or ``'mur_sst'``). Both climatologies are computed
    over the same period, 1993-01-01 to 2018-12-31.

    Returns
    -------
    data_dict : dict
        ``{'aviso': ds_aviso, 'sst': ds_sst}``
    clim_dict : dict
        ``{'aviso_clim': ds_aviso_clim, 'sst_clim': ds_sst_clim}``
    """
    # climatology years, shared by all datasets
    cyr1, cyr2 = '1993-01-01', '2018-12-31'

    # AVISO sea surface height from the Pangeo catalog
    cat_pangeo = intake.open_catalog("https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/master.yaml")
    ds = cat_pangeo.ocean.sea_surface_height.to_dask()
    ds = ds.rename({'latitude': 'lat', 'longitude': 'lon'})
    # wrap longitudes into -180..180, then sort by the new longitude values
    ds.coords['lon'] = (ds.coords['lon'] + 180) % 360 - 180
    # drop_vars replaces the deprecated Dataset.drop for removing variables
    ds_aviso = ds.sortby(ds.lon).drop_vars(['lat_bnds', 'lon_bnds', 'crs', 'err'])
    for var in ds_aviso:
        # the name under which this variable is stored in the collocated table
        ds_aviso[var].attrs['var_name'] = 'aviso_' + str(var)
    ds_aviso_clim = _daily_climatology(ds_aviso, cyr1, cyr2)

    # sst
    file_location = 's3://mur-sst/zarr'
    ds_sst = xr.open_zarr(fsspec.get_mapper(file_location, anon=True), consolidated=True)
    ds_sst = ds_sst.drop_vars(['analysis_error', 'mask', 'sea_ice_fraction'])
    ds_sst['analysed_sst'].attrs['var_name'] = 'mur_sst'
    ds_sst_clim = _daily_climatology(ds_sst, cyr1, cyr2)

    # put data into dictionaries
    data_dict = {'aviso': ds_aviso,
                 'sst': ds_sst}
    clim_dict = {'aviso_clim': ds_aviso_clim,
                 'sst_clim': ds_sst_clim}

    return data_dict, clim_dict

# Read in krill dataset
- use pandas read_csv, this has a lot of built in options that are worth exploring
- the NET_IN_TIME has the time, but for reading via a computer, it is a bit of a difficult format because the month and day of month can be either one digit or two digits, so here we parse the data by '/' and ':'
- next, the lat and lon are stored multiplied by 100 (probably to preserve decimal precision in the csv file), so we divide by 100 to un-scale them

In [76]:
# load the krill csv into a pandas dataframe
ds_krill = pd.read_csv(krill_file)

# convert every NET_IN_TIME string to np.datetime64 and store it as a 'time' column
tem_time = np.array(
    [np.datetime64(parse_time(stamp)) for stamp in ds_krill.NET_IN_TIME],
    dtype='datetime64[ns]',
)
ds_krill['time'] = tem_time

# undo the factor-of-100 scaling of the net-in position
for coord in ('NET_IN_LAT', 'NET_IN_LON'):
    ds_krill[coord] = ds_krill[coord] / 100.

# display the table
ds_krill

Unnamed: 0,CRUISE,HAUL_NO,NET_IN_TIME,NET_IN_LAT,NET_IN_LON,BOTTOM_DEPTH,JRES_STRATA,STATION,T..SPINIFERA,E..PACIFICA,...,Length,Width,Ocean,Canyon_ID,Distance_to_Canyon,LTYPE,INCREM,DEPTH,Distance_to_200_Isobath,time
0,205,2,5/10/02 2:47,36.463799,121.528496,172,C,114,1965,0.0,...,205.637501,59.540925,North Pacific Ocean,C9435,0.000000,isobath,2000,-2000,17917.300130,2002-05-10 02:47:00
1,205,3,5/10/02 3:56,36.423999,121.544102,86,C,115,4,0.0,...,205.637501,59.540925,North Pacific Ocean,C9435,5595.065102,isobath,2000,-2000,13899.304480,2002-05-10 03:56:00
2,205,5,5/10/02 21:23,36.412600,121.578203,106,C,112,0,0.0,...,205.637501,59.540925,North Pacific Ocean,C9435,1575.927777,isobath,2000,-2000,8257.446358,2002-05-10 21:23:00
3,205,7,5/11/02 0:24,36.427500,121.544404,86,C,115,4,0.0,...,205.637501,59.540925,North Pacific Ocean,C9435,5199.343990,isobath,2000,-2000,13913.299500,2002-05-11 00:24:00
4,205,8,5/11/02 1:47,36.445400,121.585703,303,C,116,237,0.0,...,205.637501,59.540925,North Pacific Ocean,C9435,0.000000,isobath,2000,-2000,9145.574006,2002-05-11 01:47:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1575,1505,164,6/13/15 4:27,37.167600,123.083799,868,C,135,0,39.0,...,164.004633,29.545458,North Pacific Ocean,C7472,4273.853688,isobath,2000,-2000,8668.120247,2015-06-13 04:27:00
1576,1505,165,6/13/15 21:39,37.535801,123.307900,1518,C,162,0,132.0,...,45.164910,14.393829,North Pacific Ocean,C7535,57.179111,isobath,2000,-2000,1145.054797,2015-06-13 21:39:00
1577,1505,166,6/13/15 23:24,37.531101,123.190098,85,C,160,62100,3726.0,...,126.730235,39.861287,North Pacific Ocean,C7519,10968.390420,isobath,2000,-2000,17378.109900,2015-06-13 23:24:00
1578,1505,167,6/14/15 1:59,37.448701,123.085400,70,C,156,4479,55031.0,...,126.730235,39.861287,North Pacific Ocean,C7519,4975.714651,isobath,2000,-2000,15995.253420,2015-06-14 01:59:00


### Start a cluster, a group of computers that will work together.

(A cluster is the key to big data analysis on the Cloud.)

- This will set up a [dask kubernetes](https://docs.dask.org/en/latest/setup/kubernetes.html) cluster for your analysis and give you a path that you can paste into the top of the Dask dashboard to visualize parts of your cluster.  
- You don't need to paste the link below into the Dask dashboard for this to work, but it will help you visualize progress.
- Try 20 workers to start (during the tutorial) but you can increase to speed things up later

In [None]:
from dask_gateway import Gateway
from dask.distributed import Client

In [None]:
# Connect to the Dask Gateway and request a new cluster.
gateway = Gateway()
cluster = gateway.new_cluster()
# adaptive scaling between 1 and 20
cluster.adapt(minimum=1, maximum=20)
# attach a distributed client to the new cluster
client = Client(cluster)
# last expression of the cell: display the cluster object
cluster

**☝️ Don’t forget to click the link above or copy it to the Dask dashboard on the left to view the scheduler dashboard!**

In [None]:
# get_data() returns (data_dict, clim_dict); bind them to the names the collocation cells read
data, clim = get_data()


In [None]:
#%%time
##create the data
#sla_monthly = ds_aviso['sla'].resample(time='1MS').mean()

In [None]:
#sst_climatology = sst_timeseries.groupby('time.dayofyear').mean()
#sst_anomaly = sst_timeseries.groupby('time.dayofyear')-sst_climatology
#sst_anomaly_monthly = sst_anomaly.resample(time='1MS').mean()


In [None]:
# Collocate each gridded dataset in `data` with the krill samples: for every row,
# load a +/-24 h, +/-0.5 degree box around the sample and interpolate to its time/position.
# NOTE(review): the displayed krill table shows NET_IN_LAT / NET_IN_LON (with positive
# longitudes); `Lat` / `Lon` columns are not visible in the truncated output -- confirm they
# exist and that their longitude sign matches the -180..180 grid used for the AVISO data.
for name in data:
    if name == 'topo':
        # static bathymetry has its own nearest-neighbour lookup below
        continue
    ds_data = data[name]
    print('name', name)

    # one NaN-filled output column per gridded variable, named by its 'var_name' attribute
    var_names = {}
    for var in ds_data:
        var_tem = ds_data[var].attrs['var_name']
        var_names[var] = var_tem
        ds_krill[var_tem] = np.nan  # np.NaN was removed in NumPy 2.0
        ds_krill[var_tem].attrs = ds_data[var].attrs
        print('var', var_tem)

    # time coverage of the gridded data, computed once instead of once per row
    t_first = pd.Timestamp(ds_data.time.min().values)
    t_last = pd.Timestamp(ds_data.time.max().values)

    for i in range(len(ds_krill)):
        t_obs = ds_krill.time[i]
        # rows outside the dataset's time coverage keep their NaN values
        if t_obs < t_first or t_obs > t_last:
            continue
        lat_obs, lon_obs = ds_krill.Lat[i], ds_krill.Lon[i]
        t1, t2 = t_obs - np.timedelta64(24, 'h'), t_obs + np.timedelta64(24, 'h')
        # load only a small box around the sample before interpolating
        tem = ds_data.sel(time=slice(t1, t2),
                          lat=slice(lat_obs - .5, lat_obs + .5),
                          lon=slice(lon_obs - .5, lon_obs + .5)).load()
        tem = tem.interp(time=t_obs, lat=lat_obs, lon=lon_obs)
        for var, var_tem in var_names.items():
            # .loc writes into the frame itself; chained ds_krill[col][i] = ... may hit a copy
            ds_krill.loc[i, var_tem] = float(tem[var].values)
        if i % 100 == 0:
            print(i, len(ds_krill))

# add topo info
# interp would create a new 2D array; to avoid that, put the lat/lon into DataArrays
# that share one dimension
if 'topo' in data:
    ds_topo = data['topo']
    new_lat = xr.DataArray(ds_krill.Lat.values, dims='new_dim')
    new_lon = xr.DataArray(ds_krill.Lon.values, dims='new_dim')
    ds_krill['ETOPO_depth'] = ds_topo.z.interp(lat=new_lat, lon=new_lon, method='nearest').values
else:
    print("no 'topo' dataset in data; skipping ETOPO_depth")

# output data
# NOTE(review): filename_bird_out / filename_bird_out_netcdf are not defined in the visible
# notebook -- define the output paths before running this cell.
ds_krill.to_csv(filename_bird_out)
# keep ds_krill a DataFrame; the xarray copy gets its own name
DS_krill = xr.Dataset.from_dataframe(ds_krill)
DS_krill.to_netcdf(filename_bird_out_netcdf)

In [None]:
# Collocate each day-of-year climatology in `clim` with the samples: for every row,
# load a +/-0.5 degree box for the sample's day of year and interpolate to its position.
# NOTE(review): `ds_bird` and the filename_bird_out*_final names are not defined in the
# visible notebook; presumably the krill DataFrame and krill output paths are intended --
# confirm before running.
for name in clim:
    ds_data = clim[name]
    print('name', name)

    # one NaN-filled output column per climatology variable, suffixed with '_clim'
    var_names = {}
    for var in ds_data:
        var_tem = ds_data[var].attrs['var_name'] + '_clim'
        var_names[var] = var_tem
        ds_bird[var_tem] = np.nan  # np.NaN was removed in NumPy 2.0
        ds_bird[var_tem].attrs = ds_data[var].attrs
        print('var', var_tem)

    for i in range(len(ds_bird)):
        lat_obs, lon_obs = ds_bird.Lat[i], ds_bird.Lon[i]
        # load only a small box around the sample before interpolating
        tem = ds_data.sel(dayofyear=ds_bird.time[i].dayofyear,
                          lat=slice(lat_obs - .5, lat_obs + .5),
                          lon=slice(lon_obs - .5, lon_obs + .5)).load()
        tem = tem.interp(lat=lat_obs, lon=lon_obs)
        for var, var_tem in var_names.items():
            # .loc writes into the frame itself; chained ds_bird[col][i] = ... may hit a copy
            ds_bird.loc[i, var_tem] = float(tem[var].values)

# output data
ds_bird.to_csv(filename_bird_out_final)
DS_bird = xr.Dataset.from_dataframe(ds_bird)
DS_bird.to_netcdf(filename_bird_out_netcdf_final)



In [None]:
# Finished: close the client first, then the cluster it was attached to.
client.close()
cluster.close()