# This notebook reads the MUR SST from AWS PODAAC and collocates it with all Saildrone cruises


In [None]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import xarray as xr
import cartopy.crs as ccrs
from scipy import spatial
#sys.path.append('/home/jovyan/shared/users/cgentemann/notebooks/salinity/subroutines/')
#from read_routines import read_all_usv, read_one_usv, add_coll_vars
import warnings
warnings.simplefilter('ignore') # filter some warning messages
from glob import glob


# these libraries help reading cloud data
import fsspec 
import s3fs
import requests
import os

warnings.simplefilter("ignore")  # filter some warning messages
xr.set_options(display_style="html",keep_attrs=True)  # display dataset nicely


# Read in All Saildrone cruises downloaded from https://data.saildrone.com/data/sets
- 2017 onwards; note that earlier data lack some instruments and are of poorer quality in general
- For this code I want to develop a routine that reads in all the different datasets and creates a standardized set
- It may work best to first read each of the files individually into a dictionary 
- then go through each dataset finding all variable names
- I decided to put all SST into TEMP_CTD_MEAN and same for Salinity so there is a single variable name
- this still preserves all the dataset information

In [None]:
# Set credentials and attach to a Dask cluster through the shared `ebdpy` helper.
import os
import sys
# Extend the import path with $HOME/shared/users/lib before importing ebdpy
# (presumably where ebdpy lives -- confirm).
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd

# First call passes the profile only; it is repeated below with region/endpoint.
# NOTE(review): this first call looks redundant -- confirm before removing.
ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
worker_max = 30
# Reuse an existing cluster (use_existing_cluster=True) with adaptive scaling off;
# the call returns the Dask client and the cluster handle.
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max, 
                                      region=region, use_existing_cluster=True,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      environment='pangeo', worker_profile='Medium Worker', 
                                      propagate_env=True)

In [None]:
# Load every collocation file found in `dir_data_pattern` into `data_dict`,
# keyed by a sanitized version of the file name (no directory, no extension).
dir_data_pattern = '/home/jovyan/data/sss_collocations_orbital_norepeat/'
dir_out =         '/home/jovyan/data/sss_collocations_orbital_norepeat_mur/'
files = glob(dir_data_pattern+'*.nc')
# Start from an empty dict so `data_dict` is defined (and not stale from an
# earlier run) even when the glob matches nothing.
data_dict = {}
for ifile,file in enumerate(files):
    ds = xr.open_dataset(file)
    # NOTE(review): the dataset is closed right after opening and still used
    # below; this relies on xarray re-opening the file on access -- confirm.
    ds.close()
    # Index along 'time' instead of 'ob' when the file has an 'ob' dimension.
    if 'ob' in ds.dims:
        ds = ds.swap_dims({'ob':'time'})
    #remove any duplicates in time, keep only first value
    # (np.unique returns first-occurrence indices ordered by sorted time value)
    _, index = np.unique(ds['time'], return_index=True)
    ds=ds.isel(time=index)
    # Take the key from the file name itself rather than a hard-coded character
    # offset, so it stays correct if the input directory string changes.
    name = os.path.splitext(os.path.basename(file))[0]
    name = name.replace(" ", "_")
    name = name.replace("/", "_")
    data_dict[name]=ds
    print(name)

# Collocate MUR

In [None]:
# Authenticate with NASA Earthdata Login; `auth` holds whatever `login()` returns.
from earthdata import Auth 
auth = Auth().login()

In [None]:
# Request S3 credentials from the PO.DAAC endpoint. The JSON reply is kept in
# `response`; its accessKeyId / secretAccessKey / sessionToken entries are read
# when the remote filesystem options are built.
url = "https://archive.podaac.earthdata.nasa.gov/s3credentials"
cred_request = requests.get(url)
# Stop here with the HTTP status on a failed request, instead of a confusing
# JSON-decode or KeyError failure further down.
cred_request.raise_for_status()
response = cred_request.json()

In [None]:
%%time
# Open the MUR SST record through an fsspec "reference" filesystem that is
# described by a consolidated JSON file stored on S3.
json_consolidated = "s3://esip-qhub-public/nasa/mur/murv41_consolidated_20211011.json"

# options passed as ref_storage_args (access to the reference JSON)
s_opts = {"requester_pays": True, "skip_instance_cache": True}

# options passed as remote_options (access to the referenced S3 data),
# filled from the credentials held in `response`
r_opts = {
    "key": response["accessKeyId"],
    "secret": response["secretAccessKey"],
    "token": response["sessionToken"],
    "client_kwargs": {"region_name": "us-west-2"},
}

fs = fsspec.filesystem(
    "reference",
    fo=json_consolidated,
    ref_storage_args=s_opts,
    remote_protocol="s3",
    remote_options=r_opts,
    simple_templates=True,
)

# read the mapped store with the zarr engine; time decoding is switched off
ds_sst = xr.open_dataset(
    fs.get_mapper(""),
    decode_times=False,
    engine="zarr",
    consolidated=False,
)
ds_sst

# Collocate using .interp linear interpolation

In [None]:
# Collocate MUR SST onto every Saildrone track in `data_dict` by linear
# interpolation, working through each track in fixed-size chunks of
# observations, then write one NetCDF file per track to `dir_out`.
ds = ds_sst
chunk_size = 100  # number of track observations interpolated per MUR subset
for iname,name in enumerate(data_dict):
    #if iname>3:
    #    continue
    print(iname,name)
    ds_usv = data_dict[name]
    #create space for new data
    # one NaN-filled variable per MUR variable, shaped like BARO_PRES_MEAN
    for var in ds_sst:
        ds_usv[var]=ds_usv.BARO_PRES_MEAN.copy(deep=True)*np.nan
        ds_usv[var].attrs=ds_sst[var].attrs
    ilen = len(ds_usv.time)
    for inc in range(0,ilen,chunk_size):
        # Slice ends are exclusive, so the final chunk must be clipped to ilen.
        # Clipping to ilen-1 would leave the last observation un-collocated and
        # would produce an empty chunk when ilen % chunk_size == 1.
        i1,i2 = inc,min(inc+chunk_size,ilen)
        sub = ds_usv.isel(time=slice(i1,i2))
        # bounding box of this chunk, padded by one day in time and 0.15 in lon/lat
        t1,t2=sub.time.min().data-np.timedelta64(1,'D'),sub.time.max().data+np.timedelta64(1,'D')
        x1,x2=sub.lon.min().data-.15,sub.lon.max().data+.15
        y1,y2=sub.lat.min().data-.15,sub.lat.max().data+.15
        ds_sat = ds_sst.sel(time=slice(t1,t2),lat=slice(y1,y2),lon=slice(x1,x2))
        # cast the time coordinate to datetime64[ns] before calling .interp
        ds_sat['time']=np.asarray(ds_sat.time.data, "datetime64[ns]")
        # interpolate the MUR subset onto the track's time/lat/lon
        ds_interp = ds_sat.interp(time=sub.time,lat=sub.lat,lon=sub.lon,method='linear',assume_sorted=False)
        # turn lat/lon back into data variables so they are copied along with the rest
        ds_interp = ds_interp.reset_coords(names={'lat','lon'})
        #add saildrone data to interpolated sat data
        for var in ds_interp:
            ds_usv[var][i1:i2]=ds_interp[var]
    #output
    fout = dir_out+name+'_20211116.nc'
    ds_usv.to_netcdf(fout)
    print('output done, start new')

In [None]:
# Rebuild the output path of every track; the reopen/plot checks are disabled.
for iname, name in enumerate(data_dict):
    fout = f'{dir_out}{name}_20211116.nc'
    #ds_usv = xr.open_dataset(fout)
    #ds_usv.close()
    #ds_usv.analysed_sst.plot()
    #plt.show()
    #plt.clf()

# now gridded no repeat

In [1]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import xarray as xr
import cartopy.crs as ccrs
from scipy import spatial
#sys.path.append('/home/jovyan/shared/users/cgentemann/notebooks/salinity/subroutines/')
#from read_routines import read_all_usv, read_one_usv, add_coll_vars
import warnings
warnings.simplefilter('ignore') # filter some warning messages
from glob import glob


# these libraries help reading cloud data
import fsspec 
import s3fs
import requests
import os

warnings.simplefilter("ignore")  # filter some warning messages
xr.set_options(display_style="html",keep_attrs=True)  # display dataset nicely


<xarray.core.options.set_options at 0x7f90dd0532e0>

In [2]:
# Set credentials and attach to a Dask cluster through the shared `ebdpy` helper.
import os
import sys
# Extend the import path with $HOME/shared/users/lib before importing ebdpy
# (presumably where ebdpy lives -- confirm).
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd

# First call passes the profile only; it is repeated below with region/endpoint.
# NOTE(review): this first call looks redundant -- confirm before removing.
ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
worker_max = 30
# Reuse an existing cluster (use_existing_cluster=True) with adaptive scaling off;
# the call returns the Dask client and the cluster handle.
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max, 
                                      region=region, use_existing_cluster=True,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      environment='pangeo', worker_profile='Medium Worker', 
                                      propagate_env=True)

Existing Dask clusters:
Cluster Index c_idx: 0 / Name: dev.7cdc3beb067e4a9c9406230835ab0400 ClusterStatus.RUNNING
Using existing cluster [0].
Setting Fixed Scaling workers=30
Reconnect client to clear cache
client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub):
https://jupyter.qhub.esipfed.org/gateway/clusters/dev.7cdc3beb067e4a9c9406230835ab0400/status
Propagating environment variables to workers


In [3]:
# Authenticate with NASA Earthdata Login; in this run `login()` prompted for a
# username and password.
from earthdata import Auth 
auth = Auth().login()

Enter your Earthdata Login username:  cgentemann
Enter your Earthdata password:  ··········


You're now authenticated with NASA Earthdata Login


In [4]:
# Request S3 credentials from the PO.DAAC endpoint. The JSON reply is kept in
# `response`; its accessKeyId / secretAccessKey / sessionToken entries are read
# when the remote filesystem options are built.
url = "https://archive.podaac.earthdata.nasa.gov/s3credentials"
cred_request = requests.get(url)
# Stop here with the HTTP status on a failed request, instead of a confusing
# JSON-decode or KeyError failure further down.
cred_request.raise_for_status()
response = cred_request.json()

In [5]:
%%time
# Open the MUR SST record through an fsspec "reference" filesystem that is
# described by a consolidated JSON file stored on S3.
json_consolidated = "s3://esip-qhub-public/nasa/mur/murv41_consolidated_20211011.json"

# options passed as ref_storage_args (access to the reference JSON)
s_opts = {"requester_pays": True, "skip_instance_cache": True}

# options passed as remote_options (access to the referenced S3 data),
# filled from the credentials held in `response`
r_opts = {
    "key": response["accessKeyId"],
    "secret": response["secretAccessKey"],
    "token": response["sessionToken"],
    "client_kwargs": {"region_name": "us-west-2"},
}

fs = fsspec.filesystem(
    "reference",
    fo=json_consolidated,
    ref_storage_args=s_opts,
    remote_protocol="s3",
    remote_options=r_opts,
    simple_templates=True,
)

# read the mapped store with the zarr engine; time decoding is switched off
ds_sst = xr.open_dataset(
    fs.get_mapper(""),
    decode_times=False,
    engine="zarr",
    consolidated=False,
)
ds_sst

CPU times: user 45.3 s, sys: 4.78 s, total: 50 s
Wall time: 1min


In [6]:
# Load every collocation file found in `dir_data_pattern` into `data_dict`,
# keyed by a sanitized version of the file name (no directory, no extension).
dir_data_pattern = '/home/jovyan/data/sss_collocations_8day_nearest_norepeat/'
dir_out =         '/home/jovyan/data/sss_collocations_8day_nearest_norepeat_mur/'
files = glob(dir_data_pattern+'*.nc')
# Start from an empty dict so `data_dict` is defined (and not stale from an
# earlier run) even when the glob matches nothing.
data_dict = {}
for ifile,file in enumerate(files):
    ds = xr.open_dataset(file)
    # NOTE(review): the dataset is closed right after opening and still used
    # below; this relies on xarray re-opening the file on access -- confirm.
    ds.close()
    # Index along 'time' instead of 'ob' when the file has an 'ob' dimension.
    if 'ob' in ds.dims:
        ds = ds.swap_dims({'ob':'time'})
    #remove any duplicates in time, keep only first value
    # (np.unique returns first-occurrence indices ordered by sorted time value)
    _, index = np.unique(ds['time'], return_index=True)
    ds=ds.isel(time=index)
    # Take the key from the file name itself rather than a hard-coded character
    # offset, so it stays correct if the input directory string changes.
    name = os.path.splitext(os.path.basename(file))[0]
    name = name.replace(" ", "_")
    name = name.replace("/", "_")
    data_dict[name]=ds
    print(ifile,name)

0 saildrone_west_coast_survey_2019_sd1045_RSSv4.0_8dy_20210511norep_20210511
1 PMEL_Arctic_2015_sd126-ALL-1_min-v1_RSSv4.0_8dy_20210511norep_20210511
2 saildrone_west_coast_survey_2019_sd1040_JPLv5.0_8dy_20210613norep_20210613
3 saildrone_tpos_sd1066_2019_RSSv4.0_8dy_20210511norep_20210511
4 saildrone-gen_5-arctic_misst_2019-sd1037-20190514T230000-20191011T183000-1_minutes-v1.1575487464625_RSSv4.0_8dy_20210511norep_20210511
5 saildrone_arctic_sd1034_2019_JPLv5.0_8dy_20210613norep_20210613
6 saildrone_west_coast_survey_2018_sd1028_RSSv4.0_8dy_20210511norep_20210511
7 saildrone_arctic_2017_sd1001_JPLv5.0_8dy_20210613norep_20210613
8 saildrone-gen_5-atomic_eurec4a_2020-sd1061-20200117T000000-20200302T235959-1_minutes-v1.1589307121602_JPLv5.0_8dy_20210613norep_20210613
9 saildrone_tpos_sd1005_2018_RSSv4.0_8dy_20210511norep_20210511
10 saildrone-gen_4-shark-2018-sd1004-20180315T000000-20180617T235959-1_minutes-v1.1581627077777_JPLv5.0_8dy_20210613norep_20210613
11 saildrone_west_coast_surve

In [None]:
# Collocate MUR SST onto the Saildrone tracks in `data_dict` by linear
# interpolation, working through each track in small chunks of observations,
# then write one NetCDF file per track to `dir_out`.
ds = ds_sst
chunk_size = 5    # number of track observations interpolated per MUR subset
first_index = 12  # tracks with a lower index are skipped
                  # (presumably already written by an earlier run -- confirm)
for iname,name in enumerate(data_dict):
    if iname<first_index:
        continue
    print(iname,name)
    ds_usv = data_dict[name]
    #create space for new data
    # one NaN-filled variable per MUR variable, shaped like BARO_PRES_MEAN
    for var in ds_sst:
        ds_usv[var]=ds_usv.BARO_PRES_MEAN.copy(deep=True)*np.nan
        ds_usv[var].attrs=ds_sst[var].attrs
    ilen = len(ds_usv.time)
    for inc in range(0,ilen,chunk_size):
        # Slice ends are exclusive, so the final chunk must be clipped to ilen.
        # Clipping to ilen-1 would leave the last observation un-collocated.
        # With this bound i1 < i2 always holds, so no empty-chunk guard is needed.
        i1,i2 = inc,min(inc+chunk_size,ilen)
        sub = ds_usv.isel(time=slice(i1,i2))
        # bounding box of this chunk, padded by one day in time and 0.15 in lon/lat
        t1,t2=sub.time.min().data-np.timedelta64(1,'D'),sub.time.max().data+np.timedelta64(1,'D')
        x1,x2=sub.lon.min().data-.15,sub.lon.max().data+.15
        y1,y2=sub.lat.min().data-.15,sub.lat.max().data+.15
        ds_sat = ds_sst.sel(time=slice(t1,t2),lat=slice(y1,y2),lon=slice(x1,x2))
        # cast the time coordinate to datetime64[ns] before calling .interp
        ds_sat['time']=np.asarray(ds_sat.time.data, "datetime64[ns]")
        # interpolate the MUR subset onto the track's time/lat/lon
        ds_interp = ds_sat.interp(time=sub.time,lat=sub.lat,lon=sub.lon,method='linear',assume_sorted=False)
        # turn lat/lon back into data variables so they are copied along with the rest
        ds_interp = ds_interp.reset_coords(names={'lat','lon'})
        #add saildrone data to interpolated sat data
        for var in ds_interp:
            ds_usv[var][i1:i2]=ds_interp[var]
    #output
    fout = dir_out+name+'_20211116.nc'
    ds_usv.to_netcdf(fout)
    print('output done, start new')

12 saildrone_tpos_sd1006_2017_RSSv4.0_8dy_20210511norep_20210511
output done, start new
13 PMEL_Arctic_2016_sd128-ALL-1_min-v1_JPLv5.0_8dy_20210613norep_20210613
output done, start new
14 saildrone_tpos_sd1069_2019_JPLv5.0_8dy_20210613norep_20210613
output done, start new
15 saildrone_west_coast_survey_2019_sd1044_JPLv5.0_8dy_20210613norep_20210613
output done, start new
16 saildrone_arctic_2017_sd1002_RSSv4.0_8dy_20210511norep_20210511
output done, start new
17 saildrone_west_coast_survey_2018_sd1027_JPLv5.0_8dy_20210613norep_20210613
output done, start new
18 saildrone-gen_5-atlantic_to_med_2019_to_2020-sd1030-20191018T101200-20200717T134559-1_minutes-v1.1595626086288_RSSv4.0_8dy_20210511norep_20210511
output done, start new
19 saildrone-gen_5-antarctica_circumnavigation_2019-sd1020-20190119T040000-20190803T043000-1_minutes-v1.1564884498845_RSSv4.0_8dy_20210511norep_20210511
output done, start new
20 saildrone_west_coast_survey_2018_sd1024_RSSv4.0_8dy_20210511norep_20210511
output do

Exception in callback None()
handle: <Handle cancelled>
Traceback (most recent call last):
  File "/home/conda/store/6f7ecebfd1f597ece4a199fa68765d8c4f67f1510bb7acea08d550a96fc64b7c-pangeo/lib/python3.9/site-packages/tornado/iostream.py", line 1391, in _do_ssl_handshake
    self.socket.do_handshake()
  File "/home/conda/store/6f7ecebfd1f597ece4a199fa68765d8c4f67f1510bb7acea08d550a96fc64b7c-pangeo/lib/python3.9/ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLEOFError: EOF occurred in violation of protocol (_ssl.c:1129)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/conda/store/6f7ecebfd1f597ece4a199fa68765d8c4f67f1510bb7acea08d550a96fc64b7c-pangeo/lib/python3.9/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/home/conda/store/6f7ecebfd1f597ece4a199fa68765d8c4f67f1510bb7acea08d550a96fc64b7c-pangeo/lib/python3.9/site-packages/tornado/platform/a

output done, start new
66 saildrone_west_coast_survey_2018_sd1028_JPLv5.0_8dy_20210613norep_20210613
output done, start new
67 saildrone_tpos_sd1005_2018_JPLv5.0_8dy_20210613norep_20210613


In [None]:
# Check of the written files: reopen each output and print the mean of its
# analysed_sst variable next to the track index.
for iname, name in enumerate(data_dict):
    fout = f'{dir_out}{name}_20211116.nc'
    ds_usv = xr.open_dataset(fout)
    ds_usv.close()
    print(iname, ds_usv.analysed_sst.mean().data)
    #plt.show()
    #plt.clf()

In [None]:
# Plot analysed_sst from whichever dataset is currently bound to ds_usv.
ds_usv.analysed_sst.plot()

In [None]:
# Debug check: interpolate the current MUR subset along time only (lat/lon are
# not passed), load the result, and plot its first slice along the leading axis.
ds_interp = ds_sat.interp(time=sub.time).load()
#ds_interp = ds_interp.reset_coords(names={'lat','lon'})
#ds_interp.analysed_sst.plot()
#ds_interp = ds_interp.drop('ob')
ds_interp.analysed_sst[0,:,:].plot()

In [None]:
# Display the MUR dataset summary.
ds_sst

In [None]:
# Plot a 1000 x 1000 index window of analysed_sst at index 5000 of the first axis
# (axis order is presumably time, lat, lon -- confirm).
ds_sst.analysed_sst[5000,0:1000,18000:19000].plot()

In [None]:
# Display the single analysed_sst element at positional index (5000, 9000, 18000).
ds_sst.analysed_sst[5000,9000,18000]

# tricky bit here, .interp wasn't working
- ds_sat is being read somewhere as "datetime64[us]" rather than "datetime64[ns]"
- this is breaking the interpolation routine which expects "datetime64[ns]"
- solution is to set ds_sat time to "datetime64[ns]"

In [None]:
# Inspect the time coordinate (including its dtype) of the current MUR subset.
ds_sat.time

In [None]:
# Convert the MUR subset's time values to datetime64[ns] and write them back as
# the time coordinate.
data = np.asarray(ds_sat.time.data, "datetime64[ns]") 
ds_sat['time']=data

In [None]:
# Debug check: linearly interpolate the MUR subset onto the time/lat/lon of
# ds_usv and plot the resulting analysed_sst. The commented lines below are
# earlier nearest-neighbour `.sel` attempts kept for reference.
tem2 = ds_sat.interp(time=ds_usv.time,lat=ds_usv.lat,lon=ds_usv.lon,method='linear',assume_sorted=False)
#tem2 = ds_sat.sel(time=ds_sat.time[1],method='nearest')#,lat=ds_usv.lat[0],lon=ds_usv.lon[0],method='linear',assume_sorted=False)
#tem2 = ds_sat.sel(time=ds_usv.time[0],tem2 = ds_sat.sel(time=ds_sat.time[1],method='nearest')#,lat=ds_usv.lat[0],lon=ds_usv.lon[0],method='linear',assume_sorted=False)
#tem2 = ds_sat.sel(time=data[0],method='nearest')#,lat=ds_usv.lat[0],lon=ds_usv.lon[0],method='linear',assume_sorted=False)
#lat=ds_usv.lat[0],lon=ds_usv.lon[0],method='nearest')#,method='linear',assume_sorted=False)
tem2.analysed_sst.plot()

In [None]:
# Debug check: nearest-neighbour selection of the MUR subset at the time/lat/lon
# of the current chunk `sub`, then plot the resulting analysed_sst.
tem2 = ds_sat.sel(time=sub.time,lat=sub.lat,lon=sub.lon,method='nearest') 
tem2.analysed_sst.plot()

# TESTING