# this reads in the MUR SST from AWS PODAAC collocates it with all Saildrone cruises


In [1]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import xarray as xr
import cartopy.crs as ccrs
from scipy import spatial
#sys.path.append('/home/jovyan/shared/users/cgentemann/notebooks/salinity/subroutines/')
#from read_routines import read_all_usv, read_one_usv, add_coll_vars
import warnings
warnings.simplefilter('ignore') # filter some warning messages
from glob import glob


# these libraries help reading cloud data
import fsspec 
import s3fs
import requests
import os

warnings.simplefilter("ignore")  # filter some warning messages
xr.set_options(display_style="html",keep_attrs=True)  # display dataset nicely


<xarray.core.options.set_options at 0x7f53244f2670>

In [2]:
def read_all_usv(adir_usv):
    # this subroutine reads in all the saildrone data for all cruises and normalizes variable names
    # input directory with files
    # output dictionary of datasets
    
    import xarray as xr
    import numpy as np
    from glob import glob
    
    #list names of variables to keep
    list_var = ['time','lat','lon','SOG_MEAN','COG_MEAN','HDB_MEAN','ROLL_FILTERED_MEAN','PITCH_FILTERED_MEAN',
                'UWND_MEAN','VWND_MEAN','WWND_MEAN','GUST_WND_MEAN','TEMP_AIR_MEAN','RH_MEAN','BARO_PRES_MEAN',
                'PAR_AIR_MEAN','TEMP_CTD_MEAN','SAL_CTD_MEAN','TEMP_RBR_MEAN','SAL_RBR_MEAN',
                'TEMP_O2_RBR_MEAN']
    #list names of variables to swap to common names
    swapvar = {'TEMP_SBE37_MEAN':'TEMP_CTD_MEAN','SAL_SBE37_MEAN':'SAL_CTD_MEAN','SAL_MEAN':'SAL_CTD_MEAN',
               'TEMP_O2_RBR_MEAN':'TEMP_O2_MEAN','TEMP_CTD_RBR_MEAN':'TEMP_RBR_MEAN'}

    #get list of all filenames in directory
    files = [x for x in glob(adir_usv)]
    print('number of file:',len(files))
    
    #go through each file, read in, normalize and put in dictionary with datasets
    for ifile,file in enumerate(files):
        #print(file)
        ds = xr.open_dataset(file)
        ds.close()
        if any(v=='latitude' for v in ds.dims.keys()):
            ds = ds.rename({'latitude':'lat','longitude':'lon'})
        if any(v=='latitude' for v in ds):
            ds = ds.rename({'latitude':'lat','longitude':'lon'})
        if any(v=='trajectory' for v in ds.dims.keys()):
            ds = ds.isel(trajectory=0)
    #    for v in ds.dims.keys():
        if any(v=='obs' for v in ds.dims.keys()):
            ds = ds.swap_dims({'obs':'time'})
        if any(v=='row' for v in ds.dims.keys()):
            ds = ds.swap_dims({'row':'time'})
        #remove any duplicates in time, keep only first value
        _, index = np.unique(ds['time'], return_index=True)
        ds=ds.isel(time=index)
        #renames some common variables to uniform name, drop variables not on list above
        #dssv = ds
        if any(var=='wind_speed' for var in ds):
            ds['UWND_MEAN']=-ds.wind_speed*np.sin(ds.wind_dir*np.pi/180.)
            ds['VWND_MEAN']=-ds.wind_speed*np.cos(ds.wind_dir*np.pi/180.)
            ds.UWND_MEAN.attrs['units']=ds.wind_speed.attrs['units']
            ds.VWND_MEAN.attrs['units']=ds.wind_speed.attrs['units']
        for var in ds:
            var2 = var
            if swapvar.get(var): 
                ds = ds.rename({var:swapvar.get(var)})
                var2 = swapvar.get(var)
            if any(vv==var2 for vv in list_var):
                ds #just a place holder does nothing
            else:
                ds = ds.drop(var2)
        #check that there is a TEMP_CTD_MEAN, if not & temp_rbr_mean there, change it to temp_ctd_mean
        if any(var=='TEMP_CTD_MEAN' for var in ds):
            ds #just a place holder does nothing
        else:
            if any(var=='TEMP_RBR_MEAN' for var in ds):
                ds = ds.rename({'TEMP_RBR_MEAN':'TEMP_CTD_MEAN'})
        if any(var=='SAL_CTD_MEAN' for var in ds):
            ds #just a place holder does nothing
        else:
            if any(var=='SAL_RBR_MEAN' for var in ds):
                ds = ds.rename({'SAL_RBR_MEAN':'SAL_CTD_MEAN'})

        # add room to write collocated data information
        #ilen = ds.time.shape[0]
        #ds['deltaT'] = xr.DataArray(np.ones(ilen, dtype='float32')*99999, coords={'time': ds.time}, dims=('time'))
        #ds['smap_SSS'] = xr.DataArray(np.empty(ilen, dtype='float32'), coords={'time': ds.time}, dims=('time'))
        #ds['smap_iqc_flag'] = xr.DataArray(np.empty(ilen, dtype='int32'), coords={'time': ds.time}, dims=('time'))
        #ds['smap_name'] = xr.DataArray(np.empty(ilen, dtype='U125'), coords={'time': ds.time}, dims=('time'))
        #ds['smap_dist'] = xr.DataArray(np.ones(ilen, dtype='float32')*99999, coords={'time': ds.time}, dims=('time'))
        #ds['smap_ydim'] = xr.DataArray(np.empty(ilen, dtype='float32'), coords={'time': ds.time}, dims=('time'))
        #ds['smap_xdim'] = xr.DataArray(np.empty(ilen, dtype='float32'), coords={'time': ds.time}, dims=('time'))

        name = file[33:-3]
        name = name.replace(" ", "_")
        name = name.replace("/", "_")
        print(ifile,name)
        if ifile==0:
            data_dict = {name:ds}
        else:
            data_dict[name]=ds
   
    return data_dict



###################read OLD******************
def read_usv(adir_usv, iusv):
    import xarray as xr
    import numpy as np
    
   
    
    
    filename_usv_list = ['pmel_2015_sd126-ALL-1_min-v1.nc',
                         'pmel_2015_sd128-ALL-1_min-v1.nc',
                         'pmel_2016_sd126-ALL-1_min-v1.nc',
                         'pmel_2016_sd128-ALL-1_min-v1.nc',
                         'arctic_2019_sd1033-NRT-1_min-v1.nc',
                         'arctic_2019_sd1034-NRT-1_min-v1.nc',
                         'arctic_2019_sd1035-NRT-1_min-v1.nc',
                         'arctic_2019_sd1036-NRT-1_min-v1.nc',
                         'arctic_2019_sd1037-NRT-1_min-v1.nc',
                         'saildrone-gen_5-antarctica_circumnavigation_2019-sd1020-20190119T040000-20190803T043000-1440_minutes-v1.1564857794963.nc'
                        'wcoast_2018_sd1024-ALL-1_min-v1.nc',
                        'wcoast_2018_sd1025-ALL-1_min-v1.nc',
                        'wcoast_2018_sd1026-ALL-1_min-v1.nc',
                        'wcoast_2018_sd1027-ALL-1_min-v1.nc',
                        'wcoast_2018_sd1028-ALL-1_min-v1.nc']
    name_usv_list = ['pmel_2015_sd126', 'pmel_2015_sd128', 'pmel_2016_sd126', 'pmel_2016_sd128',
                     'arctic2019_1033', 'arctic2019_1034', 'arctic2019_1035', 'arctic2019_1036', 'arctic2019_1037',
                     'antarctic2019','wcoast1025','wcoast1026','wcoast1027','wcoast1028','wcoast1029']

    filename_usv = adir_usv + filename_usv_list[iusv]
    print('FILEIN:', filename_usv)
    ds_usv = xr.open_dataset(filename_usv)
    ds_usv.close()
    # NEED TO FIND OUT IF wind_speed is to/from wind_direction ?
    if (iusv == 0 or iusv == 1):  # 1033
        ds_usv = ds_usv.rename(
            {'temp_air_mean': 'TEMP_AIR_MEAN', 'rh_mean': 'RH_MEAN', 'baro_pres_mean': 'BARO_PRES_MEAN',
             'sal_mean': 'SAL_MEAN', 'temp_ctd_mean': 'TEMP_CTD_MEAN', 'temp_o2_mean': 'TEMP_O2_MEAN',
             'chlor_mean': 'CHLOR_MEAN', 'gust_wnd_mean': 'GUST_WND_MEAN', 'temp_ctd_stddev': 'TEMP_CTD_STDDEV'})
        tem_att = ds_usv.wind_speed_mean.attrs
        ds_usv['wind_speed_mean'] = ds_usv.wind_speed_mean * .51444
        ds_usv.wind_speed_mean.attrs = tem_att
        ds_usv.wind_speed_mean.attrs['units'] = 'm s-1'
        uwnd = ds_usv.wind_speed_mean * np.cos(np.deg2rad(ds_usv.wind_direction_mean))
        vwnd = ds_usv.wind_speed_mean * np.sin(np.deg2rad(ds_usv.wind_direction_mean))
        ds_usv['UWND_MEAN'] = uwnd
        ds_usv.UWND_MEAN.attrs = {'standard_name': 'eastward_wind', 'long_name': 'Eastward wind speed',
                                  'units': 'm s-1', 'installed_height': '5.2'}
        ds_usv['VWND_MEAN'] = vwnd
        ds_usv.VWND_MEAN.attrs = {'standard_name': 'northward_wind', 'long_name': 'Northward wind speed',
                                  'units': 'm s-1', 'installed_height': '5.2'}
        ilen = ds_usv.time.shape[0]
        ds_usv['WWND_MEAN'] = xr.DataArray(np.ones(ilen) * np.nan, coords={'time': ds_usv.time}, dims=('time'))
        ds_usv.WWND_MEAN.attrs = {'standard_name': 'upward_wind_velocity', 'long_name': 'upward wind speed',
                                  'units': 'm s-1', 'installed_height': '5.2'}
    if (iusv == 2 or iusv == 3):  # 1033
        ds_usv = ds_usv.rename(
            {'temp_air_mean': 'TEMP_AIR_MEAN', 'rh_mean': 'RH_MEAN', 'baro_pres_mean': 'BARO_PRES_MEAN',
             'sal_mean': 'SAL_MEAN', 'temp_ctd_mean': 'TEMP_CTD_MEAN', 'temp_o2_mean': 'TEMP_O2_MEAN',
             'chlor_mean': 'CHLOR_MEAN', 'gust_wnd_mean': 'GUST_WND_MEAN', 'temp_ctd_stddev': 'TEMP_CTD_STDDEV'})
        tem_att = ds_usv.wind_speed.attrs
        ds_usv['wind_speed'] = ds_usv.wind_speed * .51444
        ds_usv.wind_speed.attrs = tem_att
        ds_usv.wind_speed.attrs['units'] = 'm s-1'
        uwnd = ds_usv.wind_speed * np.cos(np.deg2rad(ds_usv.wind_direction))
        vwnd = ds_usv.wind_speed * np.sin(np.deg2rad(ds_usv.wind_direction))
        ds_usv['UWND_MEAN'] = uwnd
        ds_usv.UWND_MEAN.attrs = {'standard_name': 'eastward_wind', 'long_name': 'Eastward wind speed',
                                  'units': 'm s-1', 'installed_height': '5.2'}
        ds_usv['VWND_MEAN'] = vwnd
        ds_usv.VWND_MEAN.attrs = {'standard_name': 'northward_wind', 'long_name': 'Northward wind speed',
                                  'units': 'm s-1', 'installed_height': '5.2'}
        ilen = ds_usv.time.shape[0]
        ds_usv['WWND_MEAN'] = xr.DataArray(np.ones(ilen) * np.nan, coords={'time': ds_usv.time}, dims=('time'))
        ds_usv.WWND_MEAN.attrs = {'standard_name': 'upward_wind_velocity', 'long_name': 'upward wind speed',
                                  'units': 'm s-1', 'installed_height': '5.2'}
    if iusv == 4:  # 1033
        ds_usv = ds_usv.rename({'TEMP_CTD_RBR_MEAN': 'TEMP_CTD_MEAN', 'TEMP_CTD_RBR_STDDEV': 'TEMP_CTD_STDDEV',
                                'TEMP_O2_RBR_MEAN': 'TEMP_O2_MEAN', 'SAL_RBR_MEAN': 'SAL_MEAN',
                                'CHLOR_WETLABS_MEAN': 'CHLOR_MEAN'})
    if iusv == 5:  # 1034
        ds_usv = ds_usv.rename({'TEMP_CTD_RBR_MEAN': 'TEMP_CTD_MEAN', 'TEMP_CTD_RBR_STDDEV': 'TEMP_CTD_STDDEV',
                                'TEMP_O2_RBR_MEAN': 'TEMP_O2_MEAN', 'SAL_RBR_MEAN': 'SAL_MEAN',
                                'CHLOR_WETLABS_MEAN': 'CHLOR_MEAN'})
    if iusv == 6:  # 1035
        ds_usv = ds_usv.rename({'TEMP_CTD_RBR_MEAN': 'TEMP_CTD_MEAN', 'TEMP_CTD_RBR_STDDEV': 'TEMP_CTD_STDDEV',
                                'TEMP_O2_RBR_MEAN': 'TEMP_O2_MEAN', 'SAL_RBR_MEAN': 'SAL_MEAN',
                                'CHLOR_WETLABS_MEAN': 'CHLOR_MEAN'}) #, 'WIND_MEASUREMENT_MEAN_HEIGHT': 'WIND_MEAN_HEIGHT'})
    if iusv == 7:  # 1036
        ds_usv = ds_usv.isel(time=slice(100,
                                        -1))  # ds_usv = ds_usv.rename({'TEMP_CTD_RBR_MEAN':'TEMP_CTD_MEAN','TEMP_O2_RBR_MEAN':'TEMP_O2_MEAN','SAL_RBR_MEAN':'SAL_MEAN','CHLOR_WETLABS_MEAN':'CHLOR_MEAN'})
        ds_usv = ds_usv.rename({'TEMP_CTD_RBR_MEAN': 'TEMP_CTD_MEAN', 'TEMP_CTD_RBR_STDDEV': 'TEMP_CTD_STDDEV',
                                'TEMP_O2_RBR_MEAN': 'TEMP_O2_MEAN', 'SAL_RBR_MEAN': 'SAL_MEAN',
                                'CHLOR_WETLABS_MEAN': 'CHLOR_MEAN'})
    if iusv == 8:  # 1037
        ds_usv = ds_usv.rename({'TEMP_CTD_RBR_MEAN': 'TEMP_CTD_MEAN', 'TEMP_CTD_RBR_STDDEV': 'TEMP_CTD_STDDEV',
                                'TEMP_O2_RBR_MEAN': 'TEMP_O2_MEAN'})
    if iusv == 9:  # 1037
        ds_usv = ds_usv.isel(trajectory=0).swap_dims({'obs': 'time'}).rename(
            {'latitude': 'lat', 'longitude': 'lon', 'TEMP_O2_RBR_MEAN': 'TEMP_O2_MEAN'})  # TEMP_CTD_RBR_MEAN':'TEMP_
    if (iusv == 9 or iusv <= 3):
        ilen = ds_usv.time.shape[0]
        ds_usv['WIND_HEIGHT_MEAN'] = xr.DataArray(np.ones(ilen) * np.nan, coords={'time': ds_usv.time}, dims=('time'))
        ds_usv.WIND_HEIGHT_MEAN.attrs = {'long_name': 'Wind measurement height', 'units': 'm',
                                         'installed_height': '5.2'}
        ds_usv['WAVE_DOMINANT_PERIOD'] = xr.DataArray(np.ones(ilen) * np.nan, coords={'time': ds_usv.time},
                                                      dims=('time'))
        ds_usv.WAVE_DOMINANT_PERIOD.attrs = {
            'standard_name': 'sea_surface_wave_period_at_variance_spectral_density_maximum',
            'long_name': 'Dominant wave period', 'units': 's', 'installed_height': '0.34'}
        ds_usv['WAVE_SIGNIFICANT_HEIGHT'] = xr.DataArray(np.ones(ilen) * np.nan, coords={'time': ds_usv.time},
                                                         dims=('time'))
        ds_usv.WAVE_SIGNIFICANT_HEIGHT.attrs = {'standard_name': 'sea_surface_wave_significant_height',
                                                'long_name': 'Significant wave height', 'units': 'm',
                                                'installed_height': '0.34'}

    # add room to write collocated data information
    ilen = ds_usv.time.shape[0]
   
    #ds_usv['delta_time'] = xr.DataArray(np.ones(ilen) * 999999, coords={'time': ds_usv.time}, dims=('time'))
    #ds_usv['sss_name'] = xr.DataArray(np.empty(ilen, dtype=str), coords={'time': ds_usv.time}, dims=('time'))
    #ds_usv['sss_dist'] = xr.DataArray(np.ones(ilen) * 999999, coords={'time': ds_usv.time}, dims=('time'))
    #ds_usv['sss_scan'] = xr.DataArray(np.ones(ilen) * 999999, coords={'time': ds_usv.time}, dims=('time'))
    #ds_usv['sss_cell'] = xr.DataArray(np.ones(ilen) * 999999, coords={'time': ds_usv.time}, dims=('time'))
    #ds_usv['sss_iqc_flag'] = xr.DataArray(np.ones(ilen) * 999999, coords={'time': ds_usv.time}, dims=('time'))
    #ds_usv['sss_sss'] = xr.DataArray(np.ones(ilen) * 999999, coords={'time': ds_usv.time}, dims=('time'))

    return ds_usv, name_usv_list[iusv]


# Read in All Saildrone cruises downloaded from https://data.saildrone.com/data/sets
- 2017 onwards, note that earlier data is going to lack insruments and be poorer data quality in general
- For this code I want to develop a routine that reads in all the different datasets and creates a standardized set
- It may work best to first read each of the files individually into a dictionary 
- then go through each dataset finding all variable names
- I decided to put all SST into TEMP_CTD_MEAN and same for Salinity so there is a single variable name
- this still preserves all the dataset information

In [3]:
import os
import sys
sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
import ebdpy as ebd

ebd.set_credentials(profile='esip-qhub')

profile = 'esip-qhub'
region = 'us-west-2'
endpoint = f's3.{region}.amazonaws.com'
ebd.set_credentials(profile=profile, region=region, endpoint=endpoint)
worker_max = 30
client,cluster = ebd.start_dask_cluster(profile=profile,worker_max=worker_max, 
                                      region=region, use_existing_cluster=True,
                                      adaptive_scaling=False, wait_for_cluster=False, 
                                      environment='pangeo', worker_profile='Medium Worker', 
                                      propagate_env=True)

Existing Dask clusters:
Cluster Index c_idx: 0 / Name: dev.3de0663814034f1cb17c8a2d02454b09 ClusterStatus.RUNNING
Using existing cluster [0].
Setting Fixed Scaling workers=30
Reconnect client to clear cache
client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub):
https://jupyter.qhub.esipfed.org/gateway/clusters/dev.3de0663814034f1cb17c8a2d02454b09/status
Propagating environment variables to workers


In [4]:
#adir_usv = '/home/jovyan/data/saildrone_data/*.nc' 
#files = [x for x in glob(adir_usv)]
#files


In [5]:
#dir_data = 'C:/Users/gentemann/Google Drive/public/2019_saildrone/' #'f:/data/cruise_data/saildrone/saildrone_data/'
#dir_data_pattern = 'C:/Users/gentemann/Google Drive/public/ALL_Saildrone_Data/Arctic/saildrone_usv_data/*.nc' 
#dir_data_pattern = 'F:/data/cruise_data/saildrone/saildrone_data/'
dir_data_pattern = '/home/jovyan/data/saildrone_data/*.nc'

dir_out = '/home/jovyan/data/sss_collocations_orbital_norepeat/'
dir_out2 = '/home/jovyan/data/sss_collocations_orbital_norepeat_mur'

files = glob(dir_out+'*.nc')
for ifile,file in enumerate(files):
    ds = xr.open_dataset(file)
    ds.close()
    if any(v=='latitude' for v in ds.dims.keys()):
        ds = ds.rename({'latitude':'lat','longitude':'lon'})
    if any(v=='latitude' for v in ds):
        ds = ds.rename({'latitude':'lat','longitude':'lon'})
    if any(v=='trajectory' for v in ds.dims.keys()):
        ds = ds.isel(trajectory=0)
#    for v in ds.dims.keys():
    if any(v=='obs' for v in ds.dims.keys()):
        ds = ds.swap_dims({'obs':'time'})
    if any(v=='ob' for v in ds.dims.keys()):
        ds = ds.swap_dims({'ob':'time'})
    if any(v=='row' for v in ds.dims.keys()):
        ds = ds.swap_dims({'row':'time'})
    #remove any duplicates in time, keep only first value
    _, index = np.unique(ds['time'], return_index=True)
    ds=ds.isel(time=index)
    name = file[52:-3]
    name = name.replace(" ", "_")
    name = name.replace("/", "_")
    if ifile==0:
        data_dict = {name:ds}
    else:
        data_dict[name]=ds
    print(name)
#data_dict = read_all_usv(dir_data_pattern)
#data_dict = add_coll_vars(data_dict)

saildrone-gen_5-atomic_eurec4a_2020-sd1060-20200117T000000-20200302T235959-1_minutes-v1.1589306886594rssv04.0_orbitalnorep
saildrone_west_coast_survey_2019_sd1043jplv05.0_orbitalnorep
saildrone_tpos_sd1029_2018jplv05.0_orbitalnorep
saildrone_west_coast_survey_2018_sd1026jplv05.0_orbitalnorep
saildrone_west_coast_survey_2018_sd1025rssv04.0_orbitalnorep
saildrone_arctic_2017_sd1003rssv04.0_orbitalnorep
saildrone_tpos_sd1030_2018jplv05.0_orbitalnorep
saildrone-gen_5-arctic_misst_2019-sd1036-20190514T230000-20191011T183000-1_minutes-v1.1575336154680rssv04.0_orbitalnorep
saildrone-gen_5-atomic_eurec4a_2020-sd1026-20200117T000000-20200302T235959-1_minutes-v1.1589306725934rssv04.0_orbitalnorep
saildrone_west_coast_survey_2019_sd1046rssv04.0_orbitalnorep
saildrone_tpos_sd1068_2019jplv05.0_orbitalnorep
saildrone_west_coast_survey_2019_sd1047jplv05.0_orbitalnorep
PMEL_Arctic_2015_sd128-ALL-1_min-v1jplv05.0_orbitalnorep
saildrone_tpos_sd1067_2019rssv04.0_orbitalnorep
saildrone_west_coast_survey_2

# Collocate MUR

In [6]:
from earthdata import Auth 
auth = Auth().login()

Enter your Earthdata Login username:  cgentemann
Enter your Earthdata password:  ··········


You're now authenticated with NASA Earthdata Login


In [7]:
url = "https://archive.podaac.earthdata.nasa.gov/s3credentials"
response = requests.get(url).json()

In [8]:
%%time
# set up read
json_consolidated = "s3://esip-qhub-public/nasa/mur/murv41_consolidated_20211011.json"
s_opts = {"requester_pays": True, "skip_instance_cache": True}
r_opts = {"key": response["accessKeyId"],"secret": response["secretAccessKey"],"token": response["sessionToken"],"client_kwargs": {"region_name": "us-west-2"},}
fs = fsspec.filesystem("reference",fo=json_consolidated,
                       ref_storage_args=s_opts,remote_protocol="s3",
                       remote_options=r_opts,simple_templates=True,)
ds_sst = xr.open_dataset(fs.get_mapper(""), decode_times=False, engine="zarr", consolidated=False)
ds_sst

CPU times: user 1min 15s, sys: 6.19 s, total: 1min 21s
Wall time: 1min 36s


# Collocate using .interp linear interpolation

In [9]:
for iname,name in enumerate(data_dict):
    if iname>3:
        continue
    print(iname)
    ds_usv = data_dict[name].copy(deep=True)
    for cc in ds_usv.coords:
        if cc=='latitude':
            ds_usv = ds_usv.rename({'latitude':'lat'})
        if cc=='latitude':
            ds_usv = ds_usv.rename({'longitude':'lon'})
ds_usv

0
1
2
3


In [10]:
ilen = len(ds_usv.time)
print(ilen)
#create space for new data
for var in ds_sst:  
    ds_usv[var]=ds_usv.BARO_PRES_MEAN*np.nan
    ds_usv[var].attrs=ds_sst[var].attrs
ds_usv_tem = ds_usv.copy(deep=True)
#break up problem into bite size bits so you don't run out of memory
for inc in range(0,ilen,20000):
    i1,i2 = inc,inc+20000
    if i2>ilen:
        i2=ilen-1
    print(inc,inc+20000)
    sub = ds_usv.isel(time=slice(i1,i2))   
    t1,t2=sub.time.min().data-np.timedelta64(1,'D'),sub.time.max().data+np.timedelta64(1,'D')
    x1,x2=sub.lon.min().data,sub.lon.max().data
    y1,y2=sub.lat.min().data,sub.lat.max().data
    print(t1,t2,x1,x2,y1,y2)
    ds_sat = ds_sst.sel(time=slice(t1,t2),lat=slice(y1,y2),lon=slice(x1,x2)).load()   
    ds_interp = ds_sat.interp(time=sub.time,lat=sub.lat,lon=sub.lon,method='linear')#.interp(method='nearest')
    #add saildrone data to interpolated sat data
    ds_interp = ds_interp.reset_coords(names={'lat','lon'})
    for var in ds_interp:
        ds_usv_tem[var][i1:i2]=ds_interp[var]

219
0 20000
2018-07-07T03:07:00.000000000 2018-10-26T07:40:47.142857142 -125.29057701647055 -117.34402468452535 32.5762861298091 40.46447049878721


In [None]:
%%time
ds = ds_sst
for iname,name in enumerate(data_dict):
    print(iname)
    ds_usv = data_dict[name].copy(deep=True)
    for cc in ds_usv.coords:
        if cc=='latitude':
            ds_usv = ds_usv.rename({'latitude':'lat'})
        if cc=='longitude':
            ds_usv = ds_usv.rename({'longitude':'lon'})
    ilen = len(ds_usv.time)
    print(ilen)
    #create space for new data
    for var in ds_sst:  
        ds_usv[var]=ds_usv.BARO_PRES_MEAN*np.nan
        ds_usv[var].attrs=ds_sst[var].attrs
    ds_usv_tem = ds_usv.copy(deep=True)
    ds_usv_tem2 = ds_usv.copy(deep=True)
    for inc in range(0,ilen,5000):
        i1,i2 = inc,inc+5000
        if i2>ilen:
            i2=ilen-1
        print(inc,inc+5000)
        sub = ds_usv.isel(time=slice(i1,i2))   
        t1,t2=sub.time.min().data-np.timedelta64(1,'D'),sub.time.max().data+np.timedelta64(1,'D')
        x1,x2=sub.lon.min().data,sub.lon.max().data
        y1,y2=sub.lat.min().data,sub.lat.max().data
        print(t1,t2,x1,x2,y1,y2)
        ds_sat = ds_sst.sel(time=slice(t1,t2),lat=slice(y1,y2),lon=slice(x1,x2)).load()   
        ds_interp = ds_sat.interp(time=sub.time,lat=sub.lat,lon=sub.lon,method='linear') #add saildrone data to interpolated sat data
        ds_interp = ds_interp.reset_coords(names={'lat','lon'})
        for var in ds_interp:
            ds_usv_tem[var][i1:i2]=ds_interp[var]            
        ds_interp = ds_sat.interp(time=sub.time,lat=sub.lat,lon=sub.lon,method='nearest') #add saildrone data to interpolated sat data
        ds_interp = ds_interp.reset_coords(names={'lat','lon'})
        for var in ds_interp:
            ds_usv_tem2[var][i1:i2]=ds_interp[var]                       
   #output
    fout = dir_out+name+'_20211116.nc'
    ds_usv_tem.to_netcdf(fout)
    fout = dir_out2+name+'_20211116.nc'
    ds_usv_tem2.to_netcdf(fout)
    

0
212
0 5000
2020-01-16T04:33:00.000000000 2020-03-02T13:43:28.399071925 -59.32408349377049 -48.65138466010929 7.452111005157594 12.687814156065574
1
34
0 5000
2019-06-17T06:45:30.000000000 2019-08-02T05:52:00.000000000 -126.2351190974359 -121.86958851997399 35.89619735960591 41.700128787939704
2
748
0 5000
2018-10-06T10:09:00.000000000 2019-09-20T12:46:00.000000000 -162.50791563636363 -139.13910015999997 -0.20216102913907286 22.770619311511354


NameError: name 'ds_sat' is not defined

In [None]:
ds_usv = data_dict[name].copy(deep=True)
ds_usv['lat'] = ds_usv.lat.interpolate_na(dim='time',method='linear').ffill(dim='time').bfill(dim='time')
ds_usv['lon'] = ds_usv.lon.interpolate_na(dim='time',method='linear').ffill(dim='time').bfill(dim='time')
tem = ds_usv.lat.ffill(dim='time')
tem = ds_usv.lat.bfill(dim='time')
t1,t2=ds_usv.time.min().data-np.timedelta64(8,'D'),ds_usv.time.max().data+np.timedelta64(8,'D')
x1,x2=ds_usv.lon.min().data,ds_usv.lon.max().data
y1,y2=ds_usv.lat.min().data,ds_usv.lat.max().data
print(t1,t2)
#ds_sat = ds.sel(time=slice(t1,t2),lat=slice(y1,y2),lon=slice(x1,x2)).load()   
#ds_interp = ds_sat.interp(time=ds_usv.time,lat=ds_usv.lat,lon=ds_usv.lon,method='linear')#.interp(method='nearest')
#add saildrone data to interpolated sat data
#ds_interp = ds_interp.reset_coords(names={'lat','lon'})


In [None]:
#print out distance to land as a check
for iname,name in enumerate(data_dict):
    fin = dir_out+name+'_RSSv4.0_8dy_20210511.nc'
    ds_usv=xr.open_dataset(fin)
    ds_usv.close()
    plt.scatter(ds_usv.lon[::500],ds_usv.lat[::500],c=ds_usv.dist_land[::500],vmin=0,vmax=500)

# Collocate using .interp nearest neighbor interpolation

In [None]:
%%time
for iname,name in enumerate(data_dict):
    print(iname)
    ds_usv = data_dict[name].copy(deep=True)
    ds_usv['lat'] = ds_usv.lat.interpolate_na(dim='time',method='linear').ffill(dim='time').bfill(dim='time')
    ds_usv['lon'] = ds_usv.lon.interpolate_na(dim='time',method='linear').ffill(dim='time').bfill(dim='time')
    tem = ds_usv.lat.ffill(dim='time')
    tem = ds_usv.lat.bfill(dim='time')
    t1,t2=ds_usv.time.min().data-np.timedelta64(8,'D'),ds_usv.time.max().data+np.timedelta64(8,'D')
    x1,x2=ds_usv.lon.min().data,ds_usv.lon.max().data
    y1,y2=ds_usv.lat.min().data,ds_usv.lat.max().data
    print(t1,t2)
    ds_sat = ds.sel(time=slice(t1,t2),lat=slice(y1,y2),lon=slice(x1,x2)).load()   
    ds_interp = ds_sat.interp(time=ds_usv.time,lat=ds_usv.lat,lon=ds_usv.lon,method='nearest')#.interp(method='nearest')

    #add saildrone data to interpolated sat data
    ds_interp = ds_interp.reset_coords(names={'lat','lon'})
    ds_interp['sat_time']=ds_interp.time
    for var in ds_interp:
        ds_usv['sat_'+var]=ds_interp[var]
        
    #add distance to land

    ds_usv['dist_land']=ds_land.dist_land.interp(lat=ds_usv.lat,lon=ds_usv.lon).drop({'lat','lon'})
    lnd_att={'long_name':'distance to nearest land','units':'km'}
    ds_usv['dist_land'].attrs=lnd_att    

    #output
    fout = dir_out2+name+'_RSSv4.0_8dy_20210511.nc'
    ds_usv.to_netcdf(fout)

# Collocation SMAP JPL 8day

In [None]:
#JPL
adir = 'F:/data/sat_data/smap/SSS/L3/JPL/V5.0/8day_running/**/**/*5.0.nc'
files = [x for x in glob(adir)]
print('number of file:',len(files))

ds = xr.open_mfdataset(files,combine='nested',concat_dim='time')
ds = ds.rename({'latitude':'lat','longitude':'lon'})
ds = ds.sortby(ds.lat)
ds.close()  
ds

In [None]:
%%time
for iname,name in enumerate(data_dict):
    print(iname)
    ds_usv = data_dict[name].copy(deep=True)
    ds_usv['lat'] = ds_usv.lat.interpolate_na(dim='time',method='linear').ffill(dim='time').bfill(dim='time')
    ds_usv['lon'] = ds_usv.lon.interpolate_na(dim='time',method='linear').ffill(dim='time').bfill(dim='time')
    tem = ds_usv.lat.ffill(dim='time')
    tem = ds_usv.lat.bfill(dim='time')
    t1,t2=ds_usv.time.min().data-np.timedelta64(8,'D'),ds_usv.time.max().data+np.timedelta64(8,'D')
    x1,x2=ds_usv.lon.min().data,ds_usv.lon.max().data
    y1,y2=ds_usv.lat.min().data,ds_usv.lat.max().data
    print(t1,t2)
    ds_sat = ds.sel(time=slice(t1,t2),lat=slice(y1,y2),lon=slice(x1,x2)).load()   
    ds_interp = ds_sat.interp(time=ds_usv.time,lat=ds_usv.lat,lon=ds_usv.lon,method='linear')#.interp(method='nearest')

    #add saildrone data to interpolated sat data
    ds_interp = ds_interp.reset_coords(names={'lat','lon'})
    for var in ds_interp:
        ds_usv['sat_'+var]=ds_interp[var]    
    
    #add distance to land
    ds_usv['dist_land']=ds_land.dist_land.interp(lat=ds_usv.lat,lon=ds_usv.lon).drop({'lat','lon'})
    lnd_att={'long_name':'distance to nearest land','units':'km'}
    ds_usv['dist_land'].attrs=lnd_att    

    fout = dir_out+name+'_JPLv5.0_8dy_20210613.nc'
    ds_usv.to_netcdf(fout)

In [None]:
#print out distance to land as a check
for iname,name in enumerate(data_dict):
    fin = dir_out+name+'_JPLv5.0_8dy_20210613.nc'
    ds_usv=xr.open_dataset(fin)
    ds_usv.close()
    plt.scatter(ds_usv.lon[::500],ds_usv.lat[::500],c=ds_usv.dist_land[::500],vmin=0,vmax=500)

In [None]:
%%time
for iname,name in enumerate(data_dict):
    print(iname)
    ds_usv = data_dict[name].copy(deep=True)
    ds_usv['lat'] = ds_usv.lat.interpolate_na(dim='time',method='linear').ffill(dim='time').bfill(dim='time')
    ds_usv['lon'] = ds_usv.lon.interpolate_na(dim='time',method='linear').ffill(dim='time').bfill(dim='time')
    tem = ds_usv.lat.ffill(dim='time')
    tem = ds_usv.lat.bfill(dim='time')
    t1,t2=ds_usv.time.min().data-np.timedelta64(8,'D'),ds_usv.time.max().data+np.timedelta64(8,'D')
    x1,x2=ds_usv.lon.min().data,ds_usv.lon.max().data
    y1,y2=ds_usv.lat.min().data,ds_usv.lat.max().data
    print(t1,t2)
    ds_sat = ds.sel(time=slice(t1,t2),lat=slice(y1,y2),lon=slice(x1,x2)).load()   
    ds_interp = ds_sat.interp(time=ds_usv.time,lat=ds_usv.lat,lon=ds_usv.lon,method='nearest')#.interp(method='nearest')

    #add saildrone data to interpolated sat data
    ds_interp = ds_interp.reset_coords(names={'lat','lon'})
    for var in ds_interp:
        ds_usv['sat_'+var]=ds_interp[var]

    #add distance to land
    ds_usv['dist_land']=ds_land.dist_land.interp(lat=ds_usv.lat,lon=ds_usv.lon).drop({'lat','lon'})
    lnd_att={'long_name':'distance to nearest land','units':'km'}
    ds_usv['dist_land'].attrs=lnd_att    

    fout = dir_out2+name+'_JPLv5.0_8dy_20210613'+'.nc'
    ds_usv.to_netcdf(fout)

# remove repeats
- Saildrone samples every 1 minute, so many saildrone obsevations will match with the same 8-day gridded SMAP data point.
1. read in all the collocated data
2. find what saildrone data are matched to a single smap ob
3. average all the saildrone data that were matched
4. create a new dataset, that will be much smaller, where 1 averaged saildrone ob is matched to 1 smap ob

In [None]:
#saildrone-gen_5-arctic_misst_2019-sd1037-20190514T230000-20191011T183000-1_minutes-v1.1575487464625_RSS8dy_20210413.nc

data_dir = 'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest/'
data_dir_out = 'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest_norepeat/'
filenames = [x for x in glob(data_dir+'*.nc')]
filenames

In [None]:
loc=['lat','lon']
for iname,name in enumerate(filenames):
    print(iname,len(filenames))
    i = name.find('\\')
    fout = data_dir_out + name[i+1:-3]+'norep_20210613.nc' 
    ds = xr.open_dataset(name)
    ds.close()
    if 'RSS' in name:
        continue
    if 'RSS' in name:  
        ds_tem2 = ds.where((ds.sat_sss_smap<50) & (ds.sat_sss_smap>1),drop=True)    
        isv=0
        while len(ds_tem2.time)>1:
            i=0
            cond = ((ds_tem2.sat_sss_smap==ds_tem2.sat_sss_smap[i]) 
                    & (ds_tem2.sat_sss_smap_uncertainty==ds_tem2.sat_sss_smap_uncertainty[i]) 
                    & (ds_tem2.sat_sss_smap_40km==ds_tem2.sat_sss_smap_40km[i]))
            subset = ds_tem2.where(cond,drop=True)  #repeat obs
            ds_mn = subset.mean(keep_attrs=True,skipna=True)
            ds_mn['time'] = subset.time.mean()
            ds_mn = ds_mn.assign_coords({'ob':isv})
            if isv==0:
                ds_mn2 = ds_mn
            else:
                ds_mn2 = xr.concat([ds_mn2,ds_mn],dim='ob')
            isv = isv+1
            ds_tem2 = ds_tem2.where(~cond,drop=True)  #data with repeat obs removed
    else:
        ds_tem2 = ds.where((ds.sat_smap_sss<50) & (ds.sat_smap_sss>1),drop=True)    
        isv=0
        while len(ds_tem2.time)>1:
            i=0
            cond = ((ds_tem2.sat_smap_sss==ds_tem2.sat_smap_sss[i]) 
                    & (ds_tem2.sat_anc_sst==ds_tem2.sat_anc_sst[i]) 
                    & (ds_tem2.sat_anc_sss==ds_tem2.sat_anc_sss[i]))
            subset = ds_tem2.where(cond,drop=True)  #repeat obs
            ds_mn = subset.mean(keep_attrs=True,skipna=True)
            ds_mn['time'] = subset.time.mean()
            ds_mn = ds_mn.assign_coords({'ob':isv})
            if isv==0:
                ds_mn2 = ds_mn
            else:
                ds_mn2 = xr.concat([ds_mn2,ds_mn],dim='ob')
            isv = isv+1
            ds_tem2 = ds_tem2.where(~cond,drop=True)  #data with repeat obs removed
    ds_mn2.to_netcdf(fout)

In [None]:
fout

In [None]:
#tem_dir = 'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest_norepeat/'
#f1= 'saildrone-gen_5-arctic_misst_2019-sd1036-20190514T230000-20191011T183000-1_minutes-v1.1575336154680_RSS8dy_20210413norep.nc'
#f2='saildrone-gen_5-arctic_misst_2019-sd1036-20190514T230000-20191011T183000-1_minutes-v1.1575336154680_RSS8dynorep.nc'
#fout2 = tem_dir+f1
#fout2 = fout
fout2 = 'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest_norepeat/saildrone-gen_5-arctic_misst_2019-sd1036-20190514T230000-20191011T183000-1_minutes-v1.1575336154680_JPLv5.0_8dy_20210511norep_20210511.nc'
fout3 = 'F:/data/cruise_data/saildrone/sss/old/sss_collocations_8day_nearest_norepeat/saildrone-gen_5-arctic_misst_2019-sd1036-20190514T230000-20191011T183000-1_minutes-v1.1575336154680_JPL8dy_20210413norep_20210413.nc'
ds_mn2 = xr.open_dataset(fout2)
ds_mn2.close()
ds_mn3 = xr.open_dataset(fout3)
ds_mn3.close()

fout2 = 'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest_norepeat/saildrone-gen_5-arctic_misst_2019-sd1036-20190514T230000-20191011T183000-1_minutes-v1.1575336154680_RSSv4.0_8dy_20210511norep_20210511.nc'
fout3 = 'F:/data/cruise_data/saildrone/sss/old/sss_collocations_8day_nearest_norepeat/saildrone-gen_5-arctic_misst_2019-sd1036-20190514T230000-20191011T183000-1_minutes-v1.1575336154680_RSS8dy_20210413norep_20210413.nc'
ds_mn2a = xr.open_dataset(fout2)
ds_mn2a.close()
ds_mn3a = xr.open_dataset(fout3)
ds_mn3a.close()

In [None]:
ds_mn3

In [None]:
plt.scatter(ds_mn3a.sat_sss_smap,ds_mn2a.sat_sss_smap)
#plt.scatter(ds_mn3.time,ds_mn3.sat_smap_sss)
#plt.scatter(ds_mn2.time,ds_mn2.sat_smap_sss)
#plt.scatter(ds_mn3a.time,ds_mn3a.sat_sss_smap)
#plt.scatter(ds_mn3a.time,ds_mn3a.sat_smap_sss)

In [None]:
tdif = ds_mn2.sat_sss_smap-ds_mn2.SAL_CTD_MEAN
tdif = tdif.where(abs(tdif)<10,drop=True)
print('rss new',tdif.mean().data,tdif.std().data,len(tdif))
tdif = ds_mn3.sat_sss_smap-ds_mn3.SAL_CTD_MEAN
tdif = tdif.where(abs(tdif)<10,drop=True)
print('rss old',tdif.mean().data,tdif.std().data,len(tdif))
tdif = ds_mn2a.sat_smap_sss-ds_mn2a.SAL_CTD_MEAN
tdif = tdif.where(abs(tdif)<10,drop=True)
print('jpl new',tdif.mean().data,tdif.std().data,len(tdif))
tdif = ds_mn3a.sat_smap_sss-ds_mn3a.SAL_CTD_MEAN
tdif = tdif.where(abs(tdif)<10,drop=True)
print('jpl old',tdif.mean().data,tdif.std().data,len(tdif))

In [None]:
(ds_mn2.sat_sss_smap-ds_mn2.SAL_CTD_MEAN).rmse


# TESTING

In [None]:
fname = 'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest/saildrone-gen_5-arctic_misst_2019-sd1036-20190514T230000-20191011T183000-1_minutes-v1.1575336154680_JPL8dy.nc'
ds_tem = xr.open_dataset(fname)
plt.plot(ds_tem.lon,ds.lat)

In [None]:
#fix remove RSS data from JPL collocation
ds = ds.drop({'sat_nobs','sat_nobs_40km','sat_sss_smap','sat_sss_smap_uncertainty','sat_sss_smap_40km','sat_sss_ref','sat_gland','sat_fland','sat_gice','sat_surtep'})

In [None]:
ds_interp

In [None]:
ds_usv

In [None]:
plt.plot(ds_usv.time,ds_usv.SAL_CTD_MEAN,'b')
plt.plot(ds_usv.time,ds_usv.sat_smap_sss,'r.')

In [None]:
tem = ds.sel(time='2019-08-01',lat=slice(30,55),lon=slice(-130,-110))
plt.pcolormesh(tem.lon,tem.lat,tem.smap_sss[0,:,:])
plt.plot(ds_usv.lon,ds_usv.lat)

In [None]:
import xarray as xr
file = 'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest/saildrone-gen_5-atomic_eurec4a_2020-sd1026-20200117T000000-20200302T235959-1_minutes-v1.1589306725934_JPL8dy.nc'
ds = xr.open_dataset(file)
ds

In [None]:
#for name in data_dict:
#    print(name)
ds2 = data_dict['saildrone-gen_5-atomic_eurec4a_2020-sd1026-20200117T000000-20200302T235959-1_minutes-v1.1589306725934']
ds2

In [None]:
dir_list = ['F:/data/cruise_data/saildrone/sss/sss_collocations_8day/',
        'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest/',
        'F:/data/cruise_data/saildrone/sss/sss_collocations_8day_nearest_norepeat/']
files = glob(dir_list[0]+'*.nc')
file = files[2]
#if 'JPL' in file:
print(file)
ds = xr.open_dataset(file)
ds.close()
#ds = ds.drop({'sat_nobs','sat_nobs_40km','sat_sss_smap','sat_sss_smap_uncertainty','sat_sss_smap_40km','sat_sss_ref','sat_gland','sat_fland','sat_gice','sat_surtep' })
print(ds)
#ds.to_netcdf(file)

In [None]:
ds