# This is the Saildrone and MUR global 1 km sea surface temperature collocation code. 


In [49]:
import os
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import xarray as xr

def get_sat_filename(date):
    dir_sat='F:/data/sst/jpl_mur/v4.1/'
    syr, smon, sdym, sjdy = str(date.dt.year.data), str(date.dt.month.data).zfill(2), str(date.dt.day.data).zfill(2), str(date.dt.dayofyear.data).zfill(2)
    sat_filename = dir_sat + syr + '/'+ sjdy + '/' + syr + smon + sdym + '090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc'
    exists = os.path.isfile(sat_filename)
    return sat_filename, exists

def robust_std(ds):
    MAD = np.nanmedian(ds)
    std_robust = MAD * 1.482602218505602

# Read in USV data
Read in the Saildrone USV file either from a local disc or using OpenDAP.

There are 6 NaN values in the lat/lon data arrays, interpolate across these

We want to collocate with wind vectors for this example,  but the wind vectors are only every 10 minutes rather than every minute, so use .dropna to remove all values in the dataset from all dataarrays when wind vectors aren't availalbe

In [50]:
filename_collocation_data = 'F:/data/cruise_data/saildrone/baja-2018/ccmp_collocation_data.nc'
#filename_usv = 'https://podaac-opendap.jpl.nasa.gov/opendap/hyrax/allData/insitu/L2/saildrone/Baja/saildrone-gen_4-baja_2018-sd1002-20180411T180000-20180611T055959-1_minutes-v1.nc'
filename_usv='f:/data/cruise_data/saildrone/baja-2018/saildrone-gen_4-baja_2018-sd1002-20180411T180000-20180611T055959-1_minutes-v1.nc'
ds_usv = xr.open_dataset(filename_usv)
ds_usv.close()
ds_usv = ds_usv.isel(trajectory=0).swap_dims({'obs':'time'}).rename({'longitude':'lon','latitude':'lat'})
ds_usv = ds_usv.sel(time=slice('2018-04-12T02','2018-06-10T18')) #get rid of last part and first part where USV being towed
ds_usv['lon'] = ds_usv.lon.interpolate_na(dim='time',method='linear') #there are 6 nan values
ds_usv['lat'] = ds_usv.lat.interpolate_na(dim='time',method='linear')
ds_usv['wind_speed']=np.sqrt(ds_usv.UWND_MEAN**2+ds_usv.VWND_MEAN**2)
ds_usv['wind_dir']=np.arctan2(ds_usv.VWND_MEAN,ds_usv.UWND_MEAN)*180/np.pi
ds_usv_subset = ds_usv.copy(deep=True)
ds_usv_subset = ds_usv_subset.where(np.logical_not((ds_usv.time.dt.hour>12)&(ds_usv.wind_speed<7.5)))
ds_usv_subset = ds_usv_subset.where(np.logical_not((ds_usv.time.dt.hour<6)&(ds_usv.wind_speed<7.5)))
ds_usv_subset = ds_usv_subset.where(np.logical_not((ds_usv.time>np.datetime64('2018-05-24T12')) & (ds_usv.time<np.datetime64('2018-05-26T12'))))
#ds_usv_subset = ds_usv.dropna(dim='time',subset={'UWND_MEAN'})   #get rid of all the nan
#print(ds_usv_subset.UWND_MEAN[2000:2010].values)

In order to use open_mfdataset you need to either provide a path or a list of filenames to input

Here we use the USV cruise start and end date to read in all data for that period

In [51]:
read_date,end_date = ds_usv_subset.time.min(),ds_usv_subset.time.max()
filelist = []
while read_date<=(end_date+np.timedelta64(1,'D')):
    tem_filename,exists = get_sat_filename(read_date)
    if exists:
        filelist.append(tem_filename)
    read_date=read_date+np.timedelta64(1,'D')
print(filelist[0])

F:/data/sst/jpl_mur/v4.1/2018/102/20180412090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc


# Read in MUR data
Read in data using open_mfdataset with the option coords='minimal'

The dataset is printed out and you can see that rather than straight xarray data array for each of the data variables open_mfdataset using dask arrays

In [52]:
ds_sat = xr.open_mfdataset(filelist,coords='minimal')
ds_sat

<xarray.Dataset>
Dimensions:           (lat: 17999, lon: 36000, time: 61)
Coordinates:
  * lat               (lat) float32 -89.99 -89.98 -89.97 ... 89.97 89.98 89.99
  * lon               (lon) float32 -179.99 -179.98 -179.97 ... 179.99 180.0
  * time              (time) datetime64[ns] 2018-04-12T09:00:00 ... 2018-06-11T09:00:00
Data variables:
    analysed_sst      (time, lat, lon) float32 dask.array<shape=(61, 17999, 36000), chunksize=(1, 17999, 36000)>
    analysis_error    (time, lat, lon) float32 dask.array<shape=(61, 17999, 36000), chunksize=(1, 17999, 36000)>
    mask              (time, lat, lon) float32 dask.array<shape=(61, 17999, 36000), chunksize=(1, 17999, 36000)>
    sea_ice_fraction  (time, lat, lon) float32 dask.array<shape=(61, 17999, 36000), chunksize=(1, 17999, 36000)>
    dt_1km_data       (time, lat, lon) timedelta64[ns] dask.array<shape=(61, 17999, 36000), chunksize=(1, 17999, 36000)>
Attributes:
    Conventions:                CF-1.5
    title:                   

# Xarray interpolation won't run on chunked dimensions.  
1. First let's subset the data to make it smaller to deal with by using the cruise lat/lons

1. Now load the data into memory (de-Dask-ify) it  


In [None]:
#Step 1 from above
print('min max lat lon:', ds_usv_subset.lon.min().data,ds_usv_subset.lon.max().data,ds_usv_subset.lat.min().data,ds_usv_subset.lat.max().data)
subset = ds_sat.sel(lon=slice(ds_usv_subset.lon.min().data,ds_usv_subset.lon.max().data),
                    lat=slice(ds_usv_subset.lat.min().data,ds_usv_subset.lat.max().data))
#Step 2 from above
subset.load()

min max lat lon: -125.55297279999999 -115.5226624 28.0176832 37.6797408


# Collocate USV data with MUR data
There are different options when you interpolate.  First, let's just do a linear interpolation

In [None]:

ds_collocated = subset.interp(lat=ds_usv_subset.lat,lon=ds_usv_subset.lon,time=ds_usv_subset.time,method='linear')


# Collocate USV data with MUR data
There are different options when you interpolate.  First, let's just do a nearest point rather than interpolate the data

In [None]:
ds_collocated_nearest = subset.interp(lat=ds_usv_subset.lat,lon=ds_usv_subset.lon,time=ds_usv_subset.time,method='nearest')


# A larger STD that isn't reflective of uncertainty in the observation
The collocation above will result in multiple USV data points matched with a single satellite
observation.    The USV is sampling every 1 min and approximately few meters, while the satellite
is an average over a footprint that is interpolated onto a daily mean map.  While calculating the mean would results in a valid mean, the STD would be higher and consist of a component that reflects the uncertainty of the USV and the satellite and a component that reflects the natural variability in the region that is sampled by the USV

Below we use the 'nearest' collocation results to identify when multiple USV data are collcated to
a single satellite observation.
This code goes through the data and creates averages of the USV data that match the single CCMP collocated value.


In [None]:
ilen,index = ds_collocated_nearest.dims['time'],0
ds_tem = ds_collocated_nearest.copy(deep=True)
duu, duv1, duv2, dlat, dlon, dut = [],[],[],[],[],np.empty((),dtype='datetime64')
while index <= ilen-2:
    index += 1
    if np.isnan(ds_collocated_nearest.analysed_sst[index]):
        continue
    if np.isnan(ds_tem.analysed_sst[index]):
        continue
   # print(index, ilen)
    iend = index + 1000
    if iend > ilen-1:
        iend = ilen-1
    ds_tem_subset = ds_tem.analysed_sst[index:iend]
    ds_usv_subset2sst = ds_usv_subset.TEMP_CTD_MEAN[index:iend]
    ds_usv_subset2uwnd = ds_usv_subset.UWND_MEAN[index:iend]
    ds_usv_subset2vwnd = ds_usv_subset.VWND_MEAN[index:iend]
    ds_usv_subset2lat = ds_usv_subset.lat[index:iend]
    ds_usv_subset2lon = ds_usv_subset.lon[index:iend]
    ds_usv_subset2time = ds_usv_subset.time[index:iend]
    cond = ((ds_tem_subset==ds_collocated_nearest.analysed_sst[index]))
    notcond = np.logical_not(cond)
    #cond = ((ds_tem.analysed_sst==ds_collocated_nearest.analysed_sst[index]))
    #notcond = np.logical_not(cond)
    masked = ds_tem_subset.where(cond)
    if masked.sum().data==0:  #don't do if data not found
        continue
    masked_usvsst = ds_usv_subset2sst.where(cond,drop=True)
    masked_usvuwnd = ds_usv_subset2uwnd.where(cond,drop=True)
    masked_usvvwnd = ds_usv_subset2vwnd.where(cond,drop=True)
    masked_usvlat = ds_usv_subset2lat.where(cond,drop=True)
    masked_usvlon = ds_usv_subset2lon.where(cond,drop=True)
    masked_usvtime = ds_usv_subset2time.where(cond,drop=True)
    duu=np.append(duu,masked_usvsst.mean().data)
    duv1=np.append(duv1,masked_usvuwnd.mean().data)
    duv2=np.append(duv2,masked_usvvwnd.mean().data)
    dlat=np.append(dlat,masked_usvlat.mean().data)
    dlon=np.append(dlon,masked_usvlon.mean().data)
    tdif = masked_usvtime[-1].data-masked_usvtime[0].data
    mtime=masked_usvtime[0].data+np.timedelta64(tdif/2,'ns')
    if mtime>dut.max():
        print(index,dut.shape[0],masked_usvtime[0].data,masked_usvtime[-1].data-masked_usvtime[0].data)
    dut=np.append(dut,mtime)
    ds_tem.analysed_sst[index:iend]=ds_tem.analysed_sst.where(notcond)
#    ds_tem=ds_tem.where(notcond,np.nan)  #masked used values by setting to nan
dut2 = dut[1:]  #remove first data point which is a repeat from what array defined    
ds_new=xr.Dataset(data_vars={'sst_usv': ('time',duu),'uwnd_usv': ('time',duv1),'vwnd_usv': ('time',duv2),
                             'lon': ('time',dlon),
                             'lat': ('time',dlat)},
                  coords={'time':dut2})
ds_new.to_netcdf('F:/data/cruise_data/saildrone/baja-2018/mur_downsampled_usv_data2.nc')

# redo the collocation
Now, redo the collocation, using 'linear' interpolation using the averaged data.  This will interpolate the data temporally onto the USV sampling which has been averaged to the satellite data grid points

2018-06-11T00:02:00.000000000


In [11]:
ds_collocated_averaged = subset.interp(lat=ds_new.lat,lon=ds_new.lon,time=ds_new.time,method='linear')
ds_collocated_averaged

<xarray.Dataset>
Dimensions:           (time: 6075)
Coordinates:
    lat               (time) float64 37.81 37.81 37.81 ... 37.78 37.77 37.77
    lon               (time) float64 -122.5 -122.5 -122.5 ... -122.3 -122.3
  * time              (time) datetime64[ns] 2018-04-11T18:53:30 ... 2018-06-11T00:02:00
Data variables:
    analysed_sst      (time) float64 286.3 286.3 286.3 286.3 ... nan nan nan
    analysis_error    (time) float64 0.3899 0.3884 0.3865 0.3858 ... nan nan nan
    mask              (time) float64 1.0 1.0 1.0 1.0 ... 1.0 1.009 1.464 1.447
    sea_ice_fraction  (time) float64 nan nan nan nan nan ... nan nan nan nan nan
Attributes:
    Conventions:                CF-1.5
    title:                      Daily MUR SST, Final product
    summary:                    A merged, multi-sensor L4 Foundation SST anal...
    references:                 http://podaac.jpl.nasa.gov/Multi-scale_Ultra-...
    institution:                Jet Propulsion Laboratory
    history:                

In [13]:
ds_collocated_averaged.to_netcdf('F:/data/cruise_data/saildrone/baja-2018/mur_downsampled_collocated_usv_data2.nc')
ds_new.to_netcdf('F:/data/cruise_data/saildrone/baja-2018/mur_downsampled_usv_data2.nc')


In [17]:
print(ds_collocated_averaged.time[-200:])

<xarray.DataArray 'time' (time: 200)>
array(['2018-06-10T01:32:00.000000000', '2018-06-10T01:27:00.000000000',
       '2018-06-10T01:31:30.000000000', '2018-06-10T01:36:30.000000000',
       '2018-06-10T01:42:30.000000000', '2018-06-10T01:53:00.000000000',
       '2018-06-10T02:07:00.000000000', '2018-06-10T03:03:00.000000000',
       '2018-06-10T02:29:30.000000000', '2018-06-10T02:36:00.000000000',
       '2018-06-10T03:38:00.000000000', '2018-06-10T02:57:00.000000000',
       '2018-06-10T02:52:00.000000000', '2018-06-10T03:38:00.000000000',
       '2018-06-10T03:25:30.000000000', '2018-06-10T03:16:30.000000000',
       '2018-06-10T03:21:30.000000000', '2018-06-10T03:25:00.000000000',
       '2018-06-10T04:05:30.000000000', '2018-06-10T03:48:00.000000000',
       '2018-06-10T03:53:00.000000000', '2018-06-10T03:57:30.000000000',
       '2018-06-10T04:13:00.000000000', '2018-06-10T04:08:00.000000000',
       '2018-06-10T04:12:00.000000000', '2018-06-10T04:52:00.000000000',
       '2018-

In [None]:
ds_collocated_averaged = xr.open_dataset('F:/data/cruise_data/saildrone/baja-2018/mur_downsampled_collocated_usv_data2.nc')
ds_collocated_averaged.close()
ds_new = xr.open_dataset('F:/data/cruise_data/saildrone/baja-2018/mur_downsampled_usv_data2.nc')
ds_new.close()

There are different ways to select data using an Xarray dataset.
- The easiest ways are to use .isel or .sel
- .isel selects by integer
- .sel selects by label

A note of caution, if you are using .isel it is better to rename your data variable.  If you run the block of code that selects the data more than once, using .sel it would still have the same result, while using .isel would apply the selection again

In [18]:
ds_collocated_averaged_subset = ds_collocated_averaged.isel(time=slice(0,-19))
ds_new_subset = ds_new.isel(time=slice(0,-19))


In [19]:
ds_collocated_averaged_subset = ds_collocated_averaged.sel(time=slice('2018-04-11T18:53:30','2018-06-10T16:35:30'))
ds_new_subset = ds_new.sel(time=slice('2018-04-11T18:53:30','2018-06-10T16:35:30'))


KeyError: "cannot represent labeled-based slice indexer for dimension 'time' with a slice over integer positions; the index is unsorted or non-unique"

In [None]:

#sat_sst = ds_collocated_averaged.analysed_sst[:-19]-273.15
#usv_sst = ds_new.sst_usv[:-19]


ds_new['spd']=np.sqrt(ds_new.uwnd_usv**2+ds_new.vwnd_usv**2)
usv_spd = ds_new.spd[:-19]
dif_sst = (sat_sst - usv_sst).dropna(dim='time')
std_robust = np.nanmedian(dif_sst) * 1.482602218505602
print('mean,std,rstd, dif ',[dif_sst.mean().data,dif_sst.std().data,std_robust,dif_sst.shape[0]])
plt.plot(usv_spd,dif_sst,'.')
plt.xlabel('USV wind speed (ms$^{-1}$)')
plt.ylabel('USV - Sat SST (K)')
sat_sst = ds_collocated_averaged.analysed_sst[:-19]-273.15
usv_sst = ds_new.sst_usv[:-19]
dif_sst = sat_sst - usv_sst
cond = usv_spd>2
dif_sst = dif_sst.where(cond).dropna('time')
std_robust = np.nanmedian(dif_sst) * 1.482602218505602
print('no low wind mean,std,rstd, dif ',[dif_sst.mean().data,dif_sst.std().data,std_robust,sum(cond).data])

In [None]:
plt.plot(usv_sst,dif_sst,'.')
plt.xlabel('USV  SST (K)')
plt.ylabel('USV - Sat SST (K)')

In [None]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines()
ax.plt()


In [None]:
fig, ax = plt.subplots(figsize=(5,4))
ax.plot(sat_sst,sat_sst-usv_sst,'.')
ax.set_xlabel('USV wind speed (ms$^{-1}$)')
ax.set_ylabel('USV - Sat wind direction (deg)')
fig_fname='F:/data/cruise_data/saildrone/baja-2018/figs/sat_sst_both_bias.png'
fig.savefig(fig_fname, transparent=False, format='png')


In [None]:
plt.plot(dif_sst[:-19],'.')

In [None]:
subset.analysed_sst[0,:,:].plot()

In [None]:
subset.analysed_sst.mean({'lat','lon'}).plot()

In [None]:
subset.analysed_sst.mean({'time'}).plot()

In [None]:
subset.analysed_sst.std({'time'}).plot()