In [1]:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
####################you will need to change some paths here!#####################
#list of input files
filename_cpr='f:/data/NASA_biophysical/CPR_data/All CPR Sample catalogue.xlsx'
filename_northpac_eddies='F:/data/NASA_biophysical/aviso/eddy_trajectory_19930101_20170106_north_pacific.nc'
filename_cpr_eddy='F:/data/NASA_biophysical/collocated_data/eddy_cpr_data_north_pacific.nc'
filename_eddy='F:/data/NASA_biophysical/collocated_data/eddy_ranking_data_north_pacific.nc'
#output files
filename_cpr_expanded='f:/data/NASA_biophysical/collocated_data/All CPR Sample catalogue with eddy info2.xlsx'
filename_cpr_expanded_netcdf='f:/data/NASA_biophysical/collocated_data/All CPR Sample catalogue with eddy info2.nc'
#################################################################################

# Define function to read in data and put in a dictionary

In [2]:
#define function to get all the data at once, use same years for climatology for all data
def get_data():
    
    #climatology years
    cyr1,cyr2='1993-01-01','2018-12-31'
    
    # CCMP test
    dir_pattern_zarr = 'F:/data/sat_data/ccmp/zarr/'
    ds= xr.open_zarr(dir_pattern_zarr)
    ds = ds.rename({'latitude':'lat','longitude':'lon'})
    ds.coords['lon'] = (ds.coords['lon'] + 180) % 360 - 180
    ds_ccmp = ds.sortby(ds.lon)
    ds_ccmp = ds_ccmp.drop('nobs')
    for var in ds_ccmp:
        tem = ds_ccmp[var].attrs
        tem['var_name']='ccmp_'+str(var)
        ds_ccmp[var].attrs=tem
    ds_ccmp_clim = ds_ccmp.sel(time=slice(cyr1,cyr2))
    ds_ccmp_clim = ds_ccmp_clim.groupby('time.dayofyear').mean('time',keep_attrs=True,skipna=False)
    
    # AVISO test
    dir_pattern_zarr = 'F:/data/sat_data/aviso/zarr/'
    ds= xr.open_zarr(dir_pattern_zarr)
    ds = ds.rename({'latitude':'lat','longitude':'lon'})
    ds.coords['lon'] = (ds.coords['lon'] + 180) % 360 - 180
    ds_aviso = ds.sortby(ds.lon).drop({'lat_bnds','lon_bnds','crs','err'})
    for var in ds_aviso:
        tem = ds_aviso[var].attrs
        tem['var_name']='aviso_'+str(var)
        ds_aviso[var].attrs=tem
    ds_aviso_clim = ds_aviso.sel(time=slice(cyr1,cyr2))
    ds_aviso_clim = ds_aviso_clim.groupby('time.dayofyear').mean('time',keep_attrs=True,skipna=False)    

    #sst
    dir_pattern_zarr = 'F:/data/sst/cmc/zarr/'
    ds_sst= xr.open_zarr(dir_pattern_zarr)
    ds_sst = ds_sst.drop({'analysis_error','mask','sea_ice_fraction'})
    tem = ds_sst.analysed_sst.attrs
    tem['var_name']='cmc_sst'
    ds_sst.analysed_sst.attrs=tem
    ds_sst_clim = ds_sst.sel(time=slice(cyr1,cyr2))
    ds_sst_clim = ds_sst_clim.groupby('time.dayofyear').mean('time',keep_attrs=True,skipna=False)
    
    #get bathymetry from ETOPO1
    fname_topo = 'F:/data/topo/ETOPO1_Ice_g_gmt4.grd'
    ds = xr.open_dataset(fname_topo)
#    x = ds.x  #21601
#    y = ds.y   #10801
#    topo = ds.z  #(10801, 21601)
    ds_topo = ds.rename_dims({'x':'lon','y':'lat'}).rename({'x':'lon','y':'lat'})
    tem = ds_topo.z.attrs
    tem['var_name']='etopo_depth'
    ds_topo.z.attrs=tem
#    ds_topo

    #put data into a dictionary
    data_dict={'aviso':ds_aviso,
               'wnd':ds_ccmp,
               'sst':ds_sst,
              'topo':ds_topo}
    clim_dict={'aviso_clim':ds_aviso_clim,
               'wnd_clim':ds_ccmp_clim,
               'sst_clim':ds_sst_clim}
  
    return data_dict,clim_dict

def get_eddy():
    filename='F:/data/NASA_biophysical//collocated_data/All CPR Sample catalogue with eddy info4.nc'
    ds_eddy = xr.open_dataset(filename)
    tt=np.empty(ds_eddy.z.size,dtype='datetime64[ns]') 
    for i in range(ds_eddy.z.size):
        tstr=str(ds_eddy.cpr_sample_year[i].data)+'-'+str(ds_eddy.cpr_sample_month[i].data).zfill(2)+'-'+str(ds_eddy.cpr_sample_day[i].data).zfill(2)
        tem=np.datetime64(tstr)
        tt[i]=tem
    ds_eddy['cpr_sample_time']=xr.DataArray(tt,dims=['z'])
    return ds_eddy

def get_all_eddy():
    filename_aviso='f:/data/NASA_biophysical/aviso/eddy_trajectory_19930101_20170106.nc'   #From AVISO  website
    ds = xr.open_dataset(filename_aviso)
    ds['longitude'] = (ds['longitude'] + 180) % 360 - 180
    ds_eddy = ds
#    tt=np.empty(ds_eddy.obs.size,dtype='datetime64[ns]') 
#    for i in range(ds_eddy.obs.size):
#        tstr=str(ds_eddy.time[i].dt.year.data)+'-'+str(ds_eddy.time[i].dt.month.data).zfill(2)+'-'+str(ds_eddy.time[i].dt.day.data).zfill(2)
#        tem=np.datetime64(tstr)
#        tt[i]=tem
#    ds_eddy['cpr_sample_time']=xr.DataArray(tt,dims=['obs'])
    return ds_eddy


In [None]:
data,clim = get_data()

In [None]:
ds_eddy = get_eddy()


In [None]:
#filename_aviso='f:/data/NASA_biophysical/aviso/eddy_trajectory_19930101_20170106.nc'   #From AVISO  website
#ds_eddy = xr.open_dataset(filename_aviso)
#ds_eddy
#ds_all = get_all_eddy()

# Collocate all data with eddy

In [None]:
for name in data:
    ds_data=data[name]
    if name=='topo':
        continue
    print('name',name)
    for var in ds_data:
        var_tem=ds_data[var].attrs['var_name']
        ds_eddy[var_tem]=ds_eddy.cpr_sample_ccmp_uwnd.copy(deep=True)*np.NaN
        ds_eddy[var_tem].attrs=ds_data[var].attrs
    print('var',var_tem)
    for i in range(2): #ds_eddy.z.size):
        lat1,lat2=ds_eddy.cpr_sample_lat[i].data-1,ds_eddy.cpr_sample_lat[i].data+1
        lon1,lon2=ds_eddy.cpr_sample_lon[i].data-1,ds_eddy.cpr_sample_lon[i].data+1
        #interp in time and select region around lat/lon to subset before loading data
        tem = ds_data.interp(time=ds_eddy.cpr_sample_time[i].data).sel(lat=slice(lat1,lat2),lon=slice(lon1,lon2)).load()
        tem = tem.interp(lat=ds_eddy.cpr_sample_lat[i].data,lon=ds_eddy.cpr_sample_lon[i].data)
        for var in ds_data:
            var_tem=ds_data[var].attrs['var_name']
            ds_eddy[var_tem]=tem[var]
for name in clim:
    ds_data=clim[name]
    print('name',name)
    for var in ds_data:
        var_tem=ds_data[var].attrs['var_name']+'_clim'
        ds_eddy[var_tem]=ds_eddy.cpr_sample_ccmp_uwnd.copy(deep=True)*np.NaN
        ds_eddy[var_tem].attrs=ds_data[var].attrs
    print('var',var_tem)
    for i in range(2): #ds_eddy.z.size):
        lat1,lat2=ds_eddy.cpr_sample_lat[i].data-1,ds_eddy.cpr_sample_lat[i].data+1
        lon1,lon2=ds_eddy.cpr_sample_lon[i].data-1,ds_eddy.cpr_sample_lon[i].data+1
        #interp in time and select region around lat/lon to subset before loading data
        tem = ds_data.sel(dayofyear=ds_eddy.cpr_sample_time[i].dt.dayofyear.data).sel(lat=slice(lat1,lat2),lon=slice(lon1,lon2)).load()
        tem = tem.interp(lat=ds_eddy.cpr_sample_lat[i].data,lon=ds_eddy.cpr_sample_lon[i].data)
        for var in ds_data:
            var_tem=ds_data[var].attrs['var_name']+'_clim'
            ds_eddy[var_tem]=tem[var]

ds_topo=data['topo']
ds_eddy['ETOPO_depth']=ds_topo.z.interp(lat=ds_eddy.cpr_sample_lat,lon=ds_eddy.cpr_sample_lon,method='nearest')       

# output data

In [None]:
filename_out='F:/data/NASA_biophysical//collocated_data/All CPR Sample catalogue with eddy info_version2020_04_21.nc'
ds_eddy.to_netcdf(filename_out)

# NOW, the point of this is to look up collocated eddy information and get the history of the data.  Steps are:
1. Read in list of collocated eddies.
2. Create list of unique eddy ID
3. Read in full eddy database and select eddy id
4. collocate environmental data for entire eddy history
5. save file

In [3]:
ds_all = get_all_eddy()

In [4]:
ds_eddy = get_eddy()

In [5]:
#drop all data where eddy radius < distance to eddy
#find unique id & create a list
subset = ds_eddy.where(ds_eddy.cpr_eddy_data_radius-ds_eddy.cpr_eddy_data_distance>0,drop=True)
_, index = np.unique(subset['cpr_eddy_data_track_id'], return_index=True)
eddy_list = subset['cpr_eddy_data_track_id'][index]
print(eddy_list[0])
#(subset.cpr_eddy_data_radius-subset.cpr_eddy_data_distance).plot()

<xarray.DataArray 'cpr_eddy_data_track_id' ()>
array(81964.)


In [6]:
data,clim = get_data()

In [7]:
eddy_list.size

521

In [58]:
#for ieddy in eddy_list:
for ieddy in range(eddy_list.size):
    if ieddy<8:
        continue
    subset = ds_all.where(ds_all.track==eddy_list[ieddy],drop=True)
    tt=np.empty(subset.obs.size,dtype='datetime64[ns]') 
    for i in range(subset.obs.size):
        tstr=str(subset.time[i].dt.year.data)+'-'+str(subset.time[i].dt.month.data).zfill(2)+'-'+str(subset.time[i].dt.day.data).zfill(2)
        tem=np.datetime64(tstr)
        tt[i]=tem
    subset['time']=xr.DataArray(tt,dims=['obs'])  
    for name in data:
        ds_data=data[name]
        if name=='topo':
            continue
        print('name',name)
        for var in ds_data:
            var_tem=ds_data[var].attrs['var_name']
            subset[var_tem]=subset.latitude.copy(deep=True)*np.NaN
            subset[var_tem].attrs=ds_data[var].attrs
        print('var',var_tem)
        for i in range(subset.latitude.size):
            lat1,lat2=subset.latitude[i].data-1,subset.latitude[i].data+1
            lon1,lon2=subset.longitude[i].data-1,subset.longitude[i].data+1
            #interp in time and select region around lat/lon to subset before loading data
            #interp doesn't work on chunked dims so rechunk
            ds_data2 = ds_data.chunk({'time':ds_data.time.size,'lat':ds_data[var].chunks[1],'lon':ds_data[var].chunks[2]})
            #ds_data2.interp(time=subset.time[i].data)
            tem = ds_data2.interp(time=subset.time[i].data).sel(lat=slice(lat1,lat2),lon=slice(lon1,lon2)).load()
            tem = tem.interp(lat=subset.latitude[i].data,lon=subset.longitude[i].data)
            for var in ds_data:
                var_tem=ds_data[var].attrs['var_name']
                subset[var_tem][i]=tem[var]
    for name in clim:
        ds_data=clim[name]
        print('name',name)
        for var in ds_data:
            var_tem=ds_data[var].attrs['var_name']+'_clim'
            subset[var_tem]=subset.latitude.copy(deep=True)*np.NaN
            subset[var_tem].attrs=ds_data[var].attrs
        print('var',var_tem)
        for i in range(subset.latitude.size):
            lat1,lat2=subset.latitude[i].data-1,subset.latitude[i].data+1
            lon1,lon2=subset.longitude[i].data-1,subset.longitude[i].data+1
            #interp in time and select region around lat/lon to subset before loading data
            ds_data2 = ds_data.chunk({'time':ds_data.time.size,'lat':ds_data[var].chunks[1],'lon':ds_data[var].chunks[2]})
            #ds_data2.interp(time=subset.time[i].data)
            tem = ds_data2.sel(dayofyear=subset.time[i].dt.dayofyear.data).sel(lat=slice(lat1,lat2),lon=slice(lon1,lon2)).load()
            tem = tem.interp(lat=subset.latitude[i].data,lon=subset.longitude[i].data)
            for var in ds_data:
                var_tem=ds_data[var].attrs['var_name']+'_clim'
                subset[var_tem][i]=tem[var]
    ds_topo=data['topo']
    subset['ETOPO_depth']=ds_topo.z.interp(lat=subset.latitude,lon=subset.longitude,method='nearest')   
    filename_out='F:/data/NASA_biophysical//collocated_data/eddy_collocated_data'+str(ieddy).zfill(8)+'.nc'
    subset.to_netcdf(filename_out)    

name aviso
var aviso_vgosa
name wnd
var ccmp_vwnd
name sst
var cmc_sst
name aviso_clim
var aviso_vgosa_clim


AttributeError: 'Dataset' object has no attribute 'time'

In [56]:
            ds_data2 = ds_data.chunk({'time':ds_data.time.size,'lat':ds_data[var].chunks[1],'lon':ds_data[var].chunks[2]})


In [57]:
ds_data[var].chunks

((1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 508),
 (180, 180, 180, 180),
 (180, 180, 180, 180, 180, 180, 180, 180))

In [49]:
ds_data[var]

In [None]:
fig, (ax1) = plt.subplots(nrows=1, figsize=(6, 5.4))
im = ax1.imshow(ds_topo.z[7000:9500,0:4500], interpolation='bilinear',vmin=-7000.0, vmax=1.0,aspect='auto',origin='lower')
plt.show()