In [1]:
import os
from glob import glob

import dask.bag as db
import dask.dataframe as dd
import xarray as xr
import numpy as np
import pickle

from matplotlib import pyplot as plt
%matplotlib inline
import cartopy.crs as ccrs
import cartopy.feature as cfeature

#from mitequinox.utils import *

# Root of the drifter data tree. The location differs between machines
# (e.g. '/home1/datawork/xyu/Drifters/'), so it can be overridden with the
# DRIFTERS_ROOT environment variable; the default is the original local path.
root_dir = os.environ.get('DRIFTERS_ROOT', '/Users/aponte/data/drifters/')
# The empty last component keeps a trailing separator: later cells build file
# names by plain string concatenation onto data_dir.
data_dir = os.path.join(root_dir, 'drifter_data_v1.02', '')

In [2]:
# get dask handles and check dask server status
from dask.distributed import Client
# Alternative kept for reference: attach to an existing `cluster` object.
#client = Client(cluster)
# Called with no arguments; the summary displayed below reports a scheduler
# on 127.0.0.1 with 4 workers.
client = Client()

In [3]:
# Display the client summary (scheduler address, dashboard URL, workers).
client

0,1
Client  Scheduler: tcp://127.0.0.1:51819  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 17.18 GB


In [21]:
# Shut down the dask client.
# NOTE(review): this cell carries execution count 21 yet sits before every
# cell that calls .compute(); in a top-to-bottom run the client would be
# closed before any of that work starts. It presumably belongs at the end of
# the notebook - confirm and move.
client.close()

---

## rewrite hourly data

In [6]:
def load_trajectory(ij, ncfile):
    """Open ``ncfile`` and return the part of it selected by ``ij`` along TIME.

    Parameters
    ----------
    ij : sequence of two ints
        ``ij[0]`` is the first TIME index kept; the selection stops before
        index ``ij[1] - 1``.
    ncfile : str
        Path handed to ``xr.open_dataset``.

    Returns
    -------
    The opened dataset restricted with ``isel(TIME=slice(ij[0], ij[1] - 1))``.
    """
    dataset = xr.open_dataset(ncfile)
    first, stop = ij[0], ij[1] - 1
    # NOTE(review): the caller passes the index of a NaN entry as ij[1], so
    # the exclusive stop ij[1]-1 also leaves out the sample just before that
    # NaN - confirm this is intended and not an off-by-one.
    window = slice(first, stop)
    return dataset.isel(TIME=window)

def store(d):
    """Pickle one trajectory to ``data/drifter_<ID>.p``.

    The per-sample ``ID`` variable is collapsed to its first value, which is
    also used (truncated to an integer) to build the file name. ``d`` is
    modified in place by that assignment.

    Parameters
    ----------
    d : dataset-like
        Object exposing ``d.ID.values`` (indexable, non-empty) and supporting
        ``d['ID'] = value``.

    Returns
    -------
    None
    """
    drifter_id = d.ID.values[0]
    d['ID'] = drifter_id
    #file = 'data/drifter_%d.nc' %int(d.ID.values)
    #d.to_netcdf(file)
    file = 'data/drifter_%d.p' % int(drifter_id)
    # Do not rely on the output directory having been created by hand.
    os.makedirs(os.path.dirname(file), exist_ok=True)
    # Context manager so the handle is flushed and closed for every
    # trajectory; the bare open() used previously was never closed.
    with open(file, 'wb') as fh:
        pickle.dump(d, fh, protocol=-1)

In [41]:
# First block of the hourly drifter files, used here to inspect the layout.
ncfile = data_dir+'driftertrajWMLE_1.02_block1.nc'

# Open the file and print the dataset summary followed by its `note` attribute.
ds = xr.open_dataset(ncfile)
print(ds)
print(ds.note)

<xarray.Dataset>
Dimensions:  (TIME: 15230718)
Coordinates:
  * TIME     (TIME) datetime64[ns] 2005-04-15T20:00:00 ... NaT
Data variables:
    ID       (TIME) float64 ...
    LAT      (TIME) float64 ...
    LON      (TIME) float64 ...
    U        (TIME) float64 ...
    V        (TIME) float64 ...
    LAT_ERR  (TIME) float64 ...
    LON_ERR  (TIME) float64 ...
    U_ERR    (TIME) float64 ...
    V_ERR    (TIME) float64 ...
    GAP      (TIME) timedelta64[ns] ...
    RMSGAP   (TIME) timedelta64[ns] ...
    DROGUE   (TIME) float64 ...
Attributes:
    title:        Hourly Argos-tracked drifters location and velocity estimat...
    description:  This is version 1.02, block 1 of the dataset. See http://ww...
    note:         For all variables of dimension TIME, interruptions in the e...
    creator:      Shane Elipot and Rick Lumpkin
    timestamp:    19-Dec-2018 13:25:49
For all variables of dimension TIME, interruptions in the estimation along a single trajectory are indicated by "Inf" v

In [42]:
# Positions along TIME where LAT is NaN; the following cell uses them as the
# boundaries between consecutive trajectories.
inan = np.nonzero(np.isnan(ds.LAT.values))[0]

In [44]:
# Each trajectory runs from one past the previous NaN (or from index 0 for the
# first one, hence the -1 sentinel) up to the next NaN position.
previous_nan = np.hstack([-1, inan])[:-1]
bounds = [(before + 1, nan_at) for before, nan_at in zip(previous_nan, inan)]
# One bag element per (start, nan index) pair, loaded lazily from ncfile.
b = db.from_sequence(bounds).map(load_trajectory, ncfile)
b.take(1)

(<xarray.Dataset>
 Dimensions:  (TIME: 5137)
 Coordinates:
   * TIME     (TIME) datetime64[ns] 2005-04-15T20:00:00 ... 2005-11-15T20:00:00
 Data variables:
     ID       (TIME) float64 ...
     LAT      (TIME) float64 ...
     LON      (TIME) float64 ...
     U        (TIME) float64 ...
     V        (TIME) float64 ...
     LAT_ERR  (TIME) float64 ...
     LON_ERR  (TIME) float64 ...
     U_ERR    (TIME) float64 ...
     V_ERR    (TIME) float64 ...
     GAP      (TIME) timedelta64[ns] ...
     RMSGAP   (TIME) timedelta64[ns] ...
     DROGUE   (TIME) float64 ...
 Attributes:
     title:        Hourly Argos-tracked drifters location and velocity estimat...
     description:  This is version 1.02, block 1 of the dataset. See http://ww...
     note:         For all variables of dimension TIME, interruptions in the e...
     creator:      Shane Elipot and Rick Lumpkin
     timestamp:    19-Dec-2018 13:25:49,)

In [None]:
# Smaller trial run kept for reference: store only the first 10 trajectories.
#b.take(10,compute=False).map(store).compute()
# Write every trajectory of the bag to disk through store().
b.map(store).compute()

In [8]:
# Split each of the 7 block files into per-trajectory pickles.
for ifile in range(1,8):
    ncfile = data_dir+'driftertrajWMLE_1.02_block%d.nc' %ifile
    # Only the NaN positions are needed from this handle: read LAT eagerly
    # and close the file right away instead of leaking one open dataset per
    # block (load_trajectory opens the file again on its own).
    with xr.open_dataset(ncfile) as ds:
        inan = np.where(np.isnan(ds.LAT.values))[0]
    # (start, nan index) pairs: start is one past the previous NaN, with a -1
    # sentinel so that the first trajectory starts at index 0.
    b = db.from_sequence([(i+1,j) for i,j in zip(np.hstack([-1,inan])[:-1],inan)]).map(load_trajectory, ncfile)
    b.map(store).compute()
    # progress marker: one line per processed block
    print(ncfile)

/Users/aponte/data/drifters/drifter_data_v1.02/driftertrajWMLE_1.02_block1.nc
/Users/aponte/data/drifters/drifter_data_v1.02/driftertrajWMLE_1.02_block2.nc
/Users/aponte/data/drifters/drifter_data_v1.02/driftertrajWMLE_1.02_block3.nc
/Users/aponte/data/drifters/drifter_data_v1.02/driftertrajWMLE_1.02_block4.nc
/Users/aponte/data/drifters/drifter_data_v1.02/driftertrajWMLE_1.02_block5.nc
/Users/aponte/data/drifters/drifter_data_v1.02/driftertrajWMLE_1.02_block6.nc
/Users/aponte/data/drifters/drifter_data_v1.02/driftertrajWMLE_1.02_block7.nc


---

## reload as dask bag

In [9]:
# Earlier netCDF variant of the listing, kept for reference.
#files = glob('data/*.nc')
# Collect the per-trajectory pickle files and report how many were found.
# NOTE(review): glob does not promise any particular ordering - the order of
# `files` (and of everything derived from it) may differ between runs/machines.
files = glob('data/*.p')
print(len(files))

12287


In [10]:
def load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards.

    Only meant for the files written by this notebook: unpickling runs
    arbitrary code, so never point it at untrusted input.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)

#b = db.from_sequence(files).map(lambda f: xr.open_dataset(f))
# One bag element per stored trajectory. A named loader replaces the former
# `pickle.load(open(f, 'rb'))` lambda, which never closed its file handle.
b = db.from_sequence(files).map(load_pickle)
b.take(1)

(<xarray.Dataset>
 Dimensions:  (TIME: 8768)
 Coordinates:
   * TIME     (TIME) datetime64[ns] 2005-01-23T22:00:00 ... 2006-04-28T04:00:00
 Data variables:
     ID       float64 5.415e+04
     LAT      (TIME) float64 ...
     LON      (TIME) float64 ...
     U        (TIME) float64 ...
     V        (TIME) float64 ...
     LAT_ERR  (TIME) float64 ...
     LON_ERR  (TIME) float64 ...
     U_ERR    (TIME) float64 ...
     V_ERR    (TIME) float64 ...
     GAP      (TIME) timedelta64[ns] ...
     RMSGAP   (TIME) timedelta64[ns] ...
     DROGUE   (TIME) float64 ...
 Attributes:
     title:        Hourly Argos-tracked drifters location and velocity estimat...
     description:  This is version 1.02, block 2 of the dataset. See http://ww...
     note:         For all variables of dimension TIME, interruptions in the e...
     creator:      Shane Elipot and Rick Lumpkin
     timestamp:    19-Dec-2018 13:26:18,)

In [14]:
def row(d):
    """Summarise one trajectory as a flat dict.

    Parameters
    ----------
    d : dataset-like
        Must provide ``isel(TIME=...)`` and an ``ID`` convertible with
        ``int``; the selected samples must expose ``TIME``, ``LON`` and
        ``LAT`` with a ``.values`` attribute.

    Returns
    -------
    dict
        ``index`` (integer ID) plus time, longitude and latitude of the first
        (``t0``, ``lon0``, ``lat0``) and last (``t1``, ``lon1``, ``lat1``)
        sample along TIME.
    """
    first = d.isel(TIME=0)
    last = d.isel(TIME=-1)
    summary = {'index': int(d.ID)}
    summary.update({'t0': first.TIME.values, 't1': last.TIME.values})
    summary.update({'lon0': first.LON.values, 'lat0': first.LAT.values})
    summary.update({'lon1': last.LON.values, 'lat1': last.LAT.values})
    return summary

In [15]:
# One dataframe row per trajectory, built from the dicts returned by row().
df = b.map(row).to_dataframe()

In [16]:
# Preview the first rows of the per-trajectory summary.
df.head()

Unnamed: 0,index,lat0,lat1,lon0,lon1,t0,t1
0,54151,-68.30614,-72.23164,291.36704,277.9977,2005-01-23 22:00:00,2006-04-28 04:00:00
1,98901,20.92921,23.8647,157.03923,153.63049,2011-10-02 03:00:00,2012-06-25 22:00:00
2,76811,51.83841,63.56214,311.60712,328.63237,2008-09-08 13:00:00,2010-09-25 23:00:00
3,123274,-55.30727,-41.15464,310.59742,113.01214,2013-12-20 09:00:00,2016-08-06 13:00:00
4,62330,29.19519,22.77021,344.4885,296.94631,2007-06-06 13:00:00,2008-10-01 21:00:00


In [17]:
# Collapse to a single partition; the export in the next cell then lists a
# single output file.
df = df.repartition(npartitions=1)
#df.npartitions

In [18]:
# Export the summary; with the single partition set above the reported output
# is 'data/drifter-0.csv'.
df.to_csv('data/drifter-*.csv', mode='w')

['data/drifter-0.csv']

In [None]:
#df = dd.read_csv('myfiles.*.csv')

---

## form bag of pairs

In [19]:
import itertools

In [20]:
# Bag of all unordered pairs of trajectory files, spread over 1000
# partitions. Only the two paths are kept per element (an earlier variant
# opened both files with xr.open_dataset at this stage).
file_pairs = itertools.combinations(files, 2)
b = db.from_sequence(file_pairs, npartitions=1000).map(lambda pair: [pair[0], pair[1]])
b



KeyboardInterrupt: 

In [121]:
# Peek at the first pair of file paths.
b.take(1)

(['data/drifter_9729742.p', 'data/drifter_25763.p'],)

In [81]:
#from netCDF4 import Dataset

In [122]:
def flatten(p):
    """Summarise a pair of stored trajectories as one flat dict.

    Parameters
    ----------
    p : sequence of two str
        Paths of the two pickle files to compare.

    Returns
    -------
    dict
        ``id0``/``id1`` (integer IDs) and, for each trajectory, the TIME
        value of its first (``0_t0``, ``1_t0``) and last (``0_t1``, ``1_t1``)
        sample.

    Notes
    -----
    Unpickling executes arbitrary code: only use this on files produced by
    this notebook.
    """
    # Context managers close both files as soon as they are read; the former
    # pickle.load(open(...)) calls left two handles open per pair.
    with open(p[0], 'rb') as fh:
        ds0 = pickle.load(fh)
    with open(p[1], 'rb') as fh:
        ds1 = pickle.load(fh)
    return {
        'id0': int(ds0.ID.values),
        'id1': int(ds1.ID.values),
        '0_t0': ds0.isel(TIME=0).TIME.values,
        '0_t1': ds0.isel(TIME=-1).TIME.values,
        '1_t0': ds1.isel(TIME=0).TIME.values,
        '1_t1': ds1.isel(TIME=-1).TIME.values
    }
#        'id0': p[0],
#        'id1': p[1]
#        'p0_t0': p[0].isel(TIME=0).TIME.values,
#        'p0_t1': p[0].isel(TIME=-2).TIME.values,
#        'p1_t0': p[1].isel(TIME=0).TIME.values,
#        'p1_t1': p[1].isel(TIME=-2).TIME.values
# Try flatten() on the first pair only before running it on the whole bag.
b.map(flatten).take(1)

({'id0': 9729742,
  'id1': 25763,
  '0_t0': numpy.datetime64('1998-02-06T05:00:00.000000000'),
  '0_t1': numpy.datetime64('2001-02-15T16:00:00.000000000'),
  '1_t0': numpy.datetime64('2001-03-30T03:00:00.000000000'),
  '1_t1': numpy.datetime64('2004-11-07T17:00:00.000000000')},)

In [123]:
# One dataframe row per pair of trajectories, built from flatten().
# NOTE(review): this rebinds `df`, which earlier held the per-trajectory
# summary - a distinct name would keep both available.
df = b.map(flatten).to_dataframe()

In [124]:
# Preview the first rows of the pair table.
df.head()

Unnamed: 0,0_t0,0_t1,1_t0,1_t1,id0,id1
0,1998-02-06 05:00:00,2001-02-15 16:00:00,2001-03-30 03:00:00,2004-11-07 17:00:00,9729742,25763
1,1998-02-06 05:00:00,2001-02-15 16:00:00,2002-04-01 01:00:00,2005-03-23 03:00:00,9729742,34160
2,1998-02-06 05:00:00,2001-02-15 16:00:00,2005-05-25 21:00:00,2005-10-12 23:00:00,9729742,2556428
3,1998-02-06 05:00:00,2001-02-15 16:00:00,2003-01-17 04:00:00,2005-01-15 13:00:00,9729742,36960
4,1998-02-06 05:00:00,2001-02-15 16:00:00,2002-05-01 10:00:00,2003-12-02 22:00:00,9729742,32042


In [125]:
# Partition count of the pair table (1000, matching the bag it was built from).
print(df.npartitions)
#print(df.size)

1000


In [None]:
#df.repartition(1000).to_csv('data/pairs-*.csv')
# Export the pair table to CSV files named after the 'data/pairs-*.csv'
# pattern (presumably one file per partition, as for the drifter export above).
df.to_csv('data/pairs-*.csv', mode='w') # can be done

In [17]:
def time_overlap(p):
    """Return whether the two members of ``p`` agree on their 4th-from-last item.

    NOTE(review): despite the name, no time information is examined. With the
    file-path pairs held in the bag this compares a single character of each
    path (for names such as ``drifter_39131.nc`` that is the last digit of
    the ID). It looks like a placeholder used to exercise ``filter`` -
    replace with a real test on the trajectories' time ranges before relying
    on it.

    Parameters
    ----------
    p : sequence
        Pair of indexable objects, each with at least four elements.

    Returns
    -------
    Result of comparing ``p[0][-4]`` with ``p[1][-4]`` for equality.
    """
    left, right = p[0], p[1]
    same_marker = left[-4] == right[-4]
    return same_marker

In [18]:
# Keep only the pairs accepted by time_overlap() and materialise them.
# NOTE(review): the stored output lists '.nc' paths and the cell has execution
# count 18, whereas the bag built above holds '.p' paths - this output comes
# from an earlier state of the notebook and should be regenerated.
b.filter(time_overlap).compute()

[['data/drifter_39131.nc', 'data/drifter_54231.nc'],
 ['data/drifter_39131.nc', 'data/drifter_54371.nc'],
 ['data/drifter_39131.nc', 'data/drifter_30461.nc'],
 ['data/drifter_39131.nc', 'data/drifter_2444161.nc'],
 ['data/drifter_39131.nc', 'data/drifter_12411.nc'],
 ['data/drifter_39131.nc', 'data/drifter_2339261.nc'],
 ['data/drifter_39131.nc', 'data/drifter_52251.nc'],
 ['data/drifter_39131.nc', 'data/drifter_2556791.nc'],
 ['data/drifter_39131.nc', 'data/drifter_52191.nc'],
 ['data/drifter_39131.nc', 'data/drifter_2659761.nc'],
 ['data/drifter_39131.nc', 'data/drifter_2447611.nc'],
 ['data/drifter_39131.nc', 'data/drifter_41121.nc'],
 ['data/drifter_39131.nc', 'data/drifter_33191.nc'],
 ['data/drifter_39131.nc', 'data/drifter_32791.nc'],
 ['data/drifter_39131.nc', 'data/drifter_2444171.nc'],
 ['data/drifter_39131.nc', 'data/drifter_54221.nc'],
 ['data/drifter_39131.nc', 'data/drifter_9712541.nc'],
 ['data/drifter_39131.nc', 'data/drifter_21711.nc'],
 ['data/drifter_39131.nc', 'data