### Converting Argo data to parquet with dask

This notebook downloads and converts Argo Core and BGC profiles, given:

* the local path `gdac_path` to the argo index files (if they don't exist, they'll be downloaded to the folder),
* the path `outdir_nc` where to download the most recent Argo profile files (this is required to end with `GDAC/dac/`,
* the path `outdir_pqt` where the parquet database will be stored,
* the path `schema_path` to the parquet schemas, this should not need to be changed.

In [1]:
import argo_tools as at
from pprint import pprint

gdac_path = '/vortexfs1/share/boom/data/nc2pqt_test/'
outdir_nc = '/vortexfs1/share/boom/data/nc2pqt_test/GDAC/dac/'
outdir_pqt = '/vortexfs1/share/boom/data/nc2pqt_test/pqt2/'
schema_path = '/vortexfs1/home/enrico.milanese/projects/ARGO/nc2parquet/schemas/ArgoBGC_DATA_MODE_schema.metadata'

In [2]:
import dask
from dask.distributed import Client
client = Client(
    n_workers=10,
    threads_per_worker=10,
    processes=True, 
    memory_limit='auto'
)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 10
Total threads: 100,Total memory: 271.27 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:34242,Workers: 10
Dashboard: http://127.0.0.1:8787/status,Total threads: 100
Started: Just now,Total memory: 271.27 GiB

0,1
Comm: tcp://127.0.0.1:45486,Total threads: 10
Dashboard: http://127.0.0.1:40715/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:44161,
Local directory: /tmp/dask-scratch-space/worker-hyjslv0h,Local directory: /tmp/dask-scratch-space/worker-hyjslv0h

0,1
Comm: tcp://127.0.0.1:35214,Total threads: 10
Dashboard: http://127.0.0.1:43408/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:41047,
Local directory: /tmp/dask-scratch-space/worker-_cc4fcmp,Local directory: /tmp/dask-scratch-space/worker-_cc4fcmp

0,1
Comm: tcp://127.0.0.1:39553,Total threads: 10
Dashboard: http://127.0.0.1:34161/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:33472,
Local directory: /tmp/dask-scratch-space/worker-k7u5ahl9,Local directory: /tmp/dask-scratch-space/worker-k7u5ahl9

0,1
Comm: tcp://127.0.0.1:40729,Total threads: 10
Dashboard: http://127.0.0.1:46564/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:35907,
Local directory: /tmp/dask-scratch-space/worker-3xbos5rw,Local directory: /tmp/dask-scratch-space/worker-3xbos5rw

0,1
Comm: tcp://127.0.0.1:34197,Total threads: 10
Dashboard: http://127.0.0.1:42799/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:43437,
Local directory: /tmp/dask-scratch-space/worker-eduyz9_0,Local directory: /tmp/dask-scratch-space/worker-eduyz9_0

0,1
Comm: tcp://127.0.0.1:36750,Total threads: 10
Dashboard: http://127.0.0.1:34502/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:40968,
Local directory: /tmp/dask-scratch-space/worker-43w1hs4a,Local directory: /tmp/dask-scratch-space/worker-43w1hs4a

0,1
Comm: tcp://127.0.0.1:35754,Total threads: 10
Dashboard: http://127.0.0.1:34871/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:34727,
Local directory: /tmp/dask-scratch-space/worker-d2ebdykx,Local directory: /tmp/dask-scratch-space/worker-d2ebdykx

0,1
Comm: tcp://127.0.0.1:40220,Total threads: 10
Dashboard: http://127.0.0.1:38081/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:42694,
Local directory: /tmp/dask-scratch-space/worker-ui9a78xm,Local directory: /tmp/dask-scratch-space/worker-ui9a78xm

0,1
Comm: tcp://127.0.0.1:41147,Total threads: 10
Dashboard: http://127.0.0.1:41845/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:45153,
Local directory: /tmp/dask-scratch-space/worker-hdk1sq4m,Local directory: /tmp/dask-scratch-space/worker-hdk1sq4m

0,1
Comm: tcp://127.0.0.1:45252,Total threads: 10
Dashboard: http://127.0.0.1:44186/status,Memory: 27.13 GiB
Nanny: tcp://127.0.0.1:46011,
Local directory: /tmp/dask-scratch-space/worker-7m5_6rxz,Local directory: /tmp/dask-scratch-space/worker-7m5_6rxz


In [3]:
import pyarrow as pa
import pyarrow.parquet as pq

schema_BGC = pq.read_schema(schema_path)

In [14]:
from datetime import datetime, timedelta
reference_time = datetime.utcnow() - timedelta(weeks=5*52)

filterQC = []
cols = []
for param in schema_BGC.names:
    if "_ADJUSTED_QC" in param:
        # param_name_parts = param.split('_')
        param_base_name = param_name_parts[0]
        second_last_index = param.rfind('_', 0, param.rfind('_') )
        param_base_name = param[:second_last_index]
        print(param_base_name)
        param_data_mode = param_base_name + '_DATA_MODE'
        print(param_data_mode)
        filterQC.append( [ ("JULD",">=",reference_time), (param, "in", [1,2]), (param_data_mode, "in", ["A","D"]) ] )
        cols.append(param)
    elif "_QC" in param:
        # param_name_parts = param.split('_', 1)
        last_index = param.rfind('_') 
        param_base_name = param[:last_index]
        # if <PARAM>_ADJUSTED already exists, no need to filter by real-time data
        if (param_base_name + '_ADJUSTED') not in schema_BGC.names:
            param_data_mode = param_base_name + '_DATA_MODE'
            filterQC.append( [ ("JULD",">=",reference_time), (param, "in", [1,2]), (param_data_mode, "==", "R") ] )
            cols.append(param)
    else:
        cols.append(param)
    
    # if "_QC" not in param:
    #     cols.append(param)

schema_BGC_QC12 = schema_BGC
for name in schema_BGC_QC12.names:
    if name not in cols:
        id_name = schema_BGC_QC12.get_field_index(name)
        schema_BGC_QC12 = schema_BGC_QC12.remove(id_name)

PRES
PRES_DATA_MODE
TEMP
TEMP_DATA_MODE
PSAL
PSAL_DATA_MODE
DOXY
DOXY_DATA_MODE
BBP
BBP_DATA_MODE
BBP470
BBP470_DATA_MODE
BBP532
BBP532_DATA_MODE
BBP700
BBP700_DATA_MODE
TURBIDITY
TURBIDITY_DATA_MODE
CP
CP_DATA_MODE
CP660
CP660_DATA_MODE
CHLA
CHLA_DATA_MODE
CDOM
CDOM_DATA_MODE
NITRATE
NITRATE_DATA_MODE
BISULFIDE
BISULFIDE_DATA_MODE
PH_IN_SITU_TOTAL
PH_IN_SITU_TOTAL_DATA_MODE
DOWN_IRRADIANCE
DOWN_IRRADIANCE_DATA_MODE
DOWN_IRRADIANCE380
DOWN_IRRADIANCE380_DATA_MODE
DOWN_IRRADIANCE412
DOWN_IRRADIANCE412_DATA_MODE
DOWN_IRRADIANCE443
DOWN_IRRADIANCE443_DATA_MODE
DOWN_IRRADIANCE490
DOWN_IRRADIANCE490_DATA_MODE
DOWN_IRRADIANCE555
DOWN_IRRADIANCE555_DATA_MODE
UP_IRRADIANCE
UP_IRRADIANCE_DATA_MODE
UP_IRRADIANCE380
UP_IRRADIANCE380_DATA_MODE
UP_IRRADIANCE412
UP_IRRADIANCE412_DATA_MODE
UP_IRRADIANCE443
UP_IRRADIANCE443_DATA_MODE
UP_IRRADIANCE490
UP_IRRADIANCE490_DATA_MODE
UP_IRRADIANCE555
UP_IRRADIANCE555_DATA_MODE
DOWNWELLING_PAR
DOWNWELLING_PAR_DATA_MODE


In [16]:
cols

['JULD',
 'LATITUDE',
 'LONGITUDE',
 'CYCLE_NUMBER',
 'PLATFORM_NUMBER',
 'N_PROF',
 'N_LEVELS',
 'PRES',
 'PRES_ADJUSTED',
 'PRES_ADJUSTED_QC',
 'PRES_ADJUSTED_ERROR',
 'TEMP',
 'TEMP_dPRES',
 'TEMP_ADJUSTED',
 'TEMP_ADJUSTED_QC',
 'TEMP_ADJUSTED_ERROR',
 'PSAL',
 'PSAL_dPRES',
 'PSAL_ADJUSTED',
 'PSAL_ADJUSTED_QC',
 'PSAL_ADJUSTED_ERROR',
 'DOXY',
 'DOXY_dPRES',
 'DOXY_ADJUSTED',
 'DOXY_ADJUSTED_QC',
 'DOXY_ADJUSTED_ERROR',
 'BBP',
 'BBP_dPRES',
 'BBP_ADJUSTED',
 'BBP_ADJUSTED_QC',
 'BBP_ADJUSTED_ERROR',
 'BBP470',
 'BBP470_dPRES',
 'BBP470_ADJUSTED',
 'BBP470_ADJUSTED_QC',
 'BBP470_ADJUSTED_ERROR',
 'BBP532',
 'BBP532_dPRES',
 'BBP532_ADJUSTED',
 'BBP532_ADJUSTED_QC',
 'BBP532_ADJUSTED_ERROR',
 'BBP700',
 'BBP700_dPRES',
 'BBP700_ADJUSTED',
 'BBP700_ADJUSTED_QC',
 'BBP700_ADJUSTED_ERROR',
 'TURBIDITY',
 'TURBIDITY_dPRES',
 'TURBIDITY_ADJUSTED',
 'TURBIDITY_ADJUSTED_QC',
 'TURBIDITY_ADJUSTED_ERROR',
 'CP',
 'CP_dPRES',
 'CP_ADJUSTED',
 'CP_ADJUSTED_QC',
 'CP_ADJUSTED_ERROR',
 'CP660'

In [17]:
import dask.dataframe as dd
ddf = dd.read_parquet(
                outdir_pqt+'debugBGC_DATA_MODE/',
                engine="pyarrow",
                storage_options={"anon": True, "use_ssl": True},
                columns = cols,
                filters = filterQC
            )

In [19]:
%%time
ddf = ddf.repartition(partition_size="300MB")

name_function = lambda x: f"ArgoBGC_QC12_dask_{x}.parquet"

ddf.to_parquet(
    outdir_pqt + 'debug_ArgoBGC_QC12AD_300MB',
    engine="pyarrow",
    name_function = name_function,
    write_metadata_file = True,
    write_index=False,
    schema = schema_BGC_QC12
)



CPU times: user 34.2 s, sys: 3.41 s, total: 37.6 s
Wall time: 1min 32s


In [None]:
ref_var = "TEMP_ADJUSTED"
cols_read = ["N_PROF", "N_LEVELS", ref_var,"LATITUDE","LONGITUDE","PRES_ADJUSTED","TEMP_ADJUSTED_QC","TEMP_DATA_MODE","PRES_DATA_MODE","PSAL_DATA_MODE"]


In [21]:
ddfAD = dd.read_parquet(
                outdir_pqt+'debug_ArgoBGC_QC12AD_300MB/',
                engine="pyarrow",
                storage_options={"anon": True, "use_ssl": True},
                columns =
            )

In [22]:
ddfAD.head()

Unnamed: 0,JULD,LATITUDE,LONGITUDE,CYCLE_NUMBER,PLATFORM_NUMBER,N_PROF,N_LEVELS,PRES,PRES_ADJUSTED,PRES_ADJUSTED_QC,...,DOWN_IRRADIANCE443_DATA_MODE,DOWN_IRRADIANCE490_DATA_MODE,DOWN_IRRADIANCE555_DATA_MODE,UP_IRRADIANCE_DATA_MODE,UP_IRRADIANCE380_DATA_MODE,UP_IRRADIANCE412_DATA_MODE,UP_IRRADIANCE443_DATA_MODE,UP_IRRADIANCE490_DATA_MODE,UP_IRRADIANCE555_DATA_MODE,DOWNWELLING_PAR_DATA_MODE
0,2021-05-06 02:03:16.000218880,49.236,-14.742,1,1902303,0,0,2.2,2.27,1,...,,,,,,,,,,
1,2021-05-06 02:03:16.000218880,49.236,-14.742,1,1902303,0,1,4.0,4.07,1,...,,,,,,,,,,
2,2021-05-06 02:03:16.000218880,49.236,-14.742,1,1902303,0,2,6.0,6.07,1,...,,,,,,,,,,
3,2021-05-06 02:03:16.000218880,49.236,-14.742,1,1902303,0,3,7.9,7.97,1,...,,,,,,,,,,
4,2021-05-06 02:03:16.000218880,49.236,-14.742,1,1902303,0,4,10.0,10.07,1,...,,,,,,,,,,


In [17]:
ddf.compute()

Unnamed: 0,N_PROF,N_LEVELS,TEMP_ADJUSTED,LATITUDE,LONGITUDE,PRES_ADJUSTED,TEMP_ADJUSTED_QC,TEMP_DATA_MODE,PRES_DATA_MODE,PSAL_DATA_MODE
0,31,0,30.535000,-1.8404,70.1473,2.430000,1,A,A,A
1,31,1,30.531000,-1.8404,70.1473,4.230000,1,A,A,A
2,31,2,30.534000,-1.8404,70.1473,6.230000,1,A,A,A
3,31,3,30.544001,-1.8404,70.1473,8.230000,1,A,A,A
4,31,4,30.547001,-1.8404,70.1473,10.230000,1,A,A,A
...,...,...,...,...,...,...,...,...,...,...
185,51,5,3.460000,-53.1720,91.0500,27.200001,1,A,A,A
186,51,6,3.431000,-53.1720,91.0500,32.799999,1,A,A,A
187,51,7,3.428000,-53.1720,91.0500,37.500000,1,A,A,A
188,51,8,3.371000,-53.1720,91.0500,43.000000,1,A,A,A


#### Done!

When we are done, we can shut down the dask cluster.

In [None]:
client.shutdown()