In [1]:
from datetime import datetime
from pathlib import Path
import gsw
import numpy as np
import pandas as pd
import xarray as xr
import pyarrow.parquet as pq
import requests as rq
import os
from urllib.parse import urljoin, urlencode
import argo_tools as at
import argopy
from argopy import DataFetcher as ArgoDataFetcher
local_gdac = '/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData'
#Path.mkdir('/vortexfs1/share/boom/projects/n2o/pq')
outdir_nc = '/vortexfs1/share/boom/data/nc2pqt_test/GDAC/'
outdir_pqt = '/vortexfs1/share/boom/data/nc2pqt_test/PQT/' #'/vortexfs1/share/boom/projects/n2o/pq'
# argopy.set_options(mode='expert',src='gdac',ftp=local_gdac)
from pprint import pprint
import warnings
import glob
import psutil
import time
import multiprocessing
import gc

#### Downloading Argo profiles

If you already have the profiles stored somewhere, you can skip this. Otherwise, if you want to download them, uncomment the next cell, and pass to `at.argo_dac` the appropriate arguments to download the files you desire (see `argo_tools.py` for details about the arguments).

In [2]:
#wmos, df2 = at.argo_gdac(lat_range=[0,20],lon_range=[-30,-20],save_to=outdir_nc, download_individual_profs=False, skip_downloads=True)

#### Importing iterators from [`itertools`](https://docs.python.org/3/library/itertools.html)
* `islice` returns selected elements from iterable
* `batched` split the iterable object into tuples of prescribed length _n_ (if `length(iterable)%n~=0`, the last tuple is shorter than _n_).

In [3]:
import sys
from itertools import islice

if sys.version_info >= (3, 12):
    from itertools import batched
else:
    try:
        from more_itertools import batched
    except ImportError:
        def batched(iterable, chunk_size):
            iterator = iter(iterable)
            while chunk := tuple(islice(iterator, chunk_size)):
                yield chunk

In [4]:
VARS = ['JULD','CYCLE_NUMBER','PLATFORM_NUMBER','LATITUDE','LONGITUDE',
 'PRES',
 'PRES_QC',
 'PRES_ADJUSTED',
 'PRES_ADJUSTED_QC',
 'PRES_ADJUSTED_ERROR',
 'TEMP',
 'TEMP_QC',
 'TEMP_dPRES',
 'TEMP_ADJUSTED',
 'TEMP_ADJUSTED_QC',
 'TEMP_ADJUSTED_ERROR',
 'PSAL',
 'PSAL_QC',
 'PSAL_dPRES',
 'PSAL_ADJUSTED',
 'PSAL_ADJUSTED_QC',
 'PSAL_ADJUSTED_ERROR',
 'DOXY',
 'DOXY_QC',
 'DOXY_dPRES',
 'DOXY_ADJUSTED',
 'DOXY_ADJUSTED_QC',
 'DOXY_ADJUSTED_ERROR',
 'CHLA',
 'CHLA_QC',
 'CHLA_dPRES',
 'CHLA_ADJUSTED',
 'CHLA_ADJUSTED_QC',
 'CHLA_ADJUSTED_ERROR',
 'BBP700',
 'BBP700_QC',
 'BBP700_dPRES',
 'BBP700_ADJUSTED',
 'BBP700_ADJUSTED_QC',
 'BBP700_ADJUSTED_ERROR',
 'CDOM',
 'CDOM_QC',
 'CDOM_dPRES',
 'CDOM_ADJUSTED',
 'CDOM_ADJUSTED_QC',
 'CDOM_ADJUSTED_ERROR',
 'PH_IN_SITU_TOTAL',
 'PH_IN_SITU_TOTAL_QC',
 'PH_IN_SITU_TOTAL_dPRES',
 'PH_IN_SITU_TOTAL_ADJUSTED',
 'PH_IN_SITU_TOTAL_ADJUSTED_QC',
 'PH_IN_SITU_TOTAL_ADJUSTED_ERROR',
 'NITRATE',
 'NITRATE_QC',
 'NITRATE_dPRES',
 'NITRATE_ADJUSTED',
 'NITRATE_ADJUSTED_QC',
 'NITRATE_ADJUSTED_ERROR']

### File conversion

The conversion from `nc` to `parquet` is parallelized. All you need to do is comment/uncomment the appropriate line in the next cell, selecting if you want to process the files that you just downloaded (first line) or other files (second line, modify to include your desired path).
The new parquet files will be stored in the directory `outdir_pqt` that you specified earlier.

The default is for the code to execute over multiple processes. If you want to execute it on a single thread, set `single_process = True`

In [5]:
#flist = [outdir_nc + s for s in wmo_fp]
flist = glob.glob("/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData/dac/*/*/*_Sprof.nc")

# flist = glob.glob("/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData/dac/coriolis/*/*_Sprof.nc")

# fname1 = '/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData/dac/coriolis/6990526/6990526_Sprof.nc'
# fname2 = '/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData/dac/aoml/5904852/5904852_Sprof.nc'
# flist=[]
# flist.append(fname1)
# flist.append(fname2)

single_process = False

In [6]:
%%time

if not single_process:
    import multiprocessing

def xr2pqt(rank,files_list,loop_id):
    df_list = []
    df_memory = 0
    counter = 0
    rank_str = "#" + str(rank) + ": "
    nb_files = len(files_list)
    argo_file_fail = []
    for argo_file in files_list:
        counter += 1
        if counter%10==0:
            print(rank_str + "processing file " + str(counter) + " of " + str(nb_files))
            
        try:
            ds = xr.load_dataset(argo_file, engine="argo") #loading into memory the profile
        except:
            print(rank_str + 'Failed on ' + str(argo_file))
            argo_file_fail.append(argo_file)
        
        invars = list(set(VARS) & set(list(ds.data_vars))) # some floats don't have all the vars specified in VARS
        df = ds[invars].to_dataframe()
        df_memory += df.memory_usage().sum()/(10**6) # tracking memory usage (in GB)
        df_list.append(df)

    # store to parquet once a large enough dataframe has been created
    
    print(rank_str + "Storing to parquet...")
    if df_memory < 1e3:
        print(rank_str + "In-memory filesize: " + "{:.2f}".format(df_memory) + " MB")
    else:
        print(rank_str + "In-memory filesize: " + "{:.2f}".format(df_memory/1e3) + " GB")
    
    df_list = pd.concat(df_list)

    parquet_filename = outdir_pqt + "test_parquet_" + str(rank) + "_" + str(loop_id) + ".parquet"
    df_list.to_parquet(parquet_filename)
    print(rank_str + str(parquet_filename) + " stored.")

    df_list = []
    df_memory = 0

    return argo_file_fail
    
############################################################################################################

def poolParams(flist,nc_size_per_pqt):
    size_flist = []
    for f in flist:
        size_flist.append( os.path.getsize(f)/1024**2 ) #size in MB

    size_tot = sum(size_flist)
    NPROC = int(np.ceil(size_tot/nc_size_per_pqt))
    size_per_proc = size_tot/NPROC

    print('')
    print('Size per processor (MB)')
    print(size_per_proc)
    print('')
    
    ids_sort = np.argsort(np.array(size_flist))
    
    chunks_ids = []
    x = np.copy(ids_sort)
    
    for j in range(NPROC):
        chunk_ids = []
        chunk_size = 0
        while ((chunk_size<size_per_proc) and (len(x) > 0)):
            if len(chunk_ids)%2 == 0:
                chunk_ids.append(x[-1])
                x = x[:-1]
            else:
                chunk_ids.append(x[0])
                x = x[1:]
            chunk_size = sum(np.asarray(size_flist)[chunk_ids])
        print(chunk_size)
        chunks_ids.append(chunk_ids)
    
    if len(x) > 0:
        warnings.warn(str(len(x)) + " files have not been assigned to a processor.")
    
    print('')
    chunks=[]
    skip_proc = 0
    total_memory = 0
    for j,chunk_ids in enumerate(chunks_ids):
        print('Size in processor ' + str(j) + ' (MB):')
        size_proc = sum(np.asarray(size_flist)[chunk_ids])
        total_memory += size_proc
        print(size_proc)
        if size_proc == 0:
            skip_proc += 1
            continue
        chunk = [flist[k] for k in chunk_ids]
        chunks.append(chunk)

    NPROC -= skip_proc
        
    print('')
    print("Using " + str(NPROC) + " processors")
    
    return NPROC, chunks, size_per_proc

############################################################################################################

print("Processing " + str(len(flist)) + " files.")

if not single_process:
    nc_size_per_pqt = 40 # Empirically, 40 MB of average .nc file size gives in-memory sizes between 100-330 MB, which is what Dask recommens
    NPROC, chunks,size_per_proc = poolParams(flist,nc_size_per_pqt)

# fixing max nb of processes to prevent bottleneck likely due to I/O on disk queing operations and filling up the memory
MAXPROC = 120
if size_per_proc > 300:
    MAXPROC = 12

if NPROC > MAXPROC and not single_process:
    print("Estimated number of processors might create bottleneck issues. Forcing to use " + str(MAXPROC) + " processors at a time.")
    # force to use at most MAXPROC processes, by looping over chunks
    full_loops = NPROC//MAXPROC  #nb of loops to use at most MAXPROC
    RESPROC = NPROC%MAXPROC   #nb of residual processors after the loops
    pool_obj = multiprocessing.Pool(processes=MAXPROC)
    i_start = 0
    i_end   = 0
    failed_files = []
    for full_loop in range(full_loops):
        i_start = MAXPROC*full_loop
        i_end   = MAXPROC*(full_loop+1)
        failed_files.append( pool_obj.starmap(xr2pqt, [(rank, chunk, full_loop) for rank, chunk in enumerate(chunks[i_start:i_end])] ) )
    pool_obj.close()

    # multiprocessing across residual processor pool with NPROC<MAXPROC
    if RESPROC > 0:
        pool_obj = multiprocessing.Pool(processes=RESPROC)
        failed_files.append( pool_obj.starmap(xr2pqt, [(rank, chunk, full_loop+1) for rank, chunk in enumerate(chunks[(i_end+1):])] ) )
        pool_obj.close()

elif NPROC > 1 and not single_process:
    pool_obj = multiprocessing.Pool(processes=NPROC)
    failed_files = pool_obj.starmap(xr2pqt, [(rank, chunk, 0) for rank, chunk in enumerate(chunks)] )
    pool_obj.close()

else:
    failed_files = xr2pqt(0,flist,0)

print(failed_files)

Processing 765 files.

Size per processor (MB)
39.818978325704514

99.08544540405273
97.73884105682373
96.52665042877197
89.80551052093506
87.98799419403076
87.86990928649902
87.43379592895508
85.42619132995605
82.52131366729736
81.72994232177734
81.44808578491211
80.78289413452148
79.62730979919434
76.59406471252441
72.35435104370117
71.79525852203369
69.99108409881592
69.35292625427246
63.51381015777588
62.213666915893555
61.91689109802246
61.685922622680664
57.574278831481934
57.280272483825684
56.58053207397461
54.36483383178711
52.90372276306152
52.16682243347168
51.62456703186035
50.627431869506836
50.5104455947876
48.93765640258789
48.876712799072266
48.183417320251465
47.92483425140381
47.206130027770996
46.32355880737305
45.041481018066406
44.75474262237549
43.690430641174316
43.012332916259766
42.7005090713501
42.51829433441162
42.28854179382324
42.05257511138916
41.75073051452637
41.2997350692749
39.949758529663086
78.86246871948242
77.35775756835938
76.07283782958984
75.264

#### Reading from parquet

There are a couple of way to read parquet files. One is directly using pandas (make sure you have pyarrow, fastparquet or other suitable engine installed), the other is with Dask. Generally speaking, you'll want to use Dask if you need a large amount of data at the same time so that you can benefit from its parallelization. You should avoid Dask and just go for pandas whenever the data fits in your RAM.

When reading parquet files with pandas, you can either specificy the file name if you know which file you want, or the directory containing all the parquet files. In latter case if you apply any filter, pandas and pyarrow will sort through all the files in the folder, reading into memory only the subsets that satisfy your filter.

In [7]:
sel = [("PLATFORM_NUMBER", "==", 5904473)]
df = pd.read_parquet( outdir_pqt , engine='pyarrow', filters = sel ) 

In [8]:
selected_df = df.loc[df["PLATFORM_NUMBER"]==5904473,["PLATFORM_NUMBER","JULD","DOXY_ADJUSTED"]]

In [13]:
df["PSAL_ADJUSTED"].values

array([33.741886, 33.74088 , 33.741882, ...,       nan,       nan,
             nan], dtype=float32)

#### Testing conversion

The following cell performs integrity tests on the parquet files. As the number of floats, profiles, and variables is large, the check is performed over all the variables, but not all the files. For each variable in `VARS`, files are randomly selected from the input list (in a number set to 5% of the .nc files) and for each of them, the selected `VARS` is compared to the one obtained from the parquet file. Each of these checks can:
* succeed, if the nc and parquet variables are equal
* fail, if the nc and parquet variables are not equal
* be skipped, if the randomly selected file does not contain the selected variable

If a file is skipped, another one is randomly selected, until one that contains the selected variable is found. For each variable, no file can be randomly picked two or more times (it can happen across variables).

The variables `successes`, `fails`, and `skipped` contain the file id and the name of the variable for which the check was succesful, failed, or skipped.

In [6]:
def checkVars(rank, flist):

    rank_str = "#" + str(rank) + ": "
    
    VARS2 = [ 'PH_IN_SITU_TOTAL',
     'PH_IN_SITU_TOTAL_QC',
     'PH_IN_SITU_TOTAL_dPRES',
     'PH_IN_SITU_TOTAL_ADJUSTED',
     'PH_IN_SITU_TOTAL_ADJUSTED_QC',
     'PH_IN_SITU_TOTAL_ADJUSTED_ERROR']    
    
    rand_max = len(flist)
    nb_of_checks_per_var = np.min( [30,len(flist)] ) #int(np.ceil(rand_share*rand_max))
    nb_of_checks = nb_of_checks_per_var*len(VARS2)

    print(rank_str + "Checking " + str(nb_of_checks) + " random files.")
    
    check_nb = 0
    successes = []
    fails = []
    skipped = []
    for v in range(len(VARS2)):
    
        rand_idces = []
        target_var = VARS2[v]
    
        r = 0
        while ((r < nb_of_checks_per_var) and (len(rand_idces) < len(flist) )):
            print(rank_str + "Check " + str(r) + " of " + str(nb_of_checks_per_var) )
            
            check_nb += 1
            rand_avail = np.arange(0,rand_max)[~np.isin(np.arange(0,rand_max), rand_idces)]
            rand_idx = np.random.choice( rand_avail )
            rand_idces.append(rand_idx)

            try:
                ref_ds = xr.load_dataset(flist[rand_idx], engine="argo")
            except:
                print(rank_str + 'Failed on ' + str(flist[idx]))
                continue
                
            # print(rank_str + "Reading file " + flist[rand_idx] )
            ref_platform = ref_ds.PLATFORM_NUMBER.values[0]
        
            invars = list(set(VARS2) & set(list(ref_ds.data_vars)))
            
            if target_var not in invars:
                ref_var = None
                del ref_var
                gc.collect()

                skipped.append( (rand_idx, target_var ) )
                # print(rank_str + "Current random file does not contain variable " + target_var + ", skipping this check.")
                continue
                
            print(rank_str + "Checking " + target_var + " in platform number " + str(ref_platform) + ".")
        
            # print(ref_ds[target_var].dims)
            # if len(ref_ds[target_var].dims) > 1:
            dim0 = ref_ds.sizes["N_PROF"]
            dim1 = ref_ds.sizes["N_LEVELS"]
        
            if np.issubdtype(ref_ds[target_var].dtype, np.datetime64):
                ref_var = np.zeros( dim0*dim1, dtype='datetime64[ns]' )
            else:
                ref_var = np.zeros( dim0*dim1, dtype=np.float64 )
        
            # print(ref_ds[target_var][10].values)
        
            for j in range(dim0):
                for k in range(dim1):
                    ref_id = j*dim1+k
                    if len(ref_ds[target_var].dims) > 1:
                        ref_var[ref_id] = ref_ds[target_var][j,k].values
                    else:
                        ref_var[ref_id] = ref_ds[target_var][j].values
        
            sel_pqt = [("PLATFORM_NUMBER", "==", ref_platform)]

            try:
                df_pqt = pd.read_parquet( outdir_pqt , engine='pyarrow', filters = sel_pqt )
            except:
                print("Loading parquet file failed!")
                continue
    
            if target_var not in df_pqt.columns.tolist():
                fails.append( (rand_idx, target_var ) )
                r += 1
                print(rank_str + "Warning: " + target_var + " not found in parquet file.")
                continue
        
            df_pqt_var = df_pqt[target_var].values
        
            success = np.array_equal(ref_var, df_pqt_var, equal_nan=True)

            ref_var = None
            df_pqt_var = None
            del ref_var
            del df_pqt_var
            gc.collect()
            
            if success:
                successes.append( (rand_idx, target_var ) )
                # print(rank_str + "Check " + str(check_nb) + " of " + str(nb_of_checks) + " succesful.")
            else:
                fails.append( (rand_idx, target_var ) )
                # print(rank_str + "Check " + str(check_nb) + " of " + str(nb_of_checks) + " failed.")
    
            r += 1

    print(rank_str + "All checks in process done")
    print(rank_str +  str(len(successes)) + " checks were succesful.")
    print(rank_str +  str(len(fails)) + " checks failed.")

############################################################################################################

# flist = glob.glob("/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData/dac/*/*/*_Sprof.nc")

nb_of_checks = len(flist)

NPROC = 20
CHUNK_SZ = int(np.ceil(nb_of_checks/NPROC))
chunks = batched(flist,CHUNK_SZ)

print(CHUNK_SZ)

# print(list(chunks))

print("Checking " + str(nb_of_checks) + " random files.")
print("")

# print([(rank, chunk) for rank, chunk in enumerate(chunks)])
pool_obj = multiprocessing.Pool(processes=NPROC)
pool_obj.starmap(checkVars, [(rank, chunk) for rank, chunk in enumerate(chunks)])
pool_obj.close()

print("")
print("All checks were done.")

110
Checking 2192 random files.

#0: Checking 180 random files.#1: Checking 180 random files.#5: Checking 180 random files.
#6: Checking 180 random files.#7: Checking 180 random files.#3: Checking 180 random files.#2: Checking 180 random files.#4: Checking 180 random files.#8: Checking 180 random files.#9: Checking 180 random files.#0: Check 0 of 30#10: Checking 180 random files.
#11: Checking 180 random files.
#12: Checking 180 random files.#13: Checking 180 random files.


#14: Checking 180 random files.#15: Checking 180 random files.


#17: Checking 180 random files.#16: Checking 180 random files.
#5: Check 0 of 30#18: Checking 180 random files.

#19: Checking 180 random files.
#1: Check 0 of 30

#6: Check 0 of 30#7: Check 0 of 30#3: Check 0 of 30
#2: Check 0 of 30
#8: Check 0 of 30#4: Check 0 of 30


#9: Check 0 of 30
#11: Check 0 of 30

#10: Check 0 of 30#13: Check 0 of 30#12: Check 0 of 30


#15: Check 0 of 30
#14: Check 0 of 30
#17: Check 0 of 30

#16: Check 0 of 30
#18: Check 0

NameError: name 'idx' is not defined

In [5]:
def checkPlatformNb(rank,flist):

    rank_str = "#" + str(rank) + ": "
    outdir_pqt = '/vortexfs1/share/boom/data/nc2pqt_test/PQT/'
    
    check_nb = 0
    successes = []
    fails = []
    failed_on_read = []

    for idx in range(len(flist)):
    
        check_nb += 1
        try:
            ref_ds = xr.load_dataset(flist[idx], engine="argo")
        except:
            print(rank_str + 'Failed on ' + str(flist[idx]))
            failed_on_read.append(flist[idx])
            continue
        
        ref_platform = ref_ds.PLATFORM_NUMBER.values[0]

        ref_ds = None
        del ref_ds
        gc.collect()
        
        sel_pqt = [("PLATFORM_NUMBER", "==", ref_platform)]
        try:
            df_pqt = pd.read_parquet( outdir_pqt , engine='pyarrow', filters = sel_pqt )
        except:
            fails.append( (idx ) )
            continue

        df_pqt = None
        del df_pqt
        gc.collect()
        successes.append( (idx ) )
        
        if check_nb%10:
        print(rank_str + "Check " + str(check_nb) + " of " + str(len(flist)) + " done.")

    print(rank_str + "All checks in process done")
    print(rank_str +  str(len(successes)) + " checks were succesful.")
    print(rank_str +  str(len(fails)) + " checks failed.")
    if len(failed_on_read)>0:
        print(rank_str +  str(len(failed_on_read)) + " original Argo file(s) could not be loaded, likely due to errors in the original file. These files were likely never converted to parquet.")
        print("File list:")
        print(failed_on_read)
    else:
        print(rank_str +  str(len(failed_on_read)) + " original Argo file(s) could not be loaded.")

############################################################################################################

flist = glob.glob("/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData/dac/coriolis/*/*_Sprof.nc")

nb_of_checks = len(flist)

NPROC = 20
CHUNK_SZ = int(np.ceil(nb_of_checks/NPROC))
chunks = batched(flist,CHUNK_SZ)

print(CHUNK_SZ)

# print(list(chunks))

print("Checking " + str(nb_of_checks) + " random files.")
print("")

pool_obj = multiprocessing.Pool(processes=NPROC)
pool_obj.starmap(checkPlatformNb, [(rank, chunk) for rank, chunk in enumerate(chunks)])
pool_obj.close()

print("")
print("All checks were done.")

39
Checking 765 random files.

#0: Check 1 of 39 done.
#0: Check 2 of 39 done.
#19: Check 1 of 24 done.
#0: Check 3 of 39 done.
#4: Check 1 of 39 done.
#17: Check 1 of 39 done.
#0: Check 4 of 39 done.
#5: Check 1 of 39 done.
#3: Check 1 of 39 done.
#6: Check 1 of 39 done.
#7: Check 1 of 39 done.
#0: Check 5 of 39 done.
#17: Check 2 of 39 done.
#19: Check 2 of 24 done.
#2: Check 1 of 39 done.
#0: Check 6 of 39 done.
#5: Check 2 of 39 done.
#3: Check 2 of 39 done.
#4: Check 2 of 39 done.
#6: Check 2 of 39 done.
#0: Check 7 of 39 done.
#7: Check 2 of 39 done.
#9: Check 1 of 39 done.
#17: Check 3 of 39 done.
#1: Check 1 of 39 done.
#19: Check 3 of 24 done.
#2: Check 2 of 39 done.
#0: Check 8 of 39 done.
#14: Check 1 of 39 done.
#5: Check 3 of 39 done.
#0: Check 9 of 39 done.
#18: Check 1 of 39 done.
#4: Check 3 of 39 done.
#16: Check 1 of 39 done.
#17: Check 4 of 39 done.
#2: Check 3 of 39 done.#1: Check 2 of 39 done.

#7: Check 3 of 39 done.
#0: Check 10 of 39 done.
#12: Check 1 of 39 don

### Test direct subsetting from parquet directory

In [None]:
#t = pq.read_table(Path(outdir),filters=[('PRES', '<', 20),('LATITUDE', '<', 21.1),('LATITUDE', '>', 21)])
t = pq.read_table(Path(outdir),filters=[('PLATFORM_NUMBER', '==', 1902304)])
df = t.to_pandas()
df
#ds = xr.Dataset.from_dataframe(df)
#ds

### Example loading Sprof from snapshot
```
ds = xr.load_dataset('/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData/dac/aoml/1902304/1902304_Sprof.nc')
df = ds.to_dataframe()
```

In [None]:

from dotenv import load_dotenv
load_dotenv()
import os
from pyarrow import fs
s3 = fs.S3FileSystem(region='us-east-1')


In [None]:
s3

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import Table

ds = xr.load_dataset('/vortexfs1/share/boom/data/gdac_snapshot/202403-ArgoData/dac/aoml/1902304/1902304_Sprof.nc',engine="argo")
df = ds[['DOXY','PRES','NITRATE','PLATFORM_NUMBER']].to_dataframe()

s3_filepath = 'data.parquet'

pq.write_to_dataset(
    Table.from_pandas(df),
    s3_filepath,
    filesystem=s3,
    use_dictionary=True,
    compression="snappy",
    version="2.4",
)



### single threaded

In [None]:

CHUNK_SZ = 100
VARS = ['JULD','LATITUDE','LONGITUDE','PRES','PRES_ADJUSTED','DOXY_ADJUSTED','DOXY_ADJUSTED_QC','NITRATE','NITRATE_ADJUSTED','PSAL','TEMP','CYCLE_NUMBER','PLATFORM_NUMBER']
for chunk in batched(wmo_fp,CHUNK_SZ):
    dflist = []
    for line in chunk:  
        fn = (line.split('/')[1] + "_Sprof.nc")
        fpath = Path(local_gdac) / "dac" / line / fn
        try:
            ds = xr.load_dataset(fpath)
        except:
            print(fpath)
        invars = list(set(VARS) & set(list(ds.data_vars)))
        df = ds[invars].to_dataframe()
        dflist.append(df)
    print(fpath)
    df = pd.concat(dflist)
    df.to_parquet('pq/test' + line.split('/')[1] + ".parquet",coerce_timestamps='us',allow_truncated_timestamps=True)
        
        