'''
Convert raw csv data into parquet
Input(s): Spire_Cargos_AIS_01012019_31122021_hourlydownsampled_0*.csv
Output(s): aisparquet.parquet
Runtime: 9 hours
'''

In [1]:
#pip install dask

In [2]:
#pip install distributed

In [3]:
#pip install dask_jobqueue

In [4]:
#pip install fastparquet

In [12]:
#pip install bokeh>=2.1.1

Note: you may need to restart the kernel to use updated packages.


In [1]:
import dask.dataframe as dd
import glob, os, time
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

  from distributed.utils import tmpfile


In [8]:
def convert_csv_parquet(files, outdir = os.path.join(os.getcwd(), "parquetdata"), usecols = None, dtypes = None, date_cols = None, append = True):
    """Convert csv files to a parquet dataset.

    The csv files are read lazily with ``dd.read_csv`` and written out with
    ``to_parquet``; the dataframe index is not written.

    Parameters
    ----------
    files
        Path, glob string or list of paths handed to ``dd.read_csv``.
    outdir
        Destination handed to ``to_parquet``. Defaults to a ``parquetdata``
        directory inside the working directory that was current when this
        function was defined (the default is evaluated once, at definition
        time).
    usecols
        Columns to read from the csv files; ``None`` reads all of them.
    dtypes
        Mapping of column name to dtype, forwarded as ``dtype``.
    date_cols
        Columns to parse as dates, forwarded as ``parse_dates``.
    append
        Forwarded to ``to_parquet(append=...)``.

    Returns
    -------
    None
    """
    frame = dd.read_csv(
        files,
        usecols = usecols,
        dtype = dtypes,
        parse_dates = date_cols,
        # Columns without an explicit dtype that look like integers are read
        # as floats so that missing values do not break the inferred dtype.
        assume_missing = True,
        verbose = False
    )
    frame.to_parquet(
        outdir,
        write_index = False,
        append = append
    )

# Parsing details

In [9]:
# Columns kept from the raw csv files.
usecols = [
    'timestamp',
    'mmsi',
    'msg_type',
    'latitude',
    'longitude',
    'speed',
    'heading',
    'draught',
]

# Explicit dtype for every retained column except the timestamp, which is
# parsed as a date instead (see date_cols).
# NOTE(review): speed, heading and draught use half precision to reduce
# size; confirm the parquet engine in use can write float16 columns.
dtypes = dict(
    mmsi='int32',
    msg_type='int8',
    latitude='float32',
    longitude='float32',
    speed='float16',
    heading='float16',
    draught='float16',
)

# Columns handed to the csv reader for date parsing.
date_cols = ['timestamp']

# Files to convert

In [10]:
# Directory holding the raw AIS csv files.
filepath = '/scratch/petersal/ShippingEmissions/src/data/AIS/ais_csv'
# Substring shared by the names of all csv files to convert.
filekeystring = "Spire_Cargos_AIS_01012019_31122021_hourlydownsampled_0"
# Every file in filepath whose name contains the key string, limited to the
# first two matches returned by glob.
files = glob.glob(os.path.join(filepath, f'*{filekeystring}*'))[:2]

# Cluster setup

In [2]:
# Describe the SLURM jobs backing the Dask cluster: 8 cores and 64GB of
# memory per job, a 15 minute walltime, billed to the given project/account.
cluster = SLURMCluster(
    walltime='00:15:00',
    memory="64GB",
    cores=8,
    project='def-kasahara-ab',
)
# Connect a client to the cluster so that Dask work is sent to it.
client = Client(cluster)
# Bare expression: in a notebook this renders the cluster widget.
cluster

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

In [None]:
# Show the URL of the Dask dashboard served for this client's cluster.
client.dashboard_link

'http://172.16.138.254:8787/status'

In [None]:
# Print the batch job script the cluster generates, to inspect it before scaling.
print(cluster.job_script())

In [None]:
# Ask the cluster to scale to 8 workers.
cluster.scale(8)

In [None]:
# IPython shell escape: list the jobs of user petersal in the SLURM queue.
!squeue -u petersal

# Convert

In [None]:
# Report which files are about to be converted (names only, no directories).
print(f"Converting {len(files)} files from {filepath}:")
for file in map(os.path.basename, files):
    print(file)

start = time.time()
try:
    # The parquet dataset goes to an 'ais_raw' directory next to the csv
    # directory; append = False makes the conversion start a new dataset.
    convert_csv_parquet(files, os.path.join(os.path.dirname(filepath), 'ais_raw'), usecols, dtypes, date_cols = date_cols, append = False)
    end = time.time()
    print(f"Elapsed time: {(end - start)}")
finally:
    # Always release the cluster resources, even when the conversion fails.
    # The client disconnects first, then the cluster it was attached to is
    # shut down.
    client.close()
    cluster.close()