'''
Convert raw csv data into parquet
Input(s): Spire_Cargos_AIS_01012019_31122021_hourlydownsampled_0*.csv
Output(s): aisparquet/ (parquet dataset directory, written next to the input files)
Runtime: 9 hours
'''

In [None]:
import dask.dataframe as dd
import glob, os, time

In [None]:
from dask.distributed import Client, LocalCluster
# Start a dask cluster on this machine with 8 workers, then connect a client
# to it. Both objects are kept in module-level names so they stay alive for
# the rest of the session.
# NOTE(review): per the dask.distributed docs, a newly created Client becomes
# the default scheduler for later dask computations -- confirm for the
# installed version.
cluster = LocalCluster(n_workers=8)
client = Client(cluster)

In [None]:
def convert_csv_parquet(files, outdir = os.path.join(os.getcwd(), "parquetdata"), usecols = None, dtypes = None, date_cols = None, append = True):
    """Convert csv files to a parquet dataset.

    The csv files are read lazily with ``dask.dataframe.read_csv`` and the
    resulting dataframe is written to ``outdir`` with ``to_parquet``.

    Parameters
    ----------
    files : str or list of str
        Path(s) of the csv files; passed straight to ``dd.read_csv``.
    outdir : str, optional
        Destination of the parquet dataset. Defaults to a ``parquetdata``
        directory inside the working directory that was current when this
        function was defined.
    usecols : list of str, optional
        Columns to read from the csv files (``None`` passes no restriction).
    dtypes : dict, optional
        Mapping of column name to dtype, passed as ``dtype`` to ``dd.read_csv``.
    date_cols : list of str, optional
        Columns passed as ``parse_dates`` to ``dd.read_csv``.
    append : bool, optional
        Passed as ``append`` to ``to_parquet``.

    Returns
    -------
    None
        The function is called for its side effect of writing ``outdir``.
    """
    ddf = dd.read_csv(
        files,
        usecols=usecols,
        dtype=dtypes,
        parse_dates=date_cols,
        # NOTE(review): per the dask docs, integer columns without an explicit
        # dtype are then read as floats so missing values do not break the
        # inferred dtype -- confirm for the installed version.
        assume_missing=True,
        verbose=False,
    )
    # The index is not written to the parquet dataset.
    ddf.to_parquet(
        outdir,
        write_index=False,
        append=append,
    )

# Parsing details

In [None]:
# Column name -> dtype for every non-date column that is kept. Narrow 8/16/32
# bit types are used to keep the converted data small.
dtypes = dict(
    mmsi='int32',
    msg_type='int8',
    latitude='float32',
    longitude='float32',
    speed='float16',
    heading='float16',
    draught='float16',
)
# Columns parsed as dates instead of being given a fixed dtype.
date_cols = ['timestamp']
# Columns to load: the date column followed by the typed columns, in the
# order they are declared above.
usecols = date_cols + list(dtypes)

# Files to convert

In [None]:
# Directory that holds the raw csv files.
filepath = '/media/apeters/Extreme SSD/maritime_client_ubc'
# Substring shared by the names of all input files.
filekeystring = "Spire_Cargos_AIS_01012019_31122021_hourlydownsampled_0"
# glob returns matches in arbitrary order; sort them so the files are always
# read (and the output partitions written) in the same order.
files = sorted(glob.glob(os.path.join(filepath, '*' + filekeystring + '*')))

# Convert

In [None]:
# List the inputs about to be converted (file names only, without directory).
print(f"Converting {len(files)} files from {filepath}:")
for path in files:
    print(os.path.basename(path))

# Run the conversion into <filepath>/aisparquet without appending to an
# existing dataset, and report the wall-clock duration in seconds.
start = time.time()
convert_csv_parquet(
    files,
    outdir=os.path.join(filepath, 'aisparquet'),
    usecols=usecols,
    dtypes=dtypes,
    date_cols=date_cols,
    append=False,
)
end = time.time()
print(f"Elapsed time: {(end - start)}")