## CSV to Zarr

#### Import libraries

In [133]:
import io
import os
import time
from csv import reader
from itertools import islice

import fsspec
import numpy as np
import pandas as pd

#### Generate dataframe

In [184]:
# Anonymous access to the S3 bucket that holds the dive CSV files.
fs = fsspec.filesystem('s3', anon=True)
all_files = fs.glob('s3://imos-data-pixeldrill/viet-test/csv/*.csv')

# Each CSV stacks two tables in one file:
#   line 0: dive-level metadata header, line 1: the single metadata record,
#   line 2: per-image header, line 3+: per-image records.
# The two tables are parsed separately, stacked, and the metadata values are
# forward-filled down so every row carries the dive metadata.
#
# NOTE(review): the stacked frame keeps the metadata record as row 0, whose
# per-image columns are all NaN, and ffill also fills any gap in the per-image
# columns from the previous row. Confirm both are intended before writing to Zarr.
start_time = time.perf_counter()  # perf_counter is the clock meant for measuring intervals
dfs = []
for file in all_files:
    s3_fn = 's3://' + file
    # Download the object once and parse both tables from memory, instead of
    # opening (and fetching) the same S3 object twice.
    with fs.open(s3_fn, "rb") as f:
        raw = f.read()
    df_metadata = pd.read_csv(io.BytesIO(raw), nrows=1)
    df_data = pd.read_csv(io.BytesIO(raw), skiprows=[0, 1])

    # Chained ffill instead of inplace=True: same result, no in-place mutation.
    df = pd.concat([df_metadata, df_data], axis=0, ignore_index=True).ffill(axis=0)
    dfs.append(df)
print("---------- Total: %.2f seconds ----------" % (time.perf_counter() - start_time))

---------- Total: 6.85 seconds ----------


In [185]:
# Display the combined frame built from the first CSV file as a sanity check
# (bare last expression so the notebook renders it with the rich DataFrame repr).
dfs[0]

Unnamed: 0,dive_number,dive_name,dive_metadata_uuid,facility_code,campaign_code,dive_code,distance_covered_in_m,number_of_images,image_path,abstract,...,time,cluster_tag,up_left_lon,up_left_lat,up_right_lon,up_right_lat,low_right_lon,low_right_lat,low_left_lon,low_left_lat
0,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,,,,,,,,,,
1,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,20120303T063355Z,3.0,114.260646,-21.605323,114.260658,-21.605319,114.260661,-21.605328,114.260649,-21.605331
2,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,20120303T063356Z,2.0,114.260646,-21.605320,114.260659,-21.605318,114.260661,-21.605327,114.260648,-21.605329
3,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,20120303T063357Z,2.0,114.260646,-21.605316,114.260661,-21.605315,114.260661,-21.605326,114.260647,-21.605326
4,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,20120303T063358Z,2.0,114.260648,-21.605312,114.260663,-21.605314,114.260661,-21.605324,114.260646,-21.605322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6742,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,20120303T082622Z,3.0,114.259621,-21.574862,114.259627,-21.574849,114.259637,-21.574853,114.259631,-21.574866
6743,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,20120303T082623Z,3.0,114.259616,-21.574861,114.259622,-21.574848,114.259632,-21.574852,114.259626,-21.574864
6744,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,20120303T082624Z,3.0,114.259611,-21.574859,114.259617,-21.574847,114.259627,-21.574851,114.259621,-21.574863
6745,2.0,n01 muiron long leg auv1,37a987ea-94ea-447c-ac63-961269362128,AUV,Ningaloo201203,r20120303_061943_n01_muiron_long_leg_auv1,3534.919783,6746.0,i20120303_061943_gtif,survey ningaloo reef,...,20120303T082625Z,3.0,114.259606,-21.574858,114.259612,-21.574845,114.259622,-21.574849,114.259616,-21.574862


In [50]:
# check if columns across the CSV files are identical
# `all_cols` was not defined by any earlier cell, so this check raised
# NameError on a fresh kernel (Restart & Run All). Build it from the frames
# loaded above.
# NOTE(review): assumes the original `all_cols` held one column list per file — confirm.
all_cols = [list(frame.columns) for frame in dfs]
all(x==all_cols[0] for x in all_cols)

True

In [122]:
# Peek at the raw layout of the first CSV file. Only the first few rows are
# needed to see its structure (metadata header, metadata record, per-image
# header, then per-image records); printing every row floods the notebook
# with thousands of lines of output.
N_PREVIEW_ROWS = 5
with fs.open(all_files[0], "r") as my_file:
    # pass the file object to reader(); each row comes back as a list of strings
    file_reader = reader(my_file)
    # islice stops after N_PREVIEW_ROWS rows without reading the rest of the file
    for row in islice(file_reader, N_PREVIEW_ROWS):
        print(row)

['dive_number', 'dive_name', 'dive_metadata_uuid', 'facility_code', 'campaign_code', 'dive_code', 'distance_covered_in_m', 'number_of_images', 'image_path', 'abstract', 'platform_code', 'pattern', 'dive_report_path', 'kml_path', 'geospatial_lat_min', ' geospatial_lon_min', 'geospatial_lat_max', ' geospatial_lon_max', 'geospatial_vertical_min', ' geospatial_vertical_max', 'time_coverage_start', ' time_coverage_end']
['2', 'n01 muiron long leg auv1', '37a987ea-94ea-447c-ac63-961269362128', 'AUV', 'Ningaloo201203', 'r20120303_061943_n01_muiron_long_leg_auv1', '3534.919783', '6746', 'i20120303_061943_gtif', 'survey ningaloo reef', 'SIRIUS', 'Trajectory', 'Ningaloo201203/all_reports/r20120303_061943_n01_muiron_long_leg_auv1_report.pdf', 'Ningaloo201203/r20120303_061943_n01_muiron_long_leg_auv1/track_files/n01_muiron_long_leg_auv1.kml', '-21.605319', '114.259607', '-21.574844', '114.261265', '55.279468', '65.668699', '20120303T063355Z', '20120303T082626Z']
['campaign_code', 'dive_code', 'ima