In [1]:
import awswrangler as wr
import fsspec
import pandas as pd
from aodndata.soop.soop_xbt_nrt import parse_bufr_file


In [4]:
# Source bucket and key prefix that the file-listing cell globs for CSV files.
BUCKET = "imos-data-lab-raw"
PATH = "IMOS/SOOP/SOOP-XBT/REALTIME_BUFR"

# Bucket and key prefix for optimised output.
# NOTE(review): neither constant is referenced by the cells visible in this
# notebook (the parquet writer hard-codes its own s3:// path) — confirm whether
# they are still needed or should replace that literal.
BUCKET_OPTIMISED = "imos-data-lab-optimised"
PATH_OPTIMISED = "TEST_DATA/XBT"

In [5]:
# Enumerate the CSV objects sitting one directory level below PATH in the raw
# bucket, and prefix every listed entry with 's3://'.
fs = fsspec.filesystem('s3')
pattern = f"s3://{BUCKET}/{PATH}/*/*.csv"
files = ['s3://' + key for key in fs.glob(pattern)]
files[0]

's3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200901134700_D5LR9.csv'

In [6]:
def _profile_to_frame(profile):
    """Flatten one parsed profile into a DataFrame with one row per sample.

    Args:
        profile: Mapping with the keys ``profile_geotime`` (``date_utc``,
            ``longitude``, ``latitude``), ``profile_metadata``
            (``XBT_uniqueid``) and ``profile_data`` (``depth``, ``temp`` and
            the three ``glob_gtspp*`` flag arrays).

    Returns:
        DataFrame with columns ``depth``, ``temp``, ``glob_gtspp``,
        ``glob_gtspp_depth``, ``glob_gtspp_temp``, ``uid``, ``geom`` and
        ``datetime``.
    """
    geotime = profile["profile_geotime"]
    uid = profile["profile_metadata"]["XBT_uniqueid"]

    # Rendered to minute precision with the seconds forced to "00". The value
    # is derived locally, so the parsed profile itself is left untouched.
    time = f'{geotime["date_utc"]:%Y-%m-%dT%H:%M:00}'
    point = f'POINT ({geotime["longitude"]} {geotime["latitude"]})'

    p = profile["profile_data"]
    data_headers = ["depth", "temp"]
    data_headers_array = ["glob_gtspp", "glob_gtspp_depth", "glob_gtspp_temp"]

    # depth/temp are read through `.values`; the flag arrays are iterated
    # directly. zip() pairs them row-wise and stops at the shortest input.
    to_zip = [list(p[var].values) for var in data_headers] + [
        list(p[var]) for var in data_headers_array
    ]
    df = pd.DataFrame(list(zip(*to_zip)), columns=data_headers + data_headers_array)

    df["uid"] = uid
    df["geom"] = point
    df["datetime"] = time
    df["datetime"] = pd.to_datetime(df.datetime)
    for var in data_headers_array:
        df[var] = df[var].astype(int)
    return df


def process_file(file, test=False):
    """Parse one BUFR-derived file and append every profile in it to parquet.

    Previously the function returned straight after the first successful
    write, so any further profiles in the same file were never written. All
    profiles are now written and their write results are merged.

    Args:
        file: Path/URL handed unchanged to ``parse_bufr_file``.
        test: When true, nothing is written; the DataFrame built for the first
            profile is returned instead.

    Returns:
        * ``test=True``: the first profile's DataFrame (``None`` if the file
          yields no profiles).
        * ``test=False``: a dict with the combined ``paths`` and
          ``partitions_values`` of every successful write, or ``None`` when
          nothing was written.
    """
    print(f"Processing file {file}")
    profiles = parse_bufr_file(file)

    written = None
    for profile in profiles:
        df = _profile_to_frame(profile)

        if test:
            return df

        result = wr.s3.to_parquet(
            df=df,
            path="s3://imos-data-lab-optimised/TEST_DATA/XBT-WRANGLER/",
            dataset=True,
            database="profile",
            table="xbt-wrangler",
            mode="append",
            partition_cols=["uid"],
        )
        if not result:
            print(f"Failed to write {file} to parquet")
            continue

        # Fold this profile's write result into the file-level summary so the
        # return value keeps the same dict shape as a single write.
        if written is None:
            written = {"paths": [], "partitions_values": {}}
        written["paths"].extend(result.get("paths", []))
        written["partitions_values"].update(result.get("partitions_values", {}))

    return written


In [7]:
import concurrent.futures

# Fan the files out over a pool of up to 100 worker threads.
# `executor.map` only surfaces a worker's exception when its result iterator is
# consumed, which the previous code never did, so failures vanished silently.
# Submitting each file and checking its future reports every failure per file
# while still letting the rest of the batch finish.
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    futures = {executor.submit(process_file, file): file for file in files}
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as exc:
            print(f"Failed to process {futures[future]}: {exc!r}")


Processing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200901134700_D5LR9.csvProcessing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200901170100_D5LR9.csv

Processing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200901210400_D5LR9.csv
Processing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200902010100_D5LR9.csv
Processing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200902050000_D5LR9.csvProcessing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200902085600_D5LR9.csv

Processing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200902130400_D5LR9.csvProcessing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC_20200902180000_D5LR9.csv

Processing file s3://imos-data-lab-raw/IMOS/SOOP/SOOP-XBT/REALTIME_BUFR/2020/IOSS01_AMMC

In [2]:
# Read back the parquet files under the prefix that `process_file` writes to.
# The trailing bare `.query()` was removed: DataFrame.query requires an
# expression argument, so it would raise TypeError as soon as the read succeeds.
wr.s3.read_parquet("s3://imos-data-lab-optimised/TEST_DATA/XBT-WRANGLER/")


wr.s3.read_partu

ClientError: An error occurred (ExpiredToken) when calling the ListObjectsV2 operation: The provided token has expired.

In [3]:
import awswrangler as wr

# Pull a five-row sample of the `soop_xbt_nrt` table from the `profile` database.
sample_sql = "select * from soop_xbt_nrt limit 5"
table = wr.athena.read_sql_query(sql=sample_sql, database="profile")


In [5]:
# Show which container type the Athena query result was returned as.
type(table)

pandas.core.frame.DataFrame

In [6]:
import dask.dataframe as dd


  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [9]:
%%time

import dask.dataframe as dd
df = dd.read_parquet('s3://imos-data-lab-optimised/parquet/xbt/', columns=['temp'], engine='pyarrow')
df.temp.mean().compute()

CPU times: user 1.84 s, sys: 269 ms, total: 2.11 s
Wall time: 2.17 s


8.40091803179103