# io

> File operations on chronicle parquet files

In [None]:
#| default_exp io

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import polars as pl
import pyarrow.parquet as pq
from pyarrow import fs
# import pyarrow.dataset as ds
# from s3fs import S3FileSystem
import tempfile
import os
import re
from chronicle.core import read_chronicle_metrics

In [None]:
#| export
def write_parquet(
        x: pl.DataFrame, # polars DataFrame holding the chronicle data
        filename:str # Full path of the parquet file to write
    ) -> None:
    "Write a chronicle polars DataFrame to `filename` as a parquet file"
    # polars -> Arrow conversion first, then let pyarrow do the actual write
    arrow_table = x.to_arrow()
    return pq.write_table(arrow_table, filename)


In [None]:
m = read_chronicle_metrics("./data")

# create an (initially empty) temporary file to write into
tf = tempfile.NamedTemporaryFile(suffix = ".parquet")
assert os.path.getsize(tf.name) == 0
# Pass the path (a str, as `write_parquet` documents) rather than the open
# file object: writing through the Python file handle leaves data in its
# buffer, so the on-disk size checked below would depend on flush timing.
z = write_parquet(m, tf.name)

# the file now has content on disk and the function returns nothing
assert os.path.getsize(tf.name) > 0
assert z is None

In [None]:
#| export
def get_s3_bucket_dates(
        bucket:str, # S3 bucket name, without the "s3://" prefix
        type:str="logs", # "logs" or "metrics"
        version:str="v1" # "v1" or "v2"
    ) -> list:
    "Get a sorted list of unique dates ('YYYY/MM/DD' strings) for which there are chronicle logs or metrics in an S3 bucket"
    s3 = fs.S3FileSystem()
    # recursively list everything below `<bucket>/<version>/<type>`
    selector = fs.FileSelector(f'{bucket}/{version}/{type}', recursive=True)
    entries = s3.get_file_info(selector)
    date_pattern = re.compile(r'\d{4}/\d{2}/\d{2}')
    # a set de-duplicates dates that are shared by several files
    dates = set()
    for entry in entries:
        # only files are of interest; directory entries are ignored
        if not entry.is_file:
            continue
        found = date_pattern.search(entry.path)
        # skip files whose path carries no date instead of raising IndexError
        if found is not None:
            dates.add(found.group(0))
    # zero-padded 'YYYY/MM/DD' strings sort chronologically as plain text
    return sorted(dates)

In [None]:
#| eval: false
# example: list the dates that have metrics stored in this bucket
bucket = "colorado-posit-chronicle"
get_s3_bucket_dates(bucket, type="metrics")