In [1]:
from pathlib import Path
import pandas as pd
import fsspec

In [2]:
# Bucket/prefix that the glob in the next cell searches for PNG files.
image_bucket = "ncar-cesm-pop-test/images"
# S3 filesystem handle used for all listing below. `anon=True` is forwarded as
# a storage option -- presumably anonymous (credential-free) access; confirm
# against the S3 backend's documentation.
fs = fsspec.filesystem("s3", anon=True)

In [3]:
# Match every PNG exactly two directory levels below any folder under the
# bucket prefix whose name starts with "g.e22".
png_pattern = f"{image_bucket}/g.e22*/*/*.png"
files = fs.glob(png_pattern)
len(files)

4633

In [4]:
def parse_file(file):
    """Parse one image object key into a flat metadata record.

    The key is expected to look like
    ``<prefix>/<casename>/<plot_type>/<varname>[+<time>][+<key>@<value>...].png``.
    For ``global-timeseries`` plots the ``key@value`` tokens start right after
    the variable name; for every other plot type the second ``+``-separated
    token is skipped (and stored as ``time`` for ``timestep-global-map``).

    Parameters
    ----------
    file : str
        Object key without the ``s3://`` scheme.

    Returns
    -------
    dict
        ``{}`` when the key contains ``"checkpoint"``. Otherwise a dict with
        ``path``, ``casename``, ``plot_type``, ``varname``, optionally
        ``time``, and one entry per ``key@value`` token. A token whose key
        starts with ``z_t`` is additionally copied to ``depth_level``.
        All parsed values are left as strings.
    """
    path = file
    # Skip checkpoint artefacts before doing any parsing work.
    if "checkpoint" in path:
        return {}

    file = Path(path)
    # Use ``.name`` (not ``.stem``) for directories: ``.stem`` would cut a
    # directory name at its last dot, which is wrong for a folder.
    plot_type = file.parent.name
    record = {
        "path": f"s3://{path}",
        "casename": file.parent.parent.name,
        "plot_type": plot_type,
    }

    parts = file.stem.split("+")
    record["varname"] = parts[0]

    if plot_type == "global-timeseries":
        first_attr = 1
    else:
        first_attr = 2
        # Guard against names that carry no time token at all.
        if plot_type == "timestep-global-map" and len(parts) > 1:
            record["time"] = parts[1]

    # Slicing handles "no attributes" naturally, so no length guard is needed;
    # a global-timeseries file with a single attribute is therefore kept.
    for part in parts[first_attr:]:
        pieces = part.split("@")
        if len(pieces) < 2:
            # Not a key@value token; ignore instead of raising IndexError.
            continue
        key, value = pieces[0], pieces[1]
        record[key] = value
        if key.startswith("z_t"):
            record["depth_level"] = value
    return record


# One metadata record per globbed key, in the same order as `files`.
entries = list(map(parse_file, files))

In [5]:
# Build the catalog in one chain: drop records without a path (the empty dicts
# returned for skipped files), cast the depth columns to float, replace missing
# depth levels with the -9999 sentinel, and turn the "True" string flag into a
# real boolean (anything else, including missing values, becomes False).
df = (
    pd.DataFrame(entries)
    .dropna(subset=["path"])
    .astype({"z_t": float, "z_t_150m": float, "depth_level": float})
    .assign(
        depth_level=lambda frame: frame["depth_level"].fillna(-9999),
        log_10=lambda frame: frame["log_10"].map(lambda flag: flag == "True"),
    )
)
df.head()

Unnamed: 0,path,casename,plot_type,varname,spatial_op,time_coarsen_len,z_t,depth_level,z_t_150m,log_10,time_range,time
0,s3://ncar-cesm-pop-test/images/g.e22.G1850ECO_...,g.e22.G1850ECO_JRA_HR.TL319_t13.003,global-timeseries,CaCO3_FLUX_100m,integrate,12,,-9999.0,,False,,
1,s3://ncar-cesm-pop-test/images/g.e22.G1850ECO_...,g.e22.G1850ECO_JRA_HR.TL319_t13.003,global-timeseries,DpCO2,average,12,,-9999.0,,False,,
2,s3://ncar-cesm-pop-test/images/g.e22.G1850ECO_...,g.e22.G1850ECO_JRA_HR.TL319_t13.003,global-timeseries,FG_CO2,integrate,12,,-9999.0,,False,,
3,s3://ncar-cesm-pop-test/images/g.e22.G1850ECO_...,g.e22.G1850ECO_JRA_HR.TL319_t13.003,global-timeseries,NH4,average,12,500.0,500.0,,False,,
4,s3://ncar-cesm-pop-test/images/g.e22.G1850ECO_...,g.e22.G1850ECO_JRA_HR.TL319_t13.003,global-timeseries,NHx_SURFACE_EMIS,integrate,12,,-9999.0,,False,,


In [6]:
# Persist the catalog as CSV; the DataFrame index is not written.
output_csv = "HiRes-CESM-analysis.csv"
df.to_csv(output_csv, index=False)