In [None]:
import dask
import fsspec
import pandas as pd
from dask.distributed import Client

import carbonplan_benchmarks.analysis as cba

In [None]:
# Create the Dask distributed client; the worker and thread counts are named
# so they can be tuned in one place.
N_WORKERS = 8
THREADS_PER_WORKER = 2
client = Client(n_workers=N_WORKERS, threads_per_worker=THREADS_PER_WORKER)

# Create a list of benchmarking results

First, define the path to the baseline images that the tests will be compared against, the base path of the metadata files associated with each benchmarking run, and the URL filter that is passed on when the summaries are created.

In [None]:
# Prefix under which the per-run metadata files are globbed.
metadata_base_fp = "s3://carbonplan-benchmarks/benchmark-data/v0.2"
# JSON file handed to `cba.load_snapshots` to load the baseline snapshots.
baseline_fp = "s3://carbonplan-benchmarks/benchmark-data/v0.2/baselines.json"
# Forwarded unchanged as the `url_filter` argument of `cba.create_summary`.
url_filter = "carbonplan-benchmarks.s3.us-west-2.amazonaws.com/data/"

In [None]:
# Find every per-run metadata file (data-*.json) under the benchmark prefix.
fs = fsspec.filesystem('s3')
files = fs.glob(f'{metadata_base_fp}/data-*.json')

# Keep only the file name of each match and re-attach it to the base prefix,
# so every entry is a full URL rooted at `metadata_base_fp`.
# NOTE(review): presumably needed because the glob results lack the s3:// scheme - confirm.
metadata_files = [
    f"{metadata_base_fp}/{matched_path.rsplit('/', 1)[-1]}"
    for matched_path in files
]
print(len(metadata_files))

# Load baseline images for comparison

In [None]:
# Load the baseline snapshots from `baseline_fp`. `add_summary` later hands
# this object to `cba.process_run` for every benchmarking run.
snapshots = cba.load_snapshots(snapshot_path=baseline_fp)

# Create summary table

Use the utilities from `carbonplan_benchmarks` to load the metadata and baseline images into DataFrames, process those results, and create a summary DataFrame for all runs.

In [None]:
@dask.delayed
def add_summary(fp):
    """Build the summary for a single benchmarking run.

    Loads the metadata and trace events from the metadata file at ``fp``
    (with ``run=0``), processes them together with the notebook-level
    ``snapshots``, and returns the output of ``cba.create_summary`` called
    with the notebook-level ``url_filter``.

    The function is wrapped with ``dask.delayed``, so calling it only builds
    a lazy task; the work happens when the task is computed.

    Parameters
    ----------
    fp : str
        Path to the metadata file of the benchmarking run.
    """
    run_metadata, run_trace_events = cba.load_data(metadata_path=fp, run=0)
    processed = cba.process_run(
        metadata=run_metadata,
        trace_events=run_trace_events,
        snapshots=snapshots,
    )
    return cba.create_summary(
        metadata=run_metadata,
        data=processed,
        url_filter=url_filter,
    )

In [None]:
# Queue one lazy `add_summary` task per metadata file; nothing is computed yet.
result = [add_summary(metadata_fp) for metadata_fp in metadata_files]

In [None]:
# Run all queued `add_summary` tasks. `result` is passed as a single argument,
# so the computed list of summaries is the first element of the returned value.
summary_dfs = dask.compute(result)

In [None]:
# The computed summaries sit in the first element of `summary_dfs`;
# unwrap that list and stack the per-run frames into one table.
per_run_summaries = summary_dfs[0]
summary = pd.concat(per_run_summaries)
summary

In [None]:
def _region_for(metadata_path):
    """Return the region label for a run from its metadata path.

    Paths containing '2023-08-24' are labelled 'us-west-2'; every other
    path is labelled 'us-east-1'.
    """
    if '2023-08-24' in metadata_path:
        return 'us-west-2'
    return 'us-east-1'


# Tag every row of the summary with the region derived from its metadata path.
summary['region'] = summary['metadata_path'].apply(_region_for)

# Export summary table

In [None]:
# Destination of the combined summary table.
output_fp = "s3://carbonplan-benchmarks/benchmark-data/v0.2/summary.parq"
# Write the summary DataFrame (including the added `region` column) as Parquet.
summary.to_parquet(output_fp)