In [None]:
#default_exp create_project
%reload_ext autoreload
%autoreload 2

# Initializing a new project

## Folder structure
Assumes the following folder structure. It is important that logger files are named `record_NN.h5` or `record_NNN.h5`

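Note that run parameters are matched to records purely by position: run `i` takes the `i`-th entry of every list in `parameter_grid.json` (see below). The record filenames therefore **MUST** be numbered starting from 0 (`record_00.h5` is run 0).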

```
SIMULATION_NAME/
    parameter_grid.json #Describes each run
    record_00.h5
    record_01.h5
    ...
    record_NN.h5 # Each logger run
    sites.geojson # Polygons of the regions
```

From this information, you can specify an `output_directory` where summaries will automatically be available for frontend analysis.

```
public/demo/projects/SIMULATION_NAME/
    metadata.json
    summary_000.csv
    summary_001.csv
    ...
    summary_NNN.csv
```

We are provided a `parameter_grid.json` file that looks like the following:

```
{
    "pub": [
        0.0953169,
        0.521456,
        0.40569099999999997,
        0.484659,
        0.138482
    ],
    "grocery": [
        0.387384,
        0.452953,
        0.548852,
        0.042028699999999995,
        0.21261799999999997
    ], ...
}
```

(In this case, there are 5 runs and each run takes the parameter listed. This makes it tricky to do a grid search in the interface since many values will be distinct...)

In [None]:
#export
from pathlib import Path
import shutil
import os
import pandas as pd
import geopandas as gpd
import shapely
from shapely.geometry.polygon import Polygon
from shapely.ops import cascaded_union, unary_union
import numpy as np
from time import time
from typing import *
import junevis.path_fixes as pf
import json

import junevis.process_loggers as process_loggers

## Check available projects

Because extracting summaries from the records can take a while, we don't want to overwrite an existing project unless this is explicitly requested (the `force_add_project` flag).

In [None]:
#export
def init_available_projects(project_name: str,
                            force_add_project: bool = False,
                            outdir: Optional[Union[str, Path]] = None):
    """Return the set of registered project names, excluding `project_name`.

    The registry is the text file `pf.AVAILABLE_PROJECTS` (one project name per
    line). It is created if it does not exist yet.

    Args:
        project_name: Name of the project that is about to be created.
        force_add_project: If True, an already-registered project of the same
            name is removed from the registry instead of raising.
        outdir: Directory holding the existing project's files. Only used when
            `force_add_project` is True and the project exists; the directory is
            deleted. The caller is responsible for recreating it afterwards.

    Returns:
        The names of all other registered projects (blank lines are ignored).

    Raises:
        ValueError: If the project is already registered and
            `force_add_project` is False.
    """
    pf.AVAILABLE_PROJECTS.touch()  # make sure the registry exists before reading it

    with open(str(pf.AVAILABLE_PROJECTS), 'r+') as fp:
        available_projects = {line.strip() for line in fp if line.strip()}
        if project_name not in available_projects:
            return available_projects

        if not force_add_project:
            raise ValueError(
                f"Cannot create project of name '{project_name}': "
                f"Project already exists in {pf.AVAILABLE_PROJECTS}"
            )

        # Overwrite requested: drop the stale project files and its registry entry
        if outdir is not None and Path(outdir).exists():
            shutil.rmtree(outdir)
        available_projects.remove(project_name)

        # Rewrite the registry with the remaining projects rather than leaving it
        # empty, so the other projects are not lost if a later step fails.
        fp.seek(0)
        fp.truncate(0)
        fp.write("\n".join(sorted(available_projects)))
    return available_projects

## Create the Summary CSVs

> Take the `record_**.h5` and convert them to CSVs the frontend can parse

These record files can be on the order of 8GB and summarizing each can take about 45 minutes. It works, though it is not the most efficient or parallelized implementation

In [None]:
#export 
def summarize_h5(record_f, outdir):
    """Summarize one `record_NN.h5` logger file into `outdir/summary_NNN.csv`.

    The run id is taken from the part of the file stem after the first
    underscore. The regional outputs are extended with cumulative
    `currently_dead` / `currently_recovered` columns (cumulative sums within
    each group of the first index level), the index levels are named
    `region` and `timestamp`, and the result is written as CSV and returned.
    """
    t0 = time()
    run_id = record_f.stem.split("_")[1]
    print(f"Processing {run_id}: ")
    regional = process_loggers.regional_outputs(record_f)

    # Running totals are computed per group of the first index level
    by_region = regional.groupby(level=0)
    for cumulative_col, daily_col in (("currently_dead", "deaths"),
                                      ("currently_recovered", "recovered")):
        regional[cumulative_col] = by_region[daily_col].cumsum()

    # Name the index levels so they become proper CSV column headers
    summary = regional.rename_axis(index=["region", "timestamp"])

    destination = outdir / f"summary_{int(run_id):03}.csv"
    print(f"Saving to {str(destination)}")
    summary.to_csv(str(destination))
    print(f"\nTook {time() - t0} seconds")
    print("\n-------\n")
    return summary

## Creating the `metadata.json`

We want to convert the provided `parameter_grid.json` file into a `metadata.json` file (e.g., below) that also includes some basic summary statistics from the project. This has the format: 

```
{
    "description": "Learning center comparison",
    "parameters_varied": [
        "indoor_beta",
        "outdoor_beta",
        "household_beta",
        "learning_centers"
    ],
    "run_parameters": {
        "1": {
            "learning_centers": false,
            "household_beta": 0.2,
            "indoor_beta": 0.45,
            "outdoor_beta": 0.05
        },
        "2": {
            "learning_centers": false,
            "household_beta": 0.2,
            "indoor_beta": 0.55,
            "outdoor_beta": 0.05
        }, ...
    },
    "all_regions": [
        "CXB-201",
        "CXB-202", ...
    ], 
    "all_timestamps": [
        "2020-05-01",
        "2020-05-02", ...
    ], 
    "all_fields": [
        "currently_dead",
        "currently_in_hospital_0_12", ...
    ],
    "field_statistics": {
        "n_infections_in_communal": {
            "max": 132.0,
            "min": 0.0
        },
        "recovered": {
            "max": 1937.0,
            "min": 0.0
        }, ...
    }
}
```

This involves restructuring the provided parameter grid and parsing the new `summary_*.csv` files for the extents (min and max) of each field.

In [None]:
#export
def pgrid_to_run_parameters(parameter_grid: dict) -> dict:
    """Pivot a parameter grid from per-parameter lists into per-run dictionaries.

    `parameter_grid` maps each parameter name to a list holding one value per
    run. The result contains the list of parameter names under
    `parameters_varied` and, under `run_parameters`, a mapping from the run
    index (as a string, starting at "0") to that run's `{parameter: value}`.
    """
    per_run = {}

    for name, values in parameter_grid.items():
        for run_idx, value in enumerate(values):
            # The first parameter seen for a run creates that run's entry
            per_run.setdefault(str(run_idx), {})[name] = value

    return {
        "parameters_varied": list(parameter_grid),
        "run_parameters": per_run,
    }

In [None]:
#export
def collect_statistics(project: Union[str, Path]):
    """Collect project-wide statistics from every `summary*.csv` in `project`.

    Args:
        project: Directory containing the summary CSVs of one project.

    Returns:
        A dictionary with the sorted unique `region` values (`all_regions`),
        the sorted unique `timestamp` values (`all_timestamps`), the sorted
        column names (`all_fields`) and, under `field_statistics`, the
        `max` / `min` of every column other than `region` and `timestamp`
        across all runs.

    Raises:
        ValueError: If `project` contains no `summary*.csv` files.
    """
    project = Path(project)
    csvfs = sorted(project.glob("summary*.csv"))
    if not csvfs:
        # Without this guard pandas fails with the opaque "No objects to concatenate"
        raise ValueError(f"No summary*.csv files found in '{project}': cannot collect statistics")

    big_df = pd.concat([pd.read_csv(csvf) for csvf in csvfs], ignore_index=True)

    all_regions = set(big_df.region)
    all_timestamps = set(big_df.timestamp)
    # "Unnamed: 0" is what pandas calls an unlabeled index column written into a CSV
    all_fields = [f for f in big_df.columns if f != "Unnamed: 0"]

    string_fields = {"timestamp", "region"}
    numerical_fields = [f for f in all_fields if f not in string_fields]
    big_df_num = big_df.loc[:, numerical_fields]

    # Two rows ("max", "min") x one column per field -> {field: {"max": ..., "min": ...}}
    df_minmax = pd.DataFrame(
        [big_df_num.max(axis=0), big_df_num.min(axis=0)],
        index=["max", "min"],
    )
    field_minmaxes = df_minmax.to_dict(orient="dict")

    return {
        "all_regions": sorted(all_regions),
        "all_timestamps": sorted(all_timestamps),
        "all_fields": sorted(all_fields),
        "field_statistics": field_minmaxes
    }

## Copying the `sites.geojson`

This part is a bit simpler. We need to copy the `sites.geojson` file from the provided records to the output directory (the cleaned-up copy is written there as `sites.new.geojson`).

Note: some geojson files may be very large. This is the place to reduce the size to something more reasonable yet still functional.

Also, some geojson files for this project have been annotated with `SSID` as the 'property' that describes each region. Others are annotated with the `region` key. We need to unify this interface

### Fixing the sites.geojson

We need to unify the geojson file a bit. First, the files are terribly large with high resolution (making it very slow to load in the frontend), and the multipolygons are rendering incorrectly.

In [None]:
#export
def _largest_polygon(shape):
    """Return the largest-area part of a MultiPolygon; any other geometry is returned unchanged."""
    if shape is None or shape.geom_type != "MultiPolygon":
        return shape
    # `.geoms` is the supported accessor for the parts of a multi-part geometry;
    # iterating or indexing the MultiPolygon itself no longer works in Shapely 2.0.
    parts = list(shape.geoms)
    if not parts:
        return shape
    return max(parts, key=lambda part: part.area)


def fix_geojson(gjson_file):
    """Load a sites GeoJSON file and normalize it for the frontend.

    Every MultiPolygon is reduced to its single largest polygon, and the region
    name is made available under the `SSID` property the frontend expects.

    Args:
        gjson_file: Path to the GeoJSON file to read.

    Returns:
        The GeoDataFrame with simplified geometries and an `SSID` column.

    Raises:
        KeyError: If the file has neither a `region` nor an `SSID` property.
    """
    gdf = gpd.read_file(gjson_file)

    # Replace the whole geometry column at once (keeping index and CRS) instead of
    # assigning element by element, which depends on the index labels being 0..n-1.
    simplified = gpd.GeoSeries(
        [_largest_polygon(shape) for shape in gdf.geometry],
        index=gdf.index,
        crs=gdf.crs,
    )
    gdf = gdf.set_geometry(simplified)

    # The frontend operates with a `SSID` field instead of a `region` field to name each area.
    if "region" in gdf.columns:
        gdf['SSID'] = gdf['region']
    elif "SSID" not in gdf.columns:
        raise KeyError(f"'{gjson_file}' has neither a 'region' nor an 'SSID' property to name each area")
    return gdf

# Bundle as Script

In [None]:
#export
from fastcore.script import *

def _registered_project_names(registry) -> set:
    """Read the project registry file into a set of non-empty project names."""
    registry = Path(registry)
    if not registry.exists():
        return set()
    return {line.strip() for line in registry.read_text().splitlines() if line.strip()}


@call_parse
def main(record_path:Param("Path to JUNE simulation records and parameter grid", str), 
         force_add_project:Param("Overwrite project if it already exists", store_true)=False,
         test_only:Param("Test behavior without changing files", store_true)=False,
         project_name:Param("Name the project. If not provided, use folder name of `record_path`", str)=None, 
         description:Param("Description of project", str)="NA",
        ):
    """Create a project that can be visualized from the record files"""
    # Steps: (1) check/update the project registry, (2) summarize every `*.h5` record
    # into a CSV, (3) write `metadata.json`, (4) write the cleaned geojson and
    # (5) register the project. With `test_only`, nothing is written or deleted by
    # this function itself.

    base = Path(record_path) # Path where loggers and parameter grid are stored
    project_name = base.stem if project_name is None else project_name
    output_dir = pf.PROJECTS / project_name

    registered = _registered_project_names(pf.AVAILABLE_PROJECTS)
    if force_add_project and project_name in registered:
        # Overwrite requested: forget the old project before building the new one
        print(f"Project '{project_name}' already exists and will be overwritten")
        active_projects = registered - {project_name}
        if not test_only:
            if output_dir.exists(): shutil.rmtree(output_dir)
            with open(pf.AVAILABLE_PROJECTS, 'w') as fp:
                fp.write("\n".join(sorted(active_projects)))
    else:
        # Fails if the project already exists and no overwrite was requested
        active_projects = init_available_projects(project_name)

    if not test_only: output_dir.mkdir(parents=True, exist_ok=True)

    record_names = sorted(base.glob("*.h5"))
    for r in record_names:
        print(f"Summarizing {r}")
        if not test_only: summarize_h5(r, output_dir)

    print("ALL SUMMARIES COMPLETED\n-------------\n-------------\n")
    
    # Once the summary files have been created, we can accumulate the statistics into the `metadata.json` file
    print("Creating metadata...")
    with open(base / "parameter_grid.json") as fp:
        parameter_grid = json.load(fp)
    param_info = pgrid_to_run_parameters(parameter_grid)
    if test_only and not any(output_dir.glob("summary*.csv")):
        # A dry run produces no summaries, so there is nothing to collect statistics from
        print("No summary files available in test mode; skipping statistics")
        project_stats = {}
    else:
        project_stats = collect_statistics(output_dir)
    
    # Now we can save the metadata for this project, including the optional description
    metadata = {"description": description, **param_info, **project_stats}
    if not test_only:
        with open(output_dir / "metadata.json", 'w') as fp:
            json.dump(metadata, fp, indent=4)
        
    # Copy over the geography description
    print("Fixing geojson...")
    gdf = fix_geojson(base / "sites.geojson")
    if not test_only: gdf.to_file(output_dir / "sites.new.geojson", driver='GeoJSON')
    
    # Add to available projects
    print(f"Adding '{project_name}' to {pf.AVAILABLE_PROJECTS}")
    other_projects = sorted(p for p in active_projects if p and p != project_name)
    new_available_projects = "\n".join(other_projects + [project_name])
    print(f"New projects: {new_available_projects}")
    
    if not test_only:
        # 'w' truncates first, so no bytes of a longer previous registry can survive
        with open(pf.AVAILABLE_PROJECTS, 'w') as fp: 
            fp.write(new_available_projects)
            
    print("COMPLETE")

# Export -

In [None]:
#hide
# Run nbdev's notebook-to-module export (the "Converted ..." lines below are its output).
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_Create Project.ipynb.
Converted 01_Tokenizer.ipynb.
Converted Collect Global Statistics.ipynb.
Converted Scrap for init_project.py.ipynb.
Converted index.ipynb.
