In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import click
from pathlib import Path
import shutil
import os
import pandas as pd
import numpy as np
import pickle
import json
from IPython.display import Markdown as md
from typing import *

In [None]:
# UNDERSTANDING PROJECT PATHS RELATIVE TO CURRENT FILE
# In a standalone script the root would be derived from the file itself:
#     ROOT = Path(os.path.abspath(__file__)).parent.parent
# `__file__` is not available in a notebook, so the root is resolved from the
# kernel's working directory instead.
# NOTE(review): assumes the kernel is started one level below the project root -- confirm
ROOT = Path(os.path.abspath(".."))
SCRIPTS = ROOT / "scripts"
CLIENT = ROOT / "client"
PUBLIC = ROOT / "public"
DEMO = PUBLIC / "demo"
PROJECTS = DEMO / "projects"  # One sub-folder per project served to the client
DEFAULT_GEOJSON = DEMO / "coxs_bazar.geojson"  # Fallback when a project ships no geojson
AVAILABLE_PROJECTS = DEMO / "availableProjects.txt"  # One project name per line


# Example data
project_name = "learning_centers_simple"
# Resolve the downloads folder from the current user's home directory so the
# notebook is not tied to one machine's absolute path
data_dir = Path.home() / "Downloads" / project_name
out_dir = PROJECTS / project_name

# Data Exploring

## What do we have to work with?
> Exploring the output of a JUNE simulation

In [None]:
# Everything the simulation produced, ordered by file stem
contents = sorted(data_dir.glob("*"), key=lambda path: path.stem)
contents

[PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_000.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_001.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_002.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_003.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_004.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_005.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_006.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_007.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_008.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/locations_009.csv'),
 PosixPath('/Users/b

The outputs of the JUNE simulations are explored below:

### `summary_XXX.csv`

In [None]:
# Glob order depends on the filesystem; sort so the same example run is shown on every machine
summary_example_file = sorted(data_dir.glob("summary_*.csv"))[0]
# errors='ignore' keeps data that cannot be cast to unsigned ints (e.g. text) as it was read
summary_df_ex = pd.read_csv(summary_example_file).astype(np.uint64, errors='ignore')
summary_df_ex.head()

Unnamed: 0,time_stamp,region,current_infected,daily_infected,current_recovered,daily_recovered,current_hospitalised,daily_hospitalised,current_intensive_care,daily_intensive_care,current_susceptible,daily_hospital_deaths,daily_deaths
0,2020-05-24,CXB-219,3,0,0,0,0,0,0,0,22950,0,0
1,2020-05-24,CXB-203,3,0,0,0,0,0,0,0,25649,0,0
2,2020-05-24,CXB-216,0,0,0,0,0,0,0,0,6772,0,0
3,2020-05-24,CXB-209,4,0,0,0,0,0,0,0,24418,0,0
4,2020-05-24,CXB-217,4,0,0,0,0,0,0,0,29655,0,0


To this we want to add the columns from the location file in a format `n_infections_in_XXXX`

### `locations_XXX.csv`

In [None]:
# Glob order depends on the filesystem; sort so the same example run is shown on every machine
location_example_file = sorted(data_dir.glob("locations_*.csv"))[0]
location_df_ex = (
    pd.read_csv(location_example_file)
    .astype(np.uint64, errors='ignore')
    .fillna(value=0)
)

def rename_columns(c):
    """Prefix a location label to match the `n_infections_in_*` columns of the client summaries."""
    return f"n_infections_in_{c}"

# One row per timestamp, one column per location type; combinations absent from the file become 0
pivot_locations = (
    location_df_ex
    .pivot(index="timestamp", columns=["location_specs"])
    .fillna(value=0)
    .astype(np.uint32, errors='ignore')
)
# The pivot labels each column with a (value column, location) pair; keep only the location part
pivot_locations.columns = [rename_columns(c[1]) for c in pivot_locations.columns]
pivot_locations.head()

Unnamed: 0_level_0,n_infections_in_communal,n_infections_in_distribution_center,n_infections_in_e_voucher,n_infections_in_hospital,n_infections_in_learning_center,n_infections_in_play_group,n_infections_in_pump_latrine,n_infections_in_religious,n_infections_in_shelter
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-05-24,0,0,0,0,0,1,0,0,23
2020-05-25,3,0,0,0,4,1,2,0,38
2020-05-26,1,0,0,0,4,1,4,0,61
2020-05-27,7,0,0,0,11,1,2,0,72
2020-05-28,2,0,0,0,6,1,1,1,74


### `parameter_grid.pkl`

In [None]:
# Locate the parameter grid of the project (first file whose name starts with "parameter_grid")
pgrid_f = list(data_dir.glob("parameter_grid*"))[0]
# NOTE: unpickling can execute arbitrary code -- only open grids that come from a trusted source
with open(pgrid_f, 'rb') as fp:
    pgrid = pickle.load(fp)
    
# Display the loaded grid
pgrid

[{'learning_centers': False,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.45,
  'outdoor_beta_ratio': 0.05},
 {'learning_centers': False,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.55,
  'outdoor_beta_ratio': 0.05},
 {'learning_centers': False,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.55,
  'outdoor_beta_ratio': 0.15},
 {'learning_centers': False,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.55,
  'outdoor_beta_ratio': 0.1},
 {'learning_centers': False,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.65,
  'outdoor_beta_ratio': 0.05},
 {'learning_centers': True,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.45,
  'outdoor_beta_ratio': 0.05},
 {'learning_centers': True,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.55,
  'outdoor_beta_ratio': 0.05},
 {'learning_centers': True,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.55,
  'outdoor_beta_ratio': 0.15},
 {'learning_centers': True,
  'household_beta': 0.25,
  'indoor_beta_ratio': 0.55,
 

## What do we want?
> What does the client side expect as a project folder?

In [None]:
# An existing client-side project, used as the reference for the expected layout
ex_project = PROJECTS / "learning_centers"
contents = sorted(ex_project.glob("*"), key=lambda path: path.stem)
contents

[PosixPath('/Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_centers/metadata.json'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_centers/sites.geojson'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_centers/summary_001.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_centers/summary_002.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_centers/summary_003.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_centers/summary_004.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_centers/summary_005.csv'),
 PosixPath('/Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_centers/summary_006.csv'),
 PosixPath('

### The New `summary_XXX.csv`

The new `summary_XXX.csv` files contain the location information previously in the `locations_XXX.csv` file.

In [None]:
# Glob order depends on the filesystem; sort so the same reference run is shown on every machine
ex_client_summary_file = sorted(ex_project.glob("summary_*.csv"))[0]
ex_client_summary = pd.read_csv(ex_client_summary_file)
ex_client_summary.head()

Unnamed: 0,region,timestamp,n_infections_in_communal,n_infections_in_distribution_center,n_infections_in_e_voucher,n_infections_in_hospital,n_infections_in_learning_center,n_infections_in_play_group,n_infections_in_pump_latrine,n_infections_in_religious,...,recovered_12_25,recovered_25_65,recovered_65_101,currently_susceptible_0_12,currently_susceptible_12_25,currently_susceptible_25_65,currently_susceptible_65_101,currently_susceptible,currently_dead,currently_recovered
0,CXB-201,2020-05-01,0,0,0,0,0,0,0,0,...,0,0,0,14084,7944,15745,124,37897,0,0
1,CXB-201,2020-05-02,0,0,0,0,0,0,0,0,...,0,0,0,14084,7944,15745,124,37897,0,0
2,CXB-201,2020-05-03,0,0,0,0,0,0,0,0,...,0,0,0,14084,7944,15745,124,37897,0,0
3,CXB-201,2020-05-04,0,0,0,0,0,0,0,0,...,0,0,0,14084,7944,15745,124,37897,0,0
4,CXB-201,2020-05-05,0,0,0,0,0,0,0,0,...,0,0,0,14084,7944,15745,124,37897,0,0


Two groups of columns must be added to the raw simulation summaries to reach this format: the `n_infections_in_*` columns (available from the `locations_XXX.csv` files produced by the simulation), and the age-bracketed columns (`*_0_12`, `*_12_25`, etc.).

We are not guaranteed to have both of these pieces of information, so the interface should remain agnostic to whether these columns are present.

### The `metadata.json` file

In [None]:
# Read the reference project's metadata and show its top-level keys
metadata_path = list(ex_project.glob("metadata.json"))[0]
with open(metadata_path, 'r') as metadata_fp:
    metadata = json.load(metadata_fp)
list(metadata)

['description',
 'parameters_varied',
 'run_parameters',
 'all_regions',
 'all_timestamps',
 'all_fields',
 'field_statistics']

### The `sites.geojson`

In [None]:
# Read the reference project's site geometry and show its top-level keys
geojson_path = list(ex_project.glob("sites.geojson"))[0]
with open(geojson_path, 'r') as geojson_fp:
    geojson = json.load(geojson_fp)
list(geojson)

['type', 'crs', 'features']

Most importantly, this file contains the geometric coordinates of each site, identified by its `SSID`. For example:

In [None]:
# Geometry of the first site, keyed by its site ID
feat = geojson['features'][0]
site_id = feat['properties']['SSID']
{site_id: feat['geometry']}

{'CXB-224': {'type': 'Polygon',
  'coordinates': [[[92.15055569948606, 21.161191094393644],
    [92.15050138499646, 21.160151610964192],
    [92.1506616900773, 21.16019079353714],
    [92.15082697238086, 21.160253774929288],
    [92.15099270563735, 21.160237873404242],
    [92.15117416358822, 21.160145556170562],
    [92.1512789564838, 21.160091101856267],
    [92.15134841304786, 21.15997670814096],
    [92.15135912448403, 21.15988592568027],
    [92.15134187243575, 21.159783050000044],
    [92.15125685156255, 21.15957837701285],
    [92.15125700273262, 21.159578262560444],
    [92.15263972184573, 21.159540768291947],
    [92.15262349627744, 21.15897524850891],
    [92.15244662391268, 21.158449964394435],
    [92.15239224460646, 21.158155968823053],
    [92.15245904984533, 21.156501817904598],
    [92.15277507610898, 21.155825239942942],
    [92.15284403882151, 21.15525531258271],
    [92.15292946272467, 21.154467815837222],
    [92.15282185794251, 21.15393957565101],
    [92.152825836

## Implementing

From the above, the following features must be implemented:

- Pre-caching information from the parameter grid and summary runs into the `metadata.json` file
- Copying a sites.geojson file over, if it exists. Otherwise, using the default Cox's Bazar geojson information
- Adding the new project name to the `AVAILABLE_PROJECTS` file
- Adding new columns to the summary files and copying them over

### Creating the `metadata.json`

In [None]:
def jsonify_params(params):
    """Return a copy of `params` whose numpy values are replaced by JSON-friendly builtins.

    Parameter grids can hold numpy values (e.g. `np.bool_`, `np.int64`) that
    `json` refuses to serialize. Every numpy scalar is converted to the matching
    Python scalar and every numpy array to a (nested) list.

    Args:
        params: Mapping of parameter name to value for a single run.

    Returns:
        A new dict with the same keys. Values that are not numpy scalars or
        arrays are passed through unchanged.
    """
    out = {}
    for k, v in params.items():
        if isinstance(v, np.generic):
            # Covers every numpy scalar type, not only np.bool_
            out[k] = v.item()
        elif isinstance(v, np.ndarray):
            out[k] = v.tolist()
        else:
            out[k] = v
    return out

def parameter_grid_to_metadata(pickle_file: str, description: str=""):
    """Get the basic JSON information from a pkl file

    Args:
        pickle_file: The pickle file containing a description of every run in the
            project (an ordered collection with one parameter dict per run)
        description: A short string describing the project

    Returns:
        A dict with the keys `description`, `parameters_varied` (every parameter
        name that appears in any run, in first-seen order) and `run_parameters`
        (run index -> JSON-friendly parameters of that run).
    """
    # NOTE: unpickling can execute arbitrary code -- only load grids from a trusted source
    with open(pickle_file, 'rb') as fp:
        data = pickle.load(fp)

    # dict keys keep insertion order, so the list is identical on every run
    # (iterating a set of strings is not stable across interpreter sessions)
    parameters_varied = list(dict.fromkeys(k for d in data for k in d))

    # Runs are named by their position in the grid
    run_parameters = {run: jsonify_params(params) for run, params in enumerate(data)}

    json_info = {
        "description": description,
        "parameters_varied": parameters_varied,
        "run_parameters": run_parameters
    }

    return json_info

def collect_statistics(project: Union[str, Path]):
    """Summarize all `summary*.csv` files of a project for the metadata file.

    Args:
        project: Directory containing the summary CSV files of every run.

    Returns:
        A dict with the keys `all_regions`, `all_timestamps` and `all_fields`
        (each a sorted list) and `field_statistics`, which maps every numerical
        field to its `{"max": ..., "min": ...}` over all runs.

    Raises:
        ValueError: If `project` contains no `summary*.csv` file.
        AttributeError: If the summaries have no `region` column, or neither a
            `timestamp` nor a `time_stamp` column.
    """
    project = Path(project)
    csvfs = sorted(project.glob("summary*.csv"))
    if not csvfs:
        raise ValueError(f"No 'summary*.csv' files found in {project}")

    # The memory requirements of this may be large for very large projects
    big_df = pd.concat([pd.read_csv(csvf) for csvf in csvfs], ignore_index=True)

    all_regions = set(big_df.region)
    all_fields = [f for f in big_df.columns if f != "Unnamed: 0"]

    # The column is called `timestamp` in the client format and `time_stamp` in raw summaries
    timestamp_fields = ("timestamp", "time_stamp")
    timestamp_field = next((f for f in timestamp_fields if f in big_df.columns), None)
    if timestamp_field is None:
        raise AttributeError(
            f"Summary files in {project} have neither a 'timestamp' nor a 'time_stamp' column"
        )
    all_timestamps = set(big_df[timestamp_field])

    # Both spellings of the timestamp column hold text and must be kept out of the min/max
    string_fields = [*timestamp_fields, "region", "Unnamed: 0"]
    numerical_fields = [f for f in all_fields if f not in string_fields]
    big_df_num = big_df.loc[:, numerical_fields]
    max_vals = big_df_num.max(axis=0)
    min_vals = big_df_num.min(axis=0)

    df_minmax = pd.DataFrame([max_vals, min_vals], index=["max", "min"])
    field_minmaxes = df_minmax.to_dict(orient="dict")

    metadata = {
        "all_regions": sorted(all_regions),
        "all_timestamps": sorted(all_timestamps),
        "all_fields": sorted(all_fields),
        "field_statistics": field_minmaxes
    }

    return metadata

In [None]:
def save_dict_as_json(obj, fname):
    """Write `obj` to `fname` as JSON, replacing any existing file.

    The object is serialized before the file is opened, so an object that
    cannot be serialized leaves an existing file untouched instead of truncated.

    Args:
        obj: Any object `json` can serialize.
        fname: Destination path (`str` or `Path`).

    Raises:
        TypeError: If `obj` contains a value `json` cannot serialize.
    """
    serialized = json.dumps(obj)
    with open(fname, 'w') as fp:
        fp.write(serialized)

In [None]:
# Build both halves of the metadata, then merge them (statistics win on a key clash)
parameter_metadata = parameter_grid_to_metadata(
    pgrid_f, description="Simple test on learning centers, again"
)
statistic_metadata = collect_statistics(data_dir)
metadata = dict(parameter_metadata)
metadata.update(statistic_metadata)
metadata.keys()

dict_keys(['description', 'parameters_varied', 'run_parameters', 'all_regions', 'all_timestamps', 'all_fields', 'field_statistics'])

Now we just need to save the metadata as a json and we are all set

### Copying over geojson information

In [None]:
def init_geojson(project_dir, outdir, geojson_fname="sites.geojson"):
    """Place a `sites.geojson` in `outdir`, preferring the one provided with the project.

    Search `project_dir` for a file called `geojson_fname`. If it exists, copy it
    over. Otherwise copy the default geojson (`DEFAULT_GEOJSON`).

    Args:
        project_dir: Directory searched for `geojson_fname` (`str` or `Path`).
        outdir: Destination directory (`str` or `Path`); created, including
            parents, when it does not exist yet.
        geojson_fname: Name of the geojson file to look for in `project_dir`.
            The copy in `outdir` is always called `sites.geojson`.
    """
    project_dir, outdir = Path(project_dir), Path(outdir)
    geojson_f = project_dir / geojson_fname

    if geojson_f.exists():
        print(f"Using provided {geojson_fname} file")
    else:
        print(f"Using default geojson in {DEFAULT_GEOJSON}")
        geojson_f = DEFAULT_GEOJSON

    # exist_ok avoids the check-then-create race of `if not exists: mkdir`
    outdir.mkdir(parents=True, exist_ok=True)
    shutil.copy(str(geojson_f), str(outdir / "sites.geojson"))

In [None]:
init_geojson(data_dir, out_dir)

Using default geojson in /Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/coxs_bazar.geojson


### Adding to list of available projects

The client will serve any project directory listed as a line in the `AVAILABLE_PROJECTS` file. See `main` for how this is added

### Creating the new summary files

In [None]:
def read_location_as_df(loc_file):
    """Load a `locations_XXX.csv` file as a wide table with one row per timestamp.

    Args:
        loc_file: Path of a CSV holding (at least) the columns `timestamp` and
            `location_specs`.

    Returns:
        A DataFrame indexed by `timestamp` with one `n_infections_in_<location>`
        column per distinct `location_specs` value. Timestamp/location
        combinations missing from the file are filled with 0.
    """
    raw = pd.read_csv(loc_file)
    raw = raw.astype(np.uint64, errors='ignore')
    raw = raw.fillna(value=0)

    wide = raw.pivot(index="timestamp", columns=["location_specs"])
    wide = wide.fillna(value=0)
    wide = wide.astype(np.uint32, errors='ignore')

    # Pivoted columns are labelled (value column, location); name each one after its location
    wide.columns = [f"n_infections_in_{label[1]}" for label in wide.columns]
    return wide

def read_summary_as_df(sum_file):
    """Load a `summary_XXX.csv` file, casting its data to unsigned 64-bit integers where possible.

    Args:
        sum_file: Path of the summary CSV to read.

    Returns:
        The file's contents as a DataFrame.
    """
    raw_summary = pd.read_csv(sum_file)
    # errors='ignore' leaves data that cannot be cast (e.g. text) as it was read
    return raw_summary.astype(np.uint64, errors='ignore')

In [None]:
summary_df_ex.head()

Unnamed: 0,time_stamp,region,current_infected,daily_infected,current_recovered,daily_recovered,current_hospitalised,daily_hospitalised,current_intensive_care,daily_intensive_care,current_susceptible,daily_hospital_deaths,daily_deaths
0,2020-05-24,CXB-219,3,0,0,0,0,0,0,0,22950,0,0
1,2020-05-24,CXB-203,3,0,0,0,0,0,0,0,25649,0,0
2,2020-05-24,CXB-216,0,0,0,0,0,0,0,0,6772,0,0
3,2020-05-24,CXB-209,4,0,0,0,0,0,0,0,24418,0,0
4,2020-05-24,CXB-217,4,0,0,0,0,0,0,0,29655,0,0


In [None]:
pivot_locations.head()

Unnamed: 0_level_0,n_infections_in_communal,n_infections_in_distribution_center,n_infections_in_e_voucher,n_infections_in_hospital,n_infections_in_learning_center,n_infections_in_play_group,n_infections_in_pump_latrine,n_infections_in_religious,n_infections_in_shelter
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-05-24,0,0,0,0,0,1,0,0,23
2020-05-25,3,0,0,0,4,1,2,0,38
2020-05-26,1,0,0,0,4,1,4,0,61
2020-05-27,7,0,0,0,11,1,2,0,72
2020-05-28,2,0,0,0,6,1,1,1,74


In [None]:
def copy_summary_files(project_dir, out_dir):
    """Copy the summary files with location information to the output directory

    Every `summary_<run>.csv` in `project_dir` is read with `read_summary_as_df`
    and written under the same name to `out_dir`.

    Args:
        project_dir: Directory holding the simulation output (`str` or `Path`).
        out_dir: Directory the processed summaries are written to (`str` or
            `Path`); created, including parents, when it does not exist yet.
    """
    project_dir, out_dir = Path(project_dir), Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so runs are processed (and logged) in the same order on every machine
    for sum_file in sorted(project_dir.glob("summary_*.csv")):
        print(f"Begin processing {sum_file}")
        sum_df = read_summary_as_df(sum_file)

        # Everything after the first underscore, so run ids may contain underscores themselves
        run_id = sum_file.stem.split("_", 1)[1]
        loc_file = project_dir / f"locations_{run_id}.csv"
        if loc_file.exists():
            loc_df = read_location_as_df(loc_file)
            # Important location assumptions not included in location format
            # NOTE(review): every column returned by `read_location_as_df` is named
            # `n_infections_in_*`, so this condition is never true and the location
            # counts are currently NOT merged into the summaries -- confirm intent
            if "region" in set(loc_df.columns):
                # Combine the columns
                sum_df = sum_df.join(loc_df, on="time_stamp").fillna(0).astype(np.uint64, errors='ignore')

        out_file = out_dir / sum_file.name
        print(f"Saving to {out_file}")
        sum_df.to_csv(str(out_file), index=False)
        # No age information provided in the new template

In [None]:
# summary_files = list(data_dir.glob("summary_*.csv"))

# for sum_file in data_dir.glob("summary_*.csv"):
#     run_id = d.stem.split("_")[1]
#     loc_file = data_dir / f"locations_{run_id}.csv"
#     sum_df = read_summary_as_df(sum_file)
#     loc_df = read_location_as_df(loc_file)
    
#     # Combine the columns
#     combined_df = sum_df.join(loc_df, on="time_stamp").fillna(0).astype(np.uint64, errors='ignore')

In [None]:
len(set(summary_df_ex.time_stamp))

200

In [None]:
summary_df_ex.sort_values(by=["region", "time_stamp"])

Unnamed: 0,time_stamp,region,current_infected,daily_infected,current_recovered,daily_recovered,current_hospitalised,daily_hospitalised,current_intensive_care,daily_intensive_care,current_susceptible,daily_hospital_deaths,daily_deaths
6,2020-05-24,CXB-201,5,0,0,0,0,0,0,0,37893,0,0
27,2020-05-24,CXB-201,5,0,0,0,0,0,0,0,37893,0,0
48,2020-05-24,CXB-201,5,0,0,0,0,0,0,0,37893,0,0
69,2020-05-24,CXB-201,5,0,0,0,0,0,0,0,37893,0,0
90,2020-05-24,CXB-201,6,1,0,0,0,0,0,0,37892,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25108,2020-12-09,CXB-232,0,0,3911,0,0,0,0,0,392,0,0
25129,2020-12-09,CXB-232,0,0,3911,0,0,0,0,0,392,0,0
25150,2020-12-09,CXB-232,0,0,3911,0,0,0,0,0,392,0,0
25171,2020-12-09,CXB-232,0,0,3911,0,0,0,0,0,392,0,0


In [None]:
# Load the reference summary here so this cell does not rely on a later cell having been run first
ref_summary_csv = PROJECTS/"learning_centers"/"summary_001.csv"
ref_summary_df = pd.read_csv(ref_summary_csv)
ref_summary_df.head()

Unnamed: 0,region,timestamp,n_infections_in_communal,n_infections_in_distribution_center,n_infections_in_e_voucher,n_infections_in_play_group,n_infections_in_pump_latrine,n_infections_in_religious,n_infections_in_shelter,infected,...,recovered_12_25,recovered_25_65,recovered_65_101,currently_susceptible_0_12,currently_susceptible_12_25,currently_susceptible_25_65,currently_susceptible_65_101,currently_susceptible,currently_dead,currently_recovered
0,CXB-201,2020-05-01,0,0,0,0,0,0,0,0,...,0,0,0,14084,7925,15761,127,37897,0,0
1,CXB-201,2020-05-02,0,0,0,0,0,0,0,0,...,0,0,0,14084,7925,15761,127,37897,0,0
2,CXB-201,2020-05-03,0,0,0,0,0,0,0,0,...,0,0,0,14084,7925,15761,127,37897,0,0
3,CXB-201,2020-05-04,0,0,0,0,0,0,0,0,...,0,0,0,14084,7925,15761,127,37897,0,0
4,CXB-201,2020-05-05,0,0,0,0,0,0,0,0,...,0,0,0,14084,7925,15761,127,37897,0,0


In [None]:
ref_summary_csv = PROJECTS/"learning_centers"/"summary_001.csv"
ref_summary_df = pd.read_csv(ref_summary_csv)

In [None]:
len(set(ref_summary_df.timestamp))

245

# Script

In [None]:
def _read_available_projects():
    """Return the project names listed in `AVAILABLE_PROJECTS`, in file order, without blanks or duplicates."""
    if not AVAILABLE_PROJECTS.exists():
        return []
    names = (line.strip() for line in AVAILABLE_PROJECTS.read_text().splitlines())
    return list(dict.fromkeys(n for n in names if n))


def _build_metadata(data_d, outdir, description):
    """Assemble the contents of `metadata.json` from the parameter grid and the processed summaries."""
    metadata = {"description": description}

    pgrid_files = sorted(data_d.glob("parameter_grid*"))
    if pgrid_files:
        metadata.update(parameter_grid_to_metadata(pgrid_files[0], description))
    else:
        print(f"No parameter grid found in {data_d}; metadata will not describe the runs")

    # Statistics are computed on the files the client will actually read
    if any(outdir.glob("summary*.csv")):
        metadata.update(collect_statistics(outdir))
    else:
        print(f"No summary files found in {outdir}; metadata will not contain statistics")

    return metadata


def main(data_dir, name=None, force=False, description=""):
    """Convert the raw output of a simulation project into a project folder for the client.

    Creates `PROJECTS/<project name>` with `sites.geojson`, the processed
    `summary_*.csv` files and `metadata.json`, then lists the project in
    `AVAILABLE_PROJECTS`. The project is registered last, so a run that fails
    half-way does not add it to the list.

    Args:
        data_dir: Directory holding the simulation output.
        name: Name of the new project. Defaults to the stem of `data_dir`.
        force: Replace an already registered project of the same name instead
            of raising.
        description: Short description stored in `metadata.json`.

    Raises:
        ValueError: If the project is already registered and `force` is false.
    """
    data_d = Path(data_dir)
    project_name = data_d.stem if name is None else name
    outdir = PROJECTS / project_name
    print(f"Creating data for new project: '{project_name}'")

    registered = _read_available_projects()
    if project_name in registered:
        if not force:
            raise ValueError(f"Cannot create project of name '{project_name}': Project already exists in {AVAILABLE_PROJECTS}")
        if outdir.exists():
            shutil.rmtree(outdir) # Delete existing project of that name
        registered.remove(project_name)

    init_geojson(data_d, outdir)  # Also creates `outdir`
    copy_summary_files(data_d, outdir)
    save_dict_as_json(_build_metadata(data_d, outdir, description), outdir / "metadata.json")

    print(f"Adding '{project_name}' to {AVAILABLE_PROJECTS}")
    AVAILABLE_PROJECTS.parent.mkdir(parents=True, exist_ok=True)
    # Rewrite the whole list: one name per line, original order kept, no blank entries
    AVAILABLE_PROJECTS.write_text("\n".join(registered + [project_name]))
    print("COMPLETE")

In [None]:
# Build the example project, replacing any previous project of that name
main(data_dir, name="learning_center_simple_2", force=True)

Creating data for new project: 'learning_center_simple_2'
Using default geojson in /Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/coxs_bazar.geojson
Adding 'learning_center_simple_2' to /Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/availableProjects.txt
Begin processing /Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/summary_006.csv
Saving to /Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_center_simple_2/summary_006.csv
Begin processing /Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/summary_007.csv
Saving to /Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_center_simple_2/summary_007.csv
Begin processing /Users/benjamin.hoover@ibm.com/Downloads/learning_centers_simple/summary_005.csv
Saving to /Users/benjamin.hoover@ibm.com/Projects/camp-dashboard/public/demo/projects/learning_center_simple_2/summary_005.csv
Begin processing /Users/benjam