### Update glossary.csv file using data from schemas.yml

This script assumes you have a "new" schema obtained from running the `cea trace-inputlocator` script.

In [421]:
import cea.scripts
import cea.inputlocator
import cea.config
import cea.glossary
from cea.tests.trace_inputlocator import get_csv_schema
import os
import yaml
import json
from itertools import repeat

In [422]:
schemas = cea.scripts.schemas()
glossary_df = cea.glossary.read_glossary_df()
locators = schemas.keys()

def save_glossary(glossary_df):
    glossary_df = glossary_df.sort_values(["SCRIPT", "LOCATOR_METHOD", "FILE_NAME", "VARIABLE"])
    glossary_df.to_csv(os.path.join(os.path.dirname(cea.glossary.__file__), 'glossary.csv'),
                  columns=["SCRIPT", "LOCATOR_METHOD", "FILE_NAME", "VARIABLE", "DESCRIPTION", "UNIT", "VALUES", "TYPE", "COLOR"],
                  index=False)
    print("saved new glossary.csv - reloading")
    glossary_df = cea.glossary.read_glossary_df()
    return glossary_df
glossary_df = save_glossary(glossary_df)

saved new glossary.csv - reloading


### start by finding all entries in schemas.yml without a schema

the following three locator methods need "special" treatment:
- get_optimization_checkpoint
  - "special" schema
- get_optimization_disconnected_cooling_capacity
  - only present in projects with cooling network
- get_optimization_connected_cooling_capacity
  - only present in projects with cooling network
  
this code assumes you have a "reference-case-cooling/baseline" in your projectroot and have run the optimization on that (e.g. run `cea workflow --workflow district-cooling-system`)

In [423]:
config = cea.config.Configuration()
config.scenario = os.path.join(config.project, "..", "reference-case-cooling", "baseline")
locator = cea.inputlocator.InputLocator(scenario=config.scenario)

# load get_optimization_checkpoint schema
if not schemas["get_optimization_checkpoint"]["schema"]:
    with open(locator.get_optimization_checkpoint(1), 'r') as fp:
        get_optimization_checkpoint = json.load(fp)
    schemas["get_optimization_checkpoint"]["schema"] = {
        str(key): {"sample_data": get_optimization_checkpoint[key],
                   "types_found": None}
        for key in get_optimization_checkpoint.keys()
    }

# load get_optimization_disconnected_cooling_capacity schema
if not schemas["get_optimization_disconnected_cooling_capacity"]["schema"]:
    schemas["get_optimization_disconnected_cooling_capacity"]["schema"] = get_csv_schema(
        locator.get_optimization_disconnected_cooling_capacity(1, 1))
    
# load get_optimization_connected_cooling_capacity schema
if not schemas["get_optimization_connected_cooling_capacity"]["schema"]:
    schemas["get_optimization_connected_cooling_capacity"]["schema"] = get_csv_schema(
        locator.get_optimization_disconnected_cooling_capacity(1, 1))

In [424]:
# each locator method needs a "schema" entry (this should not output anything)
for lm in locators:
    if not "schema" in schemas[lm]:
        print lm

In [425]:
# the "schema" entry should not be `None` (this should not output anything)
for lm in locators:
    if not schemas[lm]["schema"]:
        print lm

if any of the above produce printed output, update schemas.yml and re-run the notebook

### make sure the "used-by" and "created-by" lists don't contain duplicates

In [426]:
for lm in locators:
    if not "used_by" in schemas[lm]:
        print lm

In [427]:
for lm in locators:
    if not "created_by" in schemas[lm]:
        print lm

each locator should have a "used_by" and a "created_by" - let's assume they're all lists

In [428]:
for lm in locators:
    schemas[lm]["used_by"] = sorted(set(schemas[lm]["used_by"]))
    schemas[lm]["created_by"] = sorted(set(schemas[lm]["created_by"]))

In [429]:
# save it back
schemas_yml = os.path.join(os.path.dirname(cea.scripts.__file__), 'schemas.yml')
print "saving to:", schemas_yml
with open(schemas_yml, 'w') as fp:
    yaml.dump(schemas, fp)
schemas = cea.scripts.schemas()

saving to: c:\users\darthoma\documents\github\cityenergyanalyst\cea\schemas.yml


### find all schema entries that are not in glossary.csv

In [430]:
# first: what are the missing locator methods?
glossary_lms = set(glossary_df.LOCATOR_METHOD.values)
schemas_lms = set(schemas.keys())
missing_lms = sorted(schemas_lms - glossary_lms)
print '\n'.join(missing_lms)




for each of those missing locator methods in glossary.csv, we need to append entries for each of the fields of that file. some of those files are special (the optimization checkpoints comes to mind). each glossary.csv entry has the following fields:

- SCRIPT (use first "created_by" or "-", if input file)
- LOCATOR_METHOD
- FILE_NAME (get from schemas.yml file_path)
- VARIABLE (this is the field name)
- DESCRIPTION (use "TODO")
- UNIT (use "TODO")
- VALUES (use "TODO")
- TYPE (use the first from schemas.types_found)
- COLOR (use "black") - I'm not really sure we need this at all in glossary.csv?

In [431]:
def extract_glossary_row(script, lm, file_name, variable, variable_entry):
    if "types_found" in variable_entry:
        var_type = variable_entry["types_found"][0] if variable_entry["types_found"] else "TODO"
    else:
        var_type = "TODO"
    return {
        "key": "{lm}!!!{variable}".format(**locals()),
        "SCRIPT": script,
        "LOCATOR_METHOD": lm,
        "FILE_NAME": file_name,
        "VARIABLE": variable,
        "DESCRIPTION": "TODO",
        "UNIT": "TODO",
        "VALUES": "TODO",
        "TYPE": var_type,
        "COLOR": "black"}

for lm in missing_lms:
    print("processing lm:", lm)
    script = schemas[lm]["created_by"][0] if len(schemas[lm]["created_by"]) else "-"
    file_name = schemas[lm]["file_path"]
    file_type = schemas[lm]["file_type"]
    if file_type in {"xls", "xlsx"}:
        for worksheet in schemas[lm]["schema"].keys():
            for variable in schemas[lm]["schema"][worksheet].keys():
                variable_entry = schemas[lm]["schema"][worksheet][variable]
                ws_file_name = "{file_name}:{worksheet}".format(**locals())
                row = extract_glossary_row(script, lm, ws_file_name, variable, variable_entry)
                glossary_df = glossary_df.append(row, ignore_index=True)
    else:
        for variable in schemas[lm]["schema"].keys():
            row = extract_glossary_row(script, lm, file_name, variable, schemas[lm]["schema"][variable])
            glossary_df = glossary_df.append(row, ignore_index=True)
        
glossary_df = save_glossary(glossary_df)

saved new glossary.csv - reloading


### find all glossary entries that are not in schemas.yml

In [432]:
# find all filenames for excel files that don't fit the convention (file_name, ":", worksheet)
# (clean glossary_df until this doesn't output anything)
for _, row in glossary_df.iterrows():
    # we know the locator method is in here from the previous cell ;)
    lm = row["LOCATOR_METHOD"]
    file_type = schemas[lm]["file_type"]
    if file_type in {"xls", "xlsx"}:
        if not ":" in row["FILE_NAME"]:
            print "BAD FILE_NAME:", row["FILE_NAME"]

In [433]:
# find all locator methods not present in schemas.yml
invalid_lms = []  # stuff left over from previous versions of cea
for _, row in glossary_df.iterrows():
    lm = row["LOCATOR_METHOD"]
    if lm not in schemas:
        invalid_lms.append(lm)

for lm in invalid_lms:
    print "invalid:", lm
    glossary_df = glossary_df[glossary_df["LOCATOR_METHOD"] != lm]

glossary_df = save_glossary(glossary_df)

saved new glossary.csv - reloading


In [434]:
# find all variables not present in schemas.yml
# NOTE: treat xls & xlsx files differently
glossary_df = cea.glossary.read_glossary_df()
invlaid_vars = [] # list of rows that are not valid anymore
for _, row in glossary_df.iterrows():
    # we know the locator method is in here from the previous cell ;)
    lm = row["LOCATOR_METHOD"]
    var = row["VARIABLE"]
    file_type = schemas[lm]["file_type"]
    if file_type in {"xls", "xlsx"}:
        worksheet = row["FILE_NAME"].split(":")[1]
        if not var in schemas[lm]["schema"][worksheet]:
            print "invalid VARIABLE: {lm}/{worksheet}/{var}".format(**locals())
    else:
        if not var in schemas[lm]["schema"]:
            print "invalid VARIABLE: {lm}/{var}".format(**locals())
            
glossary_df = save_glossary(glossary_df)

invalid VARIABLE: SC_results/SC_roofs_top_Q_kWh
invalid VARIABLE: SC_results/SC_roofs_top_m2
invalid VARIABLE: SC_results/SC_walls_east_Q_kWh
invalid VARIABLE: SC_results/SC_walls_east_m2
invalid VARIABLE: SC_results/SC_walls_north_Q_kWh
invalid VARIABLE: SC_results/SC_walls_north_m2
invalid VARIABLE: SC_results/SC_walls_south_Q_kWh
invalid VARIABLE: SC_results/SC_walls_south_m2
invalid VARIABLE: SC_results/SC_walls_west_Q_kWh
invalid VARIABLE: SC_results/SC_walls_west_m2
invalid VARIABLE: SC_total_buildings/SC_roofs_top_Q_kWh
invalid VARIABLE: SC_total_buildings/SC_roofs_top_m2
invalid VARIABLE: SC_total_buildings/SC_walls_east_Q_kWh
invalid VARIABLE: SC_total_buildings/SC_walls_east_m2
invalid VARIABLE: SC_total_buildings/SC_walls_north_Q_kWh
invalid VARIABLE: SC_total_buildings/SC_walls_north_m2
invalid VARIABLE: SC_total_buildings/SC_walls_south_Q_kWh
invalid VARIABLE: SC_total_buildings/SC_walls_south_m2
invalid VARIABLE: SC_total_buildings/SC_walls_west_Q_kWh
invalid VARIABLE: SC

**NOTE:** for now,there are still some invalid variables in `SC_results` and `SC_total_buildings` that we'll just gloss over (pun intended). 

### check to make sure all variables in schemas.yml are present in glossary.csv

In [439]:
glossary_df = cea.glossary.read_glossary_df()

# TODO: come up with a solution for these...
EXCLUDE_LOCATOR_METHODS = {"SC_totals", "SC_results", "SC_total_buildings"}

for lm in schemas.keys():
    if lm in EXCLUDE_LOCATOR_METHODS:
        continue
    schema = schemas[lm]["schema"]
    file_type = schemas[lm]["file_type"]
    file_name = schemas[lm]["file_path"]
    script = schemas[lm]["created_by"][0] if schemas[lm]["created_by"] else "-"
    if file_type in {"xls", "xlsx"}:
        for worksheet, schema in schemas[lm]["schema"].items():
            for var in schema.keys():
                matches = glossary_df[(glossary_df["LOCATOR_METHOD"] == lm)
                                       & (glossary_df["VARIABLE"] == var)
                                       & (glossary_df["FILE_NAME"] == glossary_df["FILE_NAME"] + ":" + worksheet)].values
                if len(matches) != 1:
                    glossary_df = glossary_df.append({
                        "SCRIPT": script,
                        "LOCATOR_METHOD": lm,
                        "FILE_NAME": "{file_name}:{worksheet}".format(**locals()),
                        "VARIABLE": var,
                        "DESCRIPTION": "TODO",
                        "UNIT": "TODO",
                        "VALUES": "TODO",
                        "TYPE": "TYPE",
                        "COLOR": "black",
                    }, ignore_index=True)
                    print("ADDED: {lm}/{worksheet}/{var} ({matches})".format(**locals()))
    else:
        for var in schema.keys():
            # check if it exists in `glossary.csv`
            matches = glossary_df[(glossary_df["LOCATOR_METHOD"] == lm) & (glossary_df["VARIABLE"] == var)]["VARIABLE"].values
            if len(matches) != 1:
                glossary_df = glossary_df.append({
                    "SCRIPT": script,
                    "LOCATOR_METHOD": lm,
                    "FILE_NAME": file_name,
                    "VARIABLE": var,
                    "DESCRIPTION": "TODO",
                    "UNIT": "TODO",
                    "VALUES": "TODO",
                    "TYPE": "TYPE",
                    "COLOR": "black",
                }, ignore_index=True)
                print("ADDED: {lm}/{var} ({matches})".format(**locals()))
                
glossary_df = save_glossary(glossary_df)

ADDED: get_archetypes_properties/INDOOR_COMFORT/RH_max_pc ([])
ADDED: get_archetypes_properties/INDOOR_COMFORT/Code ([])
ADDED: get_archetypes_properties/INDOOR_COMFORT/Tcs_set_C ([])
ADDED: get_archetypes_properties/INDOOR_COMFORT/Ths_setb_C ([])
ADDED: get_archetypes_properties/INDOOR_COMFORT/Ths_set_C ([])
ADDED: get_archetypes_properties/INDOOR_COMFORT/Tcs_setb_C ([])
ADDED: get_archetypes_properties/INDOOR_COMFORT/Ve_lpspax ([])
ADDED: get_archetypes_properties/INDOOR_COMFORT/RH_min_pc ([])
ADDED: get_archetypes_properties/INTERNAL_LOADS/El_Wm2 ([])
ADDED: get_archetypes_properties/INTERNAL_LOADS/Code ([])
ADDED: get_archetypes_properties/INTERNAL_LOADS/Qhpro_Wm2 ([])
ADDED: get_archetypes_properties/INTERNAL_LOADS/Occ_m2pax ([])
ADDED: get_archetypes_properties/INTERNAL_LOADS/Ed_Wm2 ([])
ADDED: get_archetypes_properties/INTERNAL_LOADS/Ea_Wm2 ([])
ADDED: get_archetypes_properties/INTERNAL_LOADS/Qcre_Wm2 ([])
ADDED: get_archetypes_properties/INTERNAL_LOADS/Epro_Wm2 ([])
ADDED: get_

ADDED: get_database_supply_systems/BH/cap_min ([])
ADDED: get_database_supply_systems/BH/LT_yr ([])
ADDED: get_database_supply_systems/BH/IR_% ([])
ADDED: get_database_supply_systems/BH/currency ([])
ADDED: get_database_supply_systems/BH/code ([])
ADDED: get_database_supply_systems/BH/assumption ([])
ADDED: get_database_supply_systems/BH/e ([])
ADDED: get_database_supply_systems/BH/unit ([])
ADDED: get_database_supply_systems/BH/d ([])
ADDED: get_database_supply_systems/ALL_IN_ONE_SYSTEMS/efficiency ([])
ADDED: get_database_supply_systems/ALL_IN_ONE_SYSTEMS/code ([])
ADDED: get_database_supply_systems/ALL_IN_ONE_SYSTEMS/Description ([])
ADDED: get_database_supply_systems/ALL_IN_ONE_SYSTEMS/reference ([])
ADDED: get_database_supply_systems/ALL_IN_ONE_SYSTEMS/feedstock ([])
ADDED: get_database_supply_systems/ALL_IN_ONE_SYSTEMS/system ([])
ADDED: get_database_supply_systems/ALL_IN_ONE_SYSTEMS/scale ([])
ADDED: get_database_supply_systems/Pump/a ([])
ADDED: get_database_supply_systems/Pump

ADDED: get_database_supply_systems/SC/mB_min_r ([])
ADDED: get_database_supply_systems/SC/cap_max ([])
ADDED: get_database_supply_systems/SC/C_eff ([])
ADDED: get_database_supply_systems/SC/t_max ([])
ADDED: get_database_supply_systems/SC/unit ([])
ADDED: get_database_supply_systems/SC/Description ([])
ADDED: get_database_supply_systems/SC/assumption ([])
ADDED: get_database_supply_systems/SC/type ([])
ADDED: get_database_supply_systems/SC/dP4 ([])
ADDED: get_database_supply_systems/SC/dP3 ([])
ADDED: get_database_supply_systems/SC/dP2 ([])
ADDED: get_database_supply_systems/SC/dP1 ([])
ADDED: get_database_supply_systems/SC/module_area_m2 ([])
ADDED: get_database_supply_systems/SC/cap_min ([])
ADDED: get_database_supply_systems/SC/IAM_d ([])
ADDED: get_database_supply_systems/SC/c2 ([])
ADDED: get_database_supply_systems/SC/c1 ([])
ADDED: get_database_supply_systems/SC/a ([])
ADDED: get_database_supply_systems/SC/c ([])
ADDED: get_database_supply_systems/SC/b ([])
ADDED: get_database_s

### make sure glossary.csv (locator_method, variable) is unique

### clean the sample_data (make longs into ints) 