In [1]:
import os
import json

from cleanair.instance import ValidationInstance
from cleanair.instance import InstanceQuery

In [2]:
# path to the directory that holds your secrets
secrets_dir = "../../terraform/.secrets/"

# load the parser config; the JSON file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default text encoding
with open(os.path.join(secrets_dir, "config.json"), "r", encoding="utf-8") as filepath:
    parser_config = json.load(filepath)

# setup your filepaths
# NOTE(review): data_dir is read from the "config_dir" key - confirm this is intended
data_dir = parser_config["config_dir"]
results_dir = parser_config["results_dir"]
# JSON file with the database secrets, passed to the DB queries as `secretfile`
secretfile = os.path.join(secrets_dir, "db_secrets.json")
    


## Loading instances from DB

Given only the `instance_id` of an instance, you can load the data, model parameters and results of a model run that somebody else executed.

To list all available instances, execute the following query.

In [3]:
# build the instance query from the DB secrets file and fetch every instance
iq = InstanceQuery(secretfile=secretfile)
instance_df = iq.get_all_instances()

# preview up to 3 random rows; the sample size is capped at the number of rows
# so a small table does not raise, and the seed is fixed so that
# "Restart & Run All" always shows the same rows
instance_df.sample(n=min(3, len(instance_df)), random_state=0)

2020-03-18 17:04:48     INFO: Database connection information loaded from None


Unnamed: 0,instance_id,model_name,tag,param_id,data_id,git_hash,fit_start_time,cluster_id
11,341494535a34786af9b8368f92a4efdd05b85e84a1e2f9...,svgp,test,16bfd462ce906f099e346c69e08318e0598e5270ac0fb6...,e77e8ca79c8928bb37ed6fe6295e60c7577d8188775c6b...,e1d315d1f0b26621c1157ab18076a123e9182b0b,2020-03-17 14:19:29.616079,patrick_laptop
7,2a1d2d2fa5c0869f6474e945efafd37a30e3b52998f67d...,svgp,validation,16bfd462ce906f099e346c69e08318e0598e5270ac0fb6...,ed8c582190037215743fee222532017ae8f5d87f6a1cd6...,dd3544244f22d569abd445f10522302f7f1b6a32,2020-03-16 15:31:48.992177,patrick_laptop
0,01691e714314345e889a4271680930788201080bb3320c...,mr_dgp,validation,16bfd462ce906f099e346c69e08318e0598e5270ac0fb6...,196cef9b97e3c60b89c1bed13f380e106634f0c2818942...,ec577aaa25590274521c9d3581cdcc8824cefc1f,2020-03-10 10:21:59.691353,patrick_laptop


In [4]:
# we can now filter instances by e.g. tag, model_name, fit_start_time, etc.
# here: keep only the "mr_dgp" instances that carry the "validation" tag
dgp_df = instance_df.loc[
    (instance_df.model_name == "mr_dgp") & (instance_df.tag == "validation")
]

# preview up to 3 random rows; the filter may leave fewer than 3 rows, so the
# sample size is capped, and the seed is fixed to keep the output reproducible
dgp_df.sample(n=min(3, len(dgp_df)), random_state=0)

Unnamed: 0,instance_id,model_name,tag,param_id,data_id,git_hash,fit_start_time,cluster_id
4,7886078cddd4fe9a50c6f9e44d25426b2cf29776c6796e...,mr_dgp,validation,16bfd462ce906f099e346c69e08318e0598e5270ac0fb6...,196cef9b97e3c60b89c1bed13f380e106634f0c2818942...,a6fba4359c58c24b0f59620cde57498ec1283fc8,2020-03-13 15:19:18.957921,patrick_laptop
8,b3ef88af90f6cbd9fae66d4d76b4988d21d75a5b2111f5...,mr_dgp,validation,16bfd462ce906f099e346c69e08318e0598e5270ac0fb6...,ed8c582190037215743fee222532017ae8f5d87f6a1cd6...,cc5e139b448e5a2ef1519a0fdec70592bf018b8f,2020-03-16 17:11:13.440396,patrick_laptop
5,9896e39160ef419f0af458badee0c9133ecc6350743238...,mr_dgp,validation,16bfd462ce906f099e346c69e08318e0598e5270ac0fb6...,196cef9b97e3c60b89c1bed13f380e106634f0c2818942...,ef6314f19ef7258714913c5a784e88b9ca4838f1,2020-03-16 09:32:47.551807,patrick_laptop


In [5]:
# pick the first filtered instance and show its metadata
instance_row = dgp_df.iloc[0]
print(instance_row)

# rebuild the instance (data, model params and results) from its id alone.
# NOTE(review): in the recorded run this call raised a ValueError because the
# data id stored in the DB did not match the hash of the loaded data config.
instance = ValidationInstance.instance_from_id(
    experiment_config=parser_config,
    instance_id=instance_row["instance_id"],
)

instance_id       01691e714314345e889a4271680930788201080bb3320c...
model_name                                                   mr_dgp
tag                                                      validation
param_id          16bfd462ce906f099e346c69e08318e0598e5270ac0fb6...
data_id           196cef9b97e3c60b89c1bed13f380e106634f0c2818942...
git_hash                   ec577aaa25590274521c9d3581cdcc8824cefc1f
fit_start_time                           2020-03-10 10:21:59.691353
cluster_id                                           patrick_laptop
Name: 0, dtype: object


2020-03-18 17:04:54     INFO: Database connection information loaded from None
2020-03-18 17:04:57     INFO: Load data config from database.
2020-03-18 17:04:57     INFO: Load model params from database
2020-03-18 17:04:57     INFO: Database connection information loaded from None
2020-03-18 17:05:00     INFO: Tag is validation
2020-03-18 17:05:00     INFO: Model name is mr_dgp
2020-03-18 17:05:00     INFO: Param id is bda7364508b181d6743cefb2dafb6aa7c4f5dfb6249c964518541c3f36df739c
2020-03-18 17:05:00     INFO: Data id is 4e22eb8062087ab22d163adc33060adca8fff5fb7cc3068f93991eef6450dd06
2020-03-18 17:05:00     INFO: Instance id is 7422575009982a23c52be8e2c52fd26b25b2b495f1f5d6c3a8715b91d326272a
2020-03-18 17:05:00     INFO: Cluster id is patrick_laptop


ValueError: Data id and hashed data config do not match. Data id from DB is 196cef9b97e3c60b89c1bed13f380e106634f0c28189423da714670f9b28e314 . Hashed data config from DB is 4e22eb8062087ab22d163adc33060adca8fff5fb7cc3068f93991eef6450dd06 . Data config is: {
    "train_start_date": "2020-01-28T00:00:00",
    "train_end_date": "2020-01-30T00:00:00",
    "pred_start_date": "2020-01-30T00:00:00",
    "pred_end_date": "2020-02-01T00:00:00",
    "include_satellite": true,
    "include_prediction_y": true,
    "train_sources": [
        "laqn"
    ],
    "pred_sources": [
        "laqn"
    ],
    "train_interest_points": "all",
    "train_satellite_interest_points": "all",
    "pred_interest_points": "all",
    "species": [
        "NO2"
    ],
    "features": [
        "value_1000_total_a_road_length",
        "value_500_total_a_road_length",
        "value_500_total_a_road_primary_length",
        "value_500_total_b_road_length"
    ],
    "norm_by": "laqn",
    "tag": "validation"
}