# JSONPath proof-of-concept

exploring a potential JSONPath-like interface for retrieving rubicon logs

### filesystem directory structure

```
|- project_a
|  |- artifacts
|  |  |- id_a
|  |  |  |- data
|  |  |  |_ metadata.json
|  |  |_ ...
|  |- dataframes
|  |  |- id_a
|  |  |  |- data
|  |  |  |_ metadata.json
|  |  |_ ...
|  |- experiments
|     |- id_a
|     |  |- artifacts
|     |  |  |- id_a
|     |  |  |  |- data
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |- dataframes
|     |  |  |- id_a
|     |  |  |  |- data
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |- features
|     |  |  |- name_a
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |- metrics
|     |  |  |- name_a
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |- parameters
|     |  |  |- name_a
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |_ metadata.json
|     |- id_b
|     |  |_ ...
|     |_ ...
|- project_b
|  |_ ...
|_ ...
```

In [1]:
from rubicon_ml import Rubicon

# Open the filesystem-backed rubicon repository and load the project to query.
rb = Rubicon(persistence="filesystem", root_dir="../quick-look/rubicon-root")
pr = rb.get_project(name="classifying penguins")

# Root of the JSON-like document. The domain object's attributes are
# shallow-copied so that keys added to this document later on are not written
# back onto the project's live domain object as new attributes.
project_json = {"project": [dict(pr._domain.__dict__)]}
project_json

{'project': [{'name': 'classifying penguins',
   'id': 'c9674165-07bb-439f-b5c4-ea85d5bb3729',
   'description': None,
   'github_url': 'https://github.com/capitalone/rubicon-ml.git',
   'training_metadata': None,
   'created_at': datetime.datetime(2022, 9, 7, 18, 35, 56, 138561)}]}

In [2]:
NUM_EXPERIMENTS = 4  # cap on how many experiments are pulled into the document

# Nest the first NUM_EXPERIMENTS experiments (with their features, parameters
# and metrics) under the project entry.
project_json["project"][0]["experiment"] = []
for i, e in enumerate(pr.experiments()):
    if i >= NUM_EXPERIMENTS:
        break

    # Shallow-copy the domain attributes: assigning the child lists below to
    # the original ``__dict__`` would add ``feature``/``parameter``/``metric``
    # attributes to the experiment's domain object itself.
    experiment_json = dict(e._domain.__dict__)

    experiment_json["feature"] = [dict(f._domain.__dict__) for f in e.features()]
    experiment_json["parameter"] = [dict(p._domain.__dict__) for p in e.parameters()]
    experiment_json["metric"] = [dict(m._domain.__dict__) for m in e.metrics()]

    project_json["project"][0]["experiment"].append(experiment_json)

In [None]:
# Display the assembled project document (project -> experiment -> feature/parameter/metric).
project_json

### equivalent JSON

constructed from the domain level dictionaries that represent each rubicon object

```python
{'project': [{'created_at': datetime.datetime(2022, 9, 7),
              'description': None,
              'experiment': [{'branch_name': 'main',
                              'commit_hash': '4b357a46f8dee744f384130766abf5c2372b1320',
                              'created_at': datetime.datetime(2022, 9, 7),
                              'description': None,
                              'feature': [{'created_at': datetime.datetime(2022, 9, 7),
                                           'description': None,
                                           'id': 'b5254f32-3bf0-48e8-9b89-630f10f29e5f',
                                           'importance': None,
                                           'name': 'island'},
                                          {'created_at': datetime.datetime(2022, 9, 7),
                                           'description': None,
                                           'id': '41ea3125-a20c-4ab0-a6dc-d5d7057c690c',
                                           'importance': None,
                                           'name': 'bill_length_mm'},
                                          {'created_at': datetime.datetime(2022, 9, 7),
                                           'description': None,
                                           'id': '64edb25b-a57d-4d02-93dc-fdbbf5a401b9',
                                           'importance': None,
                                           'name': 'bill_depth_mm'},
                                          {'created_at': datetime.datetime(2022, 9, 7),
                                           'description': None,
                                           'id': 'bc2698d2-4733-436c-aeaf-2ce23db8347a',
                                           'importance': None,
                                           'name': 'flipper_length_mm'},
                                          {'created_at': datetime.datetime(2022, 9, 7),
                                           'description': None,
                                           'id': '9a40be69-a574-48f7-a8c8-2319474fd2b8',
                                           'importance': None,
                                           'name': 'body_mass_g'},
                                          {'created_at': datetime.datetime(2022, 9, 7),
                                           'description': None,
                                           'id': 'b7b5ee92-12c4-44eb-8af9-0f4edb004c18',
                                           'importance': None,
                                           'name': 'sex'},
                                          {'created_at': datetime.datetime(2022, 9, 7),
                                           'description': None,
                                           'id': '6538676e-28f7-4e53-b9b7-ff2c961fd3c4',
                                           'importance': None,
                                           'name': 'year'}],
                              'id': '61feebc5-c75f-45bc-8db4-31355f657164',
                              'metric': [{'created_at': datetime.datetime(2022, 9, 7),
                                          'description': None,
                                          'directionality': 'score',
                                          'id': '2bcffdd9-2a11-479b-a439-70bad24d1805',
                                          'name': 'accuracy',
                                          'value': 0.7403846153846154}],
                              'model_name': None,
                              'name': None,
                              'parameter': [{'created_at': datetime.datetime(2022, 9, 7),
                                             'description': None,
                                             'id': 'a3650f88-86a5-4c53-bf22-ced84ef5bdd5',
                                             'name': 'strategy',
                                             'value': 'mean'},
                                            {'created_at': datetime.datetime(2022, 9, 7),
                                             'description': None,
                                             'id': '6b5ca832-fa09-4924-92bf-5909a148d57b',
                                             'name': 'n_neighbors',
                                             'value': 5}],
                              'project_name': 'classifying penguins',
                              'tags': [],
                              'training_metadata': None}
                             ],
              'github_url': 'https://github.com/capitalone/rubicon-ml.git',
              'id': 'c9674165-07bb-439f-b5c4-ea85d5bb3729',
              'name': 'classifying penguins',
              'training_metadata': None}]}
```

### `jsonpath_ng`

This appears to be the most commonly recommended Python implementation, although the project does not seem to be actively maintained.
> https://github.com/h2non/jsonpath-ng

#### get all metrics from each experiment

In [3]:
from jsonpath_ng.ext import parse

# Recursive descent (``..``) locates the ``experiment`` list; each match's
# value is the ``metric`` list of one experiment.
expr = parse("$..experiment[*].metric")
res = expr.find(project_json)

print(len(res))
for metric_list in (match.value for match in res):
    print(metric_list)

4
[{'name': 'accuracy', 'value': 0.7403846153846154, 'id': '2bcffdd9-2a11-479b-a439-70bad24d1805', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 9, 7, 18, 35, 56, 250355)}]
[{'name': 'accuracy', 'value': 0.7403846153846154, 'id': '2afd29f2-268c-4875-bca4-5e5031547ef4', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 9, 7, 18, 41, 22, 380741)}]
[{'name': 'accuracy', 'value': 0.7211538461538461, 'id': '5387eed6-6e9b-402d-9a0a-d88d85b74ee4', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 9, 7, 18, 41, 22, 549001)}]
[{'name': 'accuracy', 'value': 0.6923076923076923, 'id': '5fe10610-f999-4b36-9259-4154541a0757', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 9, 7, 18, 41, 22, 721612)}]


#### get all metrics named 'accuracy' with a value of at least 0.7 from each experiment

In [4]:
# Filter the individual metric entries: keep those named 'accuracy' whose
# value is >= 0.7.
query = "$..experiment[*].metric[?(@.name=='accuracy' & @.value>=0.7)]"
expr = parse(query)
res = expr.find(project_json)

values = [match.value for match in res]
print(len(values))
for value in values:
    print(value)

3
{'name': 'accuracy', 'value': 0.7403846153846154, 'id': '2bcffdd9-2a11-479b-a439-70bad24d1805', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 9, 7, 18, 35, 56, 250355)}
{'name': 'accuracy', 'value': 0.7403846153846154, 'id': '2afd29f2-268c-4875-bca4-5e5031547ef4', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 9, 7, 18, 41, 22, 380741)}
{'name': 'accuracy', 'value': 0.7211538461538461, 'id': '5387eed6-6e9b-402d-9a0a-d88d85b74ee4', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 9, 7, 18, 41, 22, 549001)}


#### get all experiments that contain a metric named 'accuracy' with a value of at least 0.7

In [5]:
# Filter at the experiment level: each match's value is a whole experiment
# dictionary rather than a single metric.
# NOTE(review): the two conditions are separate sub-queries over the
# experiment's metrics, so they may be satisfied by *different* metrics of the
# same experiment - confirm against jsonpath-ng's filter semantics.
expr = parse("$..experiment[?(@..metric[*].name=='accuracy' & @..metric[*].value>=0.7)]")
res = expr.find(project_json)

print(len(res), *(match.value for match in res), sep="\n")

3
{'project_name': 'classifying penguins', 'id': '61feebc5-c75f-45bc-8db4-31355f657164', 'name': None, 'description': None, 'model_name': None, 'branch_name': 'main', 'commit_hash': '4b357a46f8dee744f384130766abf5c2372b1320', 'training_metadata': None, 'tags': [], 'created_at': datetime.datetime(2022, 9, 7, 18, 35, 56, 234328), 'feature': [{'name': 'island', 'id': 'b5254f32-3bf0-48e8-9b89-630f10f29e5f', 'description': None, 'importance': None, 'created_at': datetime.datetime(2022, 9, 7, 18, 35, 56, 237973)}, {'name': 'bill_length_mm', 'id': '41ea3125-a20c-4ab0-a6dc-d5d7057c690c', 'description': None, 'importance': None, 'created_at': datetime.datetime(2022, 9, 7, 18, 35, 56, 239872)}, {'name': 'bill_depth_mm', 'id': '64edb25b-a57d-4d02-93dc-fdbbf5a401b9', 'description': None, 'importance': None, 'created_at': datetime.datetime(2022, 9, 7, 18, 35, 56, 240975)}, {'name': 'flipper_length_mm', 'id': 'bc2698d2-4733-436c-aeaf-2ce23db8347a', 'description': None, 'importance': None, 'created_a

#### returning `rubicon_ml` objects

In [6]:
from rubicon_ml.domain import Metric as DomainMetric
from rubicon_ml.client import Metric

# Query every experiment's metric list, then wrap each matched dictionary in a
# domain ``Metric`` and hand that to the client-level ``Metric``.
expr = parse("$..experiment[*].metric")
res = expr.find(project_json)

metrics = [
    Metric(DomainMetric(**metric_json))
    for match in res
    for metric_json in match.value
]

metrics

[<rubicon_ml.client.metric.Metric at 0x16975b220>,
 <rubicon_ml.client.metric.Metric at 0x1697c6950>,
 <rubicon_ml.client.metric.Metric at 0x1697c6c20>,
 <rubicon_ml.client.metric.Metric at 0x1697c7010>]

In [7]:
# Spot-check the first rebuilt object by reading its name and value.
first_metric = metrics[0]
print(first_metric.name)
print(first_metric.value)

accuracy
0.7403846153846154
