# JSONPath proof-of-concept

exploring a potential JSONPath-like interface for retrieving rubicon logs

### filesystem directory structure

```
|- project_a
|  |- artifacts
|  |  |- id_a
|  |  |  |- data
|  |  |  |_ metadata.json
|  |  |_ ...
|  |- dataframes
|  |  |- id_a
|  |  |  |- data
|  |  |  |_ metadata.json
|  |  |_ ...
|  |- experiments
|     |- id_a
|     |  |- artifacts
|     |  |  |- id_a
|     |  |  |  |- data
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |- dataframes
|     |  |  |- id_a
|     |  |  |  |- data
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |- features
|     |  |  |- name_a
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |- metrics
|     |  |  |- name_a
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |- parameters
|     |  |  |- name_a
|     |  |  |  |_ metadata.json
|     |  |  |_ ...
|     |  |_ metadata.json
|     |- id_b
|     |  |_ ...
|     |_ ...
|- project_b
|  |_ ...
|_ ...
```

In [1]:
import random

# Seed the RNG so the randomly chosen tags and metric values logged below are reproducible.
random.seed(24)

In [2]:
import pandas as pd
from rubicon_ml import Rubicon

# Number of experiments logged to the demo project.
NUM_EXPERIMENTS = 4

# Use rubicon_ml's "memory" persistence for this demo.
rb = Rubicon(persistence="memory")
pr = rb.get_or_create_project(name="jsonpath")

for _ in range(NUM_EXPERIMENTS):
    # Each experiment gets a single tag drawn at random from "a", "b", "c".
    tags = [random.choice(["a", "b", "c"])]
    ex = pr.log_experiment(tags=tags)
        
    for feature in ["f", "g", "h", "i"]:
        ex.log_feature(name=feature)
            
    for parameter in [("d", 100), ("e", 1000), ("f", 1000)]:
        name, value = parameter
        ex.log_parameter(name=name, value=value)
        
    for metric in ["j", "k"]:
        # Metrics "j" and "k" each get a random value (0.0 or 1.0) and one random tag.
        # `tags` is rebound here; the experiment has already been logged with its own tag.
        value = random.choice([0.0, 1.0])
        tags = [random.choice(["l", "m", "n"])]
        ex.log_metric(name=metric, value=value, tags=tags)
        
    ex.log_artifact(name="o", data_bytes=b"o")
    ex.log_dataframe(pd.DataFrame([[0, 1], [1, 0]]))
    
# Also log one artifact and one dataframe directly on the project.
pr.log_artifact(name="p", data_bytes=b"p")
pr.log_dataframe(pd.DataFrame([[0, 1], [1, 0]]))

pr

<rubicon_ml.client.project.Project at 0x1058a7a90>

In [3]:
import warnings
from rubicon_ml import Project, Experiment

def convert_to_json(rubicon_objects=None, projects=None, experiments=None):
    """Merge any mix of clients, projects and experiments into one dictionary.

    The inputs are converted in a fixed order -- `rubicon_objects`, then
    `projects`, then `experiments` -- and folded into a single result.
    Projects end up under the "project" key and experiments passed directly
    end up under a top-level "experiment" key.

    Parameters
    ----------
    rubicon_objects : Rubicon or list of Rubicon, optional
        Converted with `rubicon_to_json`.
    projects : Project or list of Project, optional
        Converted with `project_to_json`.
    experiments : Experiment or list of Experiment, optional
        Converted with `experiment_to_json`.

    Returns
    -------
    dict or None
        The merged dictionary, or ``None`` when nothing was converted.
    """
    combined = None

    if rubicon_objects is not None:
        combined = rubicon_to_json(rubicon_objects)

    if projects is not None:
        converted = project_to_json(projects)
        if combined is None:
            combined = converted
        else:
            combined["project"].extend(converted["project"])

    if experiments is not None:
        converted = experiment_to_json(experiments)
        if combined is None:
            combined = converted
        else:
            # A result built only from projects has no top-level
            # "experiment" list yet, so create it before extending.
            if combined.get("experiment") is None:
                combined["experiment"] = []
            combined["experiment"].extend(converted["experiment"])

    return combined

def experiment_to_json(experiment):
    """Convert one or more experiments into a JSON-like dictionary.

    For every experiment, the attributes of its domain object are copied
    into a new dictionary. The attribute dictionaries of its features,
    parameters, metrics, artifacts and dataframes are nested under the
    "feature", "parameter", "metric", "artifact" and "dataframe" keys.

    Parameters
    ----------
    experiment : Experiment or list of Experiment
        The experiment(s) to convert.

    Returns
    -------
    dict
        ``{"experiment": [...]}`` with one entry per input experiment.

    Raises
    ------
    ValueError
        If `experiment` is neither an `Experiment` nor a `list` made up
        only of `Experiment` objects.
    """
    experiments = experiment if isinstance(experiment, list) else [experiment]
    if not all(isinstance(e, Experiment) for e in experiments):
        raise ValueError("`experiment` must be of type `Experiment` or `list` of type `Experiment`")

    result = {"experiment": []}
    for e in experiments:
        # Copy the attribute dictionary rather than aliasing it: writing the
        # nested keys straight into `e._domain.__dict__` would add them as
        # attributes on the caller's domain object.
        experiment_json = dict(e._domain.__dict__)

        experiment_json["feature"] = [dict(f._domain.__dict__) for f in e.features()]
        experiment_json["parameter"] = [dict(p._domain.__dict__) for p in e.parameters()]
        experiment_json["metric"] = [dict(m._domain.__dict__) for m in e.metrics()]
        experiment_json["artifact"] = [dict(a._domain.__dict__) for a in e.artifacts()]
        experiment_json["dataframe"] = [dict(d._domain.__dict__) for d in e.dataframes()]

        result["experiment"].append(experiment_json)

    return result

def project_to_json(project):
    """Convert one or more projects into a JSON-like dictionary.

    For every project, the attributes of its domain object are copied into
    a new dictionary. The attribute dictionaries of its artifacts and
    dataframes are nested under the "artifact" and "dataframe" keys, and its
    experiments (converted with `experiment_to_json`) under "experiment".

    Parameters
    ----------
    project : Project or list of Project
        The project(s) to convert.

    Returns
    -------
    dict
        ``{"project": [...]}`` with one entry per input project.

    Raises
    ------
    ValueError
        If `project` is neither a `Project` nor a `list` made up only of
        `Project` objects.
    """
    projects = project if isinstance(project, list) else [project]
    if not all(isinstance(p, Project) for p in projects):
        raise ValueError("`project` must be of type `Project` or `list` of type `Project`")

    result = {"project": []}
    for p in projects:
        # Copy the attribute dictionary rather than aliasing it: writing the
        # nested keys straight into `p._domain.__dict__` would add them as
        # attributes on the caller's domain object.
        project_json = dict(p._domain.__dict__)

        project_json["artifact"] = [dict(a._domain.__dict__) for a in p.artifacts()]
        project_json["dataframe"] = [dict(d._domain.__dict__) for d in p.dataframes()]
        project_json["experiment"] = experiment_to_json(p.experiments())["experiment"]

        result["project"].append(project_json)

    return result

def rubicon_to_json(rubicon):
    """Convert every project of one or more `Rubicon` clients to a dictionary.

    Parameters
    ----------
    rubicon : Rubicon or list of Rubicon
        The client(s) whose projects are converted with `project_to_json`.

    Returns
    -------
    dict or None
        ``{"project": [...]}`` holding the projects of all given clients in
        iteration order, or ``None`` when no project was found.

    Raises
    ------
    ValueError
        If `rubicon` is neither a `Rubicon` nor a `list` made up only of
        `Rubicon` objects.
    """
    clients = rubicon if isinstance(rubicon, list) else [rubicon]
    if not all(isinstance(client, Rubicon) for client in clients):
        raise ValueError("`rubicon_objects` must be of type `Rubicon` or `list` of type `Rubicon`")

    merged = None
    for client in clients:
        for found_project in client.projects():
            converted = project_to_json(found_project)
            if merged is None:
                # The first converted project seeds the result.
                merged = converted
            else:
                merged["project"].extend(converted["project"])

    return merged

In [4]:
# Serialize the single project `pr`; its artifacts, dataframes and experiments are nested inside it.
project_json = project_to_json(pr)
project_json

{'project': [{'name': 'jsonpath',
   'id': 'f7623f85-6304-4bd5-8d40-0cad0c650589',
   'description': None,
   'github_url': None,
   'training_metadata': None,
   'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 615660),
   'artifact': [{'name': 'p',
     'id': 'b1678dd5-b0f9-463d-9ae9-2f8f5e4c6da3',
     'description': None,
     'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 620330),
     'tags': [],
     'parent_id': 'f7623f85-6304-4bd5-8d40-0cad0c650589'}],
   'dataframe': [{'id': '75befadd-3b35-4178-935c-d6eb0d7a70ca',
     'name': None,
     'description': None,
     'tags': [],
     'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 620485),
     'parent_id': 'f7623f85-6304-4bd5-8d40-0cad0c650589'}],
   'experiment': [{'project_name': 'jsonpath',
     'id': 'ae3e613d-243c-4526-a22f-4ee97e006ca6',
     'name': None,
     'description': None,
     'model_name': None,
     'branch_name': None,
     'commit_hash': None,
     'training_metadata': None,
     't

In [5]:
# Serialize every project returned by the `rb` client.
rubicon_json = rubicon_to_json(rb)
rubicon_json

{'project': [{'name': 'jsonpath',
   'id': 'f7623f85-6304-4bd5-8d40-0cad0c650589',
   'description': None,
   'github_url': None,
   'training_metadata': None,
   'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 615660),
   'artifact': [{'name': 'p',
     'id': 'b1678dd5-b0f9-463d-9ae9-2f8f5e4c6da3',
     'description': None,
     'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 620330),
     'tags': [],
     'parent_id': 'f7623f85-6304-4bd5-8d40-0cad0c650589'}],
   'dataframe': [{'id': '75befadd-3b35-4178-935c-d6eb0d7a70ca',
     'name': None,
     'description': None,
     'tags': [],
     'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 620485),
     'parent_id': 'f7623f85-6304-4bd5-8d40-0cad0c650589'}],
   'experiment': [{'project_name': 'jsonpath',
     'id': 'ae3e613d-243c-4526-a22f-4ee97e006ca6',
     'name': None,
     'description': None,
     'model_name': None,
     'branch_name': None,
     'commit_hash': None,
     'training_metadata': None,
     't

In [6]:
# Log a fresh experiment with nothing attached, so every nested list in the result is empty.
ex = pr.log_experiment(name="test_experiment_to_json")
experiment_json = experiment_to_json(ex)
experiment_json

{'experiment': [{'project_name': 'jsonpath',
   'id': '5ee82ceb-f37c-4a40-91b3-b3baf3794877',
   'name': 'test_experiment_to_json',
   'description': None,
   'model_name': None,
   'branch_name': None,
   'commit_hash': None,
   'training_metadata': None,
   'tags': [],
   'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 652830),
   'feature': [],
   'parameter': [],
   'metric': [],
   'artifact': [],
   'dataframe': []}]}

In [7]:
# Combine all three input kinds in one call. `pr` was created from `rb`, so its entry is appended
# to "project" a second time, and `ex` is added under a top-level "experiment" key.
converted_to_json = convert_to_json(rubicon_objects=rb, projects=pr, experiments=ex)
converted_to_json

{'project': [{'name': 'jsonpath',
   'id': 'f7623f85-6304-4bd5-8d40-0cad0c650589',
   'description': None,
   'github_url': None,
   'training_metadata': None,
   'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 615660),
   'artifact': [{'name': 'p',
     'id': 'b1678dd5-b0f9-463d-9ae9-2f8f5e4c6da3',
     'description': None,
     'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 620330),
     'tags': [],
     'parent_id': 'f7623f85-6304-4bd5-8d40-0cad0c650589'}],
   'dataframe': [{'id': '75befadd-3b35-4178-935c-d6eb0d7a70ca',
     'name': None,
     'description': None,
     'tags': [],
     'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 620485),
     'parent_id': 'f7623f85-6304-4bd5-8d40-0cad0c650589'}],
   'experiment': [{'project_name': 'jsonpath',
     'id': 'ae3e613d-243c-4526-a22f-4ee97e006ca6',
     'name': None,
     'description': None,
     'model_name': None,
     'branch_name': None,
     'commit_hash': None,
     'training_metadata': None,
     't

### `jsonpath_ng`

this seems to be the most recommended Python implementation, though the project does not appear to be actively maintained
> https://github.com/h2non/jsonpath-ng

In [8]:
from jsonpath_ng.ext import parse

class RubiconJSON:
    """Searchable wrapper around a JSON-like dictionary of rubicon_ml logs.

    Parameters
    ----------
    rubicon_json : dict
        The dictionary to run JSONPath queries against.
    """

    def __init__(self, rubicon_json):
        # Stored as given; `search` queries this object directly.
        self._rubicon_json = rubicon_json

    def search(self, query):
        """Return what `jsonpath_ng` finds for the JSONPath string `query`."""
        expression = parse(query)
        matches = expression.find(self._rubicon_json)
        return matches
    
# Wrap the dictionary built from the whole `rb` client so the queries below can search it.
rb_json = RubiconJSON(rubicon_json)

#### get all metrics from each experiment

In [9]:
# `$..experiment` recursively finds every "experiment" key, `[*]` visits each experiment in that
# list, and `.metric` selects its list of metrics -- so there is one match per experiment.
res = rb_json.search("$..experiment[*].metric")

print(f"{len(res)} experiments")
for match in res:
    print(f"{len(match.value)} metrics")
    print(match.value)

4 experiments
2 metrics
[{'name': 'j', 'value': 1.0, 'id': '0abc9d98-f52a-46ac-b01d-a54c946b6295', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 616390), 'tags': ['n']}, {'name': 'k', 'value': 0.0, 'id': '5f719b8d-02fc-42f3-b50b-4dfe4a286836', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 616458), 'tags': ['l']}]
2 metrics
[{'name': 'j', 'value': 0.0, 'id': 'f94817d4-1a04-4a6e-bd86-c664b004a120', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 618325), 'tags': ['l']}, {'name': 'k', 'value': 0.0, 'id': '226053c2-3c74-4009-a586-e9b12f5d17a0', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 618398), 'tags': ['n']}]
2 metrics
[{'name': 'j', 'value': 1.0, 'id': '95977f76-534c-445a-a6a1-98d0e8f04d40', 'description': None, 'directionality': 'score', 'created_at': d

#### get all experiments with tag 'b'

In [10]:
# `[?(...)]` is a filter: keep the experiments whose `tags` list contains 'b'.
res = rb_json.search("$..experiment[?(@.tags[*]=='b')]")

print(f"{len(res)} experiments")
for match in res:
    print(match.value)

1 experiments
{'project_name': 'jsonpath', 'id': '09aac206-1adf-4198-b518-5ee90b6d0d3d', 'name': None, 'description': None, 'model_name': None, 'branch_name': None, 'commit_hash': None, 'training_metadata': None, 'tags': ['b'], 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 619501), 'feature': [{'name': 'f', 'id': '661f505f-7a42-4d58-9536-68d4fbfc6f7c', 'description': None, 'importance': None, 'tags': [], 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 619545)}, {'name': 'g', 'id': '27526299-065a-4230-98ce-bb7a87c73baf', 'description': None, 'importance': None, 'tags': [], 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 619612)}, {'name': 'h', 'id': '5620f41b-e02a-47b9-b93c-675cbfe45ee7', 'description': None, 'importance': None, 'tags': [], 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 619659)}, {'name': 'i', 'id': 'a62c1c91-ce80-4c74-ac64-3c207c203ca1', 'description': None, 'importance': None, 'tags': [], 'created_at': datetime.datetime(2022, 12,

#### get all metrics named 'j' with a value greater than or equal to 0.5 from each experiment

In [11]:
# Filter each experiment's metrics, keeping those named 'j' whose value is >= 0.5.
res = rb_json.search("$..experiment[*].metric[?(@.name=='j' & @.value>=0.5)]")

print(f"{len(res)} metrics")
for match in res:
    print(match.value)

2 metrics
{'name': 'j', 'value': 1.0, 'id': '0abc9d98-f52a-46ac-b01d-a54c946b6295', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 616390), 'tags': ['n']}
{'name': 'j', 'value': 1.0, 'id': '95977f76-534c-445a-a6a1-98d0e8f04d40', 'description': None, 'directionality': 'score', 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 619115), 'tags': ['n']}


#### get all experiments that contain a metric named 'j' with a value less than or equal to 0.5

In [12]:
# Nested filter: the inner `[?(@.name=='j')]` picks the metric named 'j', and the outer filter
# keeps the experiments where that metric's value is <= 0.5.
res = rb_json.search("$..experiment[?(@.metric[?(@.name=='j')].value<=0.5)]")

print(f"{len(res)} experiments")
for match in res:
    print(match.value)

2 experiments
{'project_name': 'jsonpath', 'id': '91188e82-3aa4-4ada-b3d9-4f6c12f9f0a3', 'name': None, 'description': None, 'model_name': None, 'branch_name': None, 'commit_hash': None, 'training_metadata': None, 'tags': ['a'], 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 617934), 'feature': [{'name': 'f', 'id': 'dc015594-b228-442b-bf79-77891d262011', 'description': None, 'importance': None, 'tags': [], 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 617982)}, {'name': 'g', 'id': '38a752ca-49ab-4083-b8d8-009e88c9494c', 'description': None, 'importance': None, 'tags': [], 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 618043)}, {'name': 'h', 'id': 'ab19afb6-019d-4fa6-87e5-204bd5aa4385', 'description': None, 'importance': None, 'tags': [], 'created_at': datetime.datetime(2022, 12, 9, 15, 20, 41, 618089)}, {'name': 'i', 'id': '943afafe-69b1-41e3-a961-2e7bf1995e19', 'description': None, 'importance': None, 'tags': [], 'created_at': datetime.datetime(2022, 12,

#### returning `rubicon_ml` objects

In [13]:
from rubicon_ml.domain import Metric as DomainMetric
from rubicon_ml.client import Metric

class NoOpParent:
    """Read-only placeholder parent whose configuration is always ``None``."""

    @property
    def _config(self):
        """Return ``None``: this parent carries no configuration."""
        return None

In [14]:
# Keep the metrics named 'j' whose value is >= 0.5. Each match's `value` is the metric's
# attribute dictionary, so it is unpacked into a domain `Metric`, which is then wrapped in a
# client `Metric` with a `NoOpParent` as its parent.
res = rb_json.search("$..experiment[*].metric[?(@.name=='j' & @.value>=0.5)]")

metrics = []
for match in res:
    metrics.append(Metric(DomainMetric(**match.value), NoOpParent()))
        
metrics

[<rubicon_ml.client.metric.Metric at 0x1058a7ca0>,
 <rubicon_ml.client.metric.Metric at 0x13ed72320>]

In [15]:
# Read the queried values back through the rebuilt client objects' attributes.
for m in metrics:
    print(m.name, m.value)

j 1.0
j 1.0
