# Dataset-to-Hub pipeline: reduce Time-to-Publish in onboarding datasets 

Goal: from a PI System with associated AF database, publish the dataset through OCS with a set of default asset-centric Data Views. 

#### Showcase datasets: Deschutes and UC Davis Facilities. Next: NC State paper machines, USC drill data

## Step 1: Populate GraphQL-enabled "AF" 

#### From an AF element path, collect PI Points, static attributes, etc. for all of its child elements. 

#### This data is then directly accessible through a GraphQL endpoint for Step 2, the creation of (graph) data views. 


In [None]:
# import to deal with PIWebAPI
# !pip install httpx
import requests
import json
import time
from time import process_time
import yaml
import asyncio
import httpx

from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# from py2neo import Graph

# Graph module
import py2neo
from py2neo import Node, Relationship

In [None]:
# Select the dataset configuration to run: keep exactly one line uncommented.
# config_file = "config-omf-health.yaml"
# config_file = "config-prod-ucd-v2.yaml"
config_file = "config-windfarm.yaml"
# config_file = "config-acad-prod-desc-v2.yaml"
# config_file = "config-acad-prod-deschutes.yaml"
# config_file = "config-acad-prod-ucd.yaml"

## Overview

From an anchor element path in AF, populate a GraphQL-enabled "mirror" of its child elements and their attributes, extracted from PIWebAPI.

### Input parameters

* PIWebAPI base URL for asset servers
* PIWebAPI authentication credentials (basic)
* Anchor element path

In [None]:
# Load the pipeline settings selected by `config_file`.
# The encoding is given explicitly so parsing does not depend on the
# platform's default text encoding.
with open(config_file, encoding="utf-8") as f:
    config = yaml.safe_load(f)
# config

In [None]:
# PI Web API connection settings
base_url = config["piwebapi"]["base_url"]
dataserver = config["piwebapi"]["dataserver"]  # "uni-pida-vm0"

# Basic-auth credentials and the timeout handed to the httpx requests below
# (presumably seconds — confirm against the httpx documentation)
auth = (config["piwebapi"]["username"], config["piwebapi"]["password"])
timeout = 45.0

# Starting URL for walking an anchor path (see elements_of)
asset_url = base_url + "/assetservers"

# Credentials are masked in the output on purpose
print(f"piwebapi={base_url}, dataserver={dataserver}, auth=(*****, *****)")

# Anchor element paths to process, one graph per path
element_roots = config["piwebapi"]["element_roots"]

test_asset = None

# Asset database name; also used as the prefix of mapped WebIds (see map_webid)
ocs_asset_db = config["ocs"]["configuration"]["asset_db"]

EXPAND_LEAF_ELEMENTS = config["piwebapi"]["expand_leaf_elements"]  # False
# Cap on how many leaf elements are turned into additional anchor paths
MAX_LEAF_ELEMENTS = 10000
print(f"roots = {element_roots}, expand leaves: {EXPAND_LEAF_ELEMENTS}")

### Functions to navigate PIWebAPI structure 

In [None]:
def map_webid(webid):
    """Return *webid* prefixed with the asset database name and a colon."""
    return "{}:{}".format(ocs_asset_db, webid)


# From PIWebAPI asset node, build directory of children links and self WebID
# If extract_key is present, return this directory item
# Otherwise return full dictionary (key is children node name)
async def extract_url(url, key, extract_key=None, debug=False):
    """Fetch a PI Web API collection and index its items by name.

    Args:
        url: PI Web API URL whose JSON response carries an "Items" list.
        key: Name of the entry to keep from each item's "Links"
            (the callers use "Databases", "Elements" and "Attributes").
        extract_key: If truthy, return only the entry of the item with this
            name instead of the whole directory.
        debug: When true, print the raw JSON payload.

    Returns:
        A dict mapping item name to
        ``(link, mapped_webid, template_name, description)``, or that single
        tuple when ``extract_key`` is given. ``None`` when the response status
        is not 200 or when ``extract_key`` is not among the items.
    """
    async with httpx.AsyncClient() as client:
        print("<", end="")  # progress marker, one per directory request
        r = await client.get(url, auth=auth, timeout=timeout)
        if r.status_code != 200:
            print(f"@error  code={r.status_code}, url={url}")
            return None
        js = r.json()
        if debug:
            print(f"js={js}")
        d = {
            i["Name"]: (
                i["Links"][key],
                map_webid(i["WebId"]),
                # Template and description are optional in the payload
                i.get("TemplateName", ""),
                i.get("Description", ""),
            )
            for i in js["Items"]
        }
        if not extract_key:
            return d
        result = d.get(extract_key)
        if result is None:
            print(
                f"[@error url={url}, key={key}, extract={extract_key}, d={d}]"
            )
        return result


# Extract static value v2
async def extract_static_value2(attr_info, client):
    """Resolve a static attribute through its "Value" link, keeping the value as is."""

    def unchanged(raw):
        return raw

    resolved = await extract_point_or_value(
        attr_info, client, link_key="Value", value_key="Value", value_f=unchanged
    )
    return resolved


# Extract point data v2
async def extract_point2(attr_info, client):
    """Resolve a PI Point attribute through its "Point" link.

    The point name is returned with a ``tag__`` prefix.
    """
    # Earlier variant: "Path", lambda t: extract_tag(t)

    def tagged(point_name):
        return "tag__{}".format(point_name)

    resolved = await extract_point_or_value(
        attr_info, client, link_key="Point", value_key="Name", value_f=tagged
    )
    return resolved


# Extract point or static value from attribute
async def extract_point_or_value(attr_info, client, link_key, value_key, value_f):
    """Follow one link of an attribute and extract a value from the response.

    Args:
        attr_info: PI Web API attribute JSON; must contain a "Links" mapping.
        client: Async HTTP client exposing ``get(url, auth=..., timeout=...)``.
        link_key: Which entry of ``attr_info["Links"]`` to follow.
        value_key: Key of the fetched JSON whose value is extracted.
        value_f: Callable applied to the extracted value.

    Returns:
        ``(attr_info, value_f(value), response_json, point_attributes)``.
        ``point_attributes`` is only fetched when ``link_key == "Point"`` and
        is ``None`` otherwise. When the link is absent or any request does not
        return HTTP 200, ``(None, None, None, None)`` is returned so callers
        can always unpack four values.
    """
    unresolved = (None, None, None, None)
    target_url = attr_info["Links"].get(link_key)
    if target_url is None:
        return unresolved
    print(".", end="")  # progress marker, one per attribute lookup
    r = await client.get(target_url, auth=auth, timeout=timeout)
    if r.status_code != 200:
        print(f"@get error code={r.status_code} url={target_url}")
        return unresolved
    js = r.json()
    point_attributes = None
    if link_key == "Point":
        attributes_url = js["Links"]["Attributes"]
        r2 = await client.get(attributes_url, auth=auth, timeout=timeout)
        if r2.status_code != 200:
            # Report the status of the request that actually failed (r2)
            print(f"@get error code={r2.status_code} url2={attributes_url}")
            return unresolved
        point_attributes = r2.json()["Items"]
    return attr_info, value_f(js[value_key]), js, point_attributes


def create_or_update_node(
    node_type, name, webid, template="", asset_db="", parent=None
):
    """Build a graph node carrying the common identification properties.

    Args:
        node_type: Node label (e.g. "Element", "Attribute", "PIPoint").
        name: Value of the node's ``name`` property.
        webid: Stored as the node's ``id`` property.
        template: Stored as ``af_template``.
        asset_db: Stored as ``asset_db``.
        parent: Unused; kept so existing call sites stay valid.

    Returns:
        A new ``Node``. Despite the name, no existing node is looked up.
    """
    # NOTE: a lookup of an existing node by id used to run here and is
    # disabled; the branch that returned its result could never execute.
    ##query = f"MATCH (node) WHERE node.id = '{webid}' RETURN node"
    node = Node(node_type, name=name)  ### node_type, "Node"
    node.update(id=webid, af_template=template, asset_db=asset_db)
    # if node_type == "PIPoint":
    #    node.update(on_ocs=1)
    return node


# Registry filled by elements_of(): holds the "Database" node and the first
# anchor "Element" node that was resolved.
important_nodes = {}

# Navigate fully AF `path` in PIWebAPI
# If `lgraph` is not None, update it while traversing tree
# (lgraph at this point is a list of Node and Relationship)
async def elements_of(url, path, lgraph=None):
    """Walk a backslash-separated AF path and list the last element's children.

    Path component 0 is looked up among "Databases" links and recorded as a
    "Server" node, component 1 as the "Database" node, and every further
    component as an "Element" node.

    Args:
        url: URL to start from (the callers pass the asset servers URL).
        path: AF path; empty components are ignored.
        lgraph: Optional list that receives the nodes and relationships
            created along the path.

    Returns:
        ``(children, anchor)`` where ``children`` is what
        ``extract_url(url, "Attributes")`` returns for the last resolved URL
        (a dict keyed by child name, or ``None`` on error) and ``anchor`` is
        the last node created (``None`` if no node was created).

    Side effects:
        Stores the database node in ``important_nodes["Database"]`` and, if
        not already set, the anchor node in ``important_nodes["Element"]``.
    """
    components = [part for part in path.split("\\") if part]
    parent = None
    asset_db = ""
    for i, component in enumerate(components):
        # url, webid, template, description
        result = await extract_url(
            url, "Databases" if i == 0 else "Elements", component
        )
        if result is None:
            print(f"[@@@error: url={url}, i={i}, component={component}]")
            continue
        url, webid, template, description = result
        if lgraph is None:
            continue

        if i == 0:
            node_type, rel_type = "Server", "NOT_POSSIBLE"
        elif i == 1:
            node_type, rel_type = "Database", "HAS_DATABASE"
        else:
            node_type, rel_type = "Element", "HAS_ELEMENT"

        if node_type == "Database":
            asset_db = ocs_asset_db  # component
        node = create_or_update_node(
            node_type, component, webid, template, asset_db
        )
        if node_type == "Database":
            # Database metadata comes from the config rather than from AF
            node.update(
                name=config["db"]["database_name"],
                informationURL=config["db"]["infoURL"],
                description=config["db"]["description"],
                namespace=config["ocs"]["configuration"]["namespace"],
                status="onboarding",
            )
            important_nodes[node_type] = node
        elif node_type == "Element":
            node.update(description=description if description != "" else template)
        if parent is not None:
            lgraph.append(Relationship(parent, rel_type, node))
        lgraph.append(node)
        parent = node

    # `parent` is the last node created; it stays None when lgraph is None or
    # no component resolved, in which case there is nothing to register.
    if parent is not None and important_nodes.get("Element") is None:
        important_nodes["Element"] = parent
    print(f"important nodes={important_nodes}")
    urls_dict = await extract_url(url, "Attributes")
    return urls_dict, parent


# From an element URL, return a dictionary of attribute and their PIWebAPI JSON info
# PIWebAPI attribute JSON to be parsed and transferred to graph
async def attributes(url):
    """Fetch an element's attributes and resolve each one.

    Attributes whose "DataReferencePlugIn" is "PI Point" are resolved with
    ``extract_point2``; those with an empty plug-in name are resolved with
    ``extract_static_value2``. Attributes with any other plug-in are ignored.

    Args:
        url: PI Web API URL of the element's attribute collection.

    Returns:
        Dict mapping attribute name to the resolver's result. An empty dict
        is returned when the request does not return HTTP 200, so callers can
        iterate the result unconditionally.
    """
    async with httpx.AsyncClient() as client:
        print(">", end="")  # progress marker, one per element
        r = await client.get(url, auth=auth, timeout=timeout)
        if r.status_code != 200:
            print("@error")
            return {}
        items = r.json()["Items"]

        # PI Point attributes are resolved first, then static ones
        points = {i["Name"]: i for i in items if i["DataReferencePlugIn"] == "PI Point"}
        for name in list(points):
            points[name] = await extract_point2(points[name], client)

        statics = {i["Name"]: i for i in items if i["DataReferencePlugIn"] == ""}
        for name in list(statics):
            statics[name] = await extract_static_value2(statics[name], client)

        return {**points, **statics}


def convert_config_data(value):
    """Normalize an attribute value before it is stored on a graph node.

    * Floats whose text form ends in ``.0`` become ints; other floats are
      returned unchanged.
    * Anything ``int()`` accepts is returned as an int.
    * Otherwise, if ``value["Name"]`` exists it replaces the value.
    * Strings have single quotes removed; any other value is returned as is.

    Args:
        value: Raw value as found in the PI Web API JSON.

    Returns:
        The normalized value.
    """
    # isinstance (rather than an exact type match) keeps float subclasses from
    # falling through to int(), which would truncate them.
    if isinstance(value, float):
        return int(value) if str(value).endswith(".0") else value
    try:
        return int(value)
    except (ValueError, TypeError):
        pass
    try:
        candidate = value["Name"]
    except (KeyError, IndexError, TypeError):
        candidate = value
    if isinstance(candidate, str):
        return candidate.replace("'", "")  # f"'{val}'"
    # None, lists, dicts without "Name", ...: nothing to clean up
    return candidate

### Transform linear graph (list of Node and Relationship) into list of GraphQL mutations

In [None]:
# GraphQL client object
# NOTE(review): verify=False presumably turns off TLS certificate checks for
# the GraphQL endpoint — confirm this is intended outside of development.
sample_transport = RequestsHTTPTransport(
    url=config["graphql"]["endpoint"], verify=False, retries=3
)
# Shared by commit_graph() and the query cells further down
client = Client(transport=sample_transport, fetch_schema_from_transport=True)

In [None]:
def _graphql_string(value):
    """Render *value* as a double-quoted, escaped GraphQL string literal."""
    # JSON string escapes (\" \\ \n \uXXXX ...) are valid GraphQL escapes, so
    # quotes or backslashes inside a value no longer break the mutation text.
    return json.dumps(str(value), ensure_ascii=False)


def _label_name(node):
    """Return the node's label text with ':' and 'Node' stripped."""
    return str(node.labels).replace(":", "").replace("Node", "")


def commit_graph(g, debug=True):
    """Send a linear graph to the GraphQL endpoint as Merge mutations.

    All nodes are merged first (``Merge<Label>``), then all relationships
    (``Merge<StartLabel><Relationship type, capitalized>``), so both ends of
    a relationship have been sent before it is.

    Args:
        g: List of py2neo nodes and relationships; ``None`` entries are
            reported and skipped.
        debug: When true, print the first entry and the elapsed time.
    """
    start_time = time.perf_counter()
    # tx = graph.begin()
    if debug and g:  # guard: an empty graph has no root to show
        print(f"\n>> Graph root={g[0]}")
    # [tx.create(i) for i in g if i is not None]
    for node in g:
        if node is None:
            print("### [None]")
            continue
        if not isinstance(node, py2neo.data.Node):
            continue
        ntype = _label_name(node)
        d = dict(node)
        d.pop("step", None)  ## temporary
        d.pop("on_ocs", None)  ## temporary
        # Every property is sent as a string argument
        args = ",".join(f" {key}: {_graphql_string(val)}" for key, val in d.items())
        print("@@@@@", ntype, d)
        node_mutation = gql(
            f"""
        mutation
           {{ Merge{ntype}({args}) {{
               id
               name
           }}
           }}
        """
        )
        client.execute(node_mutation)
    for relation in g:
        if relation is None:
            print("### [None]")
            continue
        if isinstance(relation, py2neo.data.Node):
            continue
        # Anything that is not a node is treated as a relationship
        start_node = relation.start_node
        end_node = relation.end_node
        ntype = _label_name(start_node)
        merge_suffix = type(relation).__name__.capitalize()
        from_id = _graphql_string(start_node["id"])
        to_id = _graphql_string(end_node["id"])
        args = f"from: {{id: {from_id}}}, to: {{id: {to_id}}}"
        mutation = f"""
        mutation {{
            Merge{ntype}{merge_suffix}({args}) {{
                from {{
                    name
                }}
                to {{
                    name
                }}
            }}
        }}
        """
        relation_mutation = gql(mutation)
        ## print(">>>>> mutation=", mutation)
        reply = client.execute(relation_mutation)
        print("<<<<<<", reply)
    # tx.commit()
    if debug:
        print(f">> commit done in {time.perf_counter() - start_time:.2f} secs")

### Create all children element nodes of anchor element and their attributes

For attributes, transfer relevant information onto the associated graph node

In [None]:
def reject_element(element_name):
    """Return True when the element name marks an element to be skipped.

    The match is case-insensitive and looks for "_cache", "baseline" or
    "health" anywhere in the name.
    """
    lowered = element_name.lower()
    for fragment in ("_cache", "baseline", "health"):
        if fragment in lowered:
            return True
    return False

# Optional per-dataset override: the config may carry Python source under
# "reject_element" (presumably redefining reject_element — confirm against
# the config files).
# SECURITY NOTE: exec() runs that source with full privileges; only use
# configuration files from a trusted origin.
_reject_element_src = config.get("reject_element")
if _reject_element_src:
    try:
        exec(_reject_element_src)
    except Exception as exc:
        # Keep the default filter, but do not hide a broken snippet
        print(f"@warning: config 'reject_element' could not be applied: {exc!r}")

In [None]:
async def generate_leaf_element_attributes(elements_path):
    """Build and commit the graph of one anchor path and its child elements.

    For every child element of the anchor that is not rejected by
    ``reject_element``, an "Element" node is created together with one
    "Attribute" node per static attribute (HAS_ATTRIBUTE) and one "PIPoint"
    node per PI Point attribute (HAS_DYNAMIC). The whole list is committed
    once at the end through ``commit_graph``.

    Args:
        elements_path: Backslash-separated AF path of the anchor element.

    Returns:
        None. Nothing is committed when the anchor or its child listing
        cannot be resolved, or when an attribute result is malformed.
    """
    lgraph = []
    elem_attr_urls, elem_anchor = await elements_of(
        asset_url, elements_path, lgraph=lgraph
    )
    print(f"\n>> Current anchor: {elem_anchor}")
    if elem_attr_urls is None or elem_anchor is None:
        # The path did not resolve or the child listing failed; nothing
        # usable can be built for this anchor.
        print(f"@@ skipped: {elements_path} (anchor or child listing unavailable)")
        return
    # elem_anchor, len(lgraph if lgraph else []), list(elem_attr_urls.keys())
    for element_name in elem_attr_urls:
        # UC Davis + CMU specific
        if reject_element(element_name):
            print(f"@@ rejected: {element_name}")
            continue
        elem_url, webid, template, description = elem_attr_urls[element_name]
        elem_node = create_or_update_node(
            "Element", element_name, webid, template, asset_db=elem_anchor["asset_db"]
        )
        elem_node.update(description=description if description != "" else template)
        elem_rel = Relationship(elem_anchor, "HAS_ELEMENT", elem_node)
        # A failed attribute listing is treated as "no attributes"
        attributes_info = (await attributes(elem_url)) or {}
        static_attributes = []
        for attr, attr_result in attributes_info.items():
            if attr_result is None:
                # The lookup failed upstream: skip it like any other
                # unresolved attribute instead of failing on the unpack.
                continue
            try:
                attr_info, stream_or_val, stream_js, point_attrs = attr_result
            except ValueError:
                print(f"attr={attr}  val={attr_result}")
                return
            if attr_info is None:
                continue
            if "tag__" not in str(stream_or_val):
                # Static attribute: the value is stored on the node itself
                node = create_or_update_node(
                    "Attribute",
                    attr,
                    map_webid(attr_info["WebId"]),
                    template=elem_node["af_template"],
                    asset_db=elem_node["asset_db"],
                )
                node.update(
                    value=convert_config_data(stream_or_val), type=attr_info["Type"]
                )
                rel = Relationship(elem_node, "HAS_ATTRIBUTE", node)
                lgraph.extend([node, rel])
                static_attributes += [attr]
                continue

            stream_name = stream_or_val.replace("tag__", "")
            if "Analysis" in attr:  # NOTE: duplicate tag in attributes for FV
                # skip attribute with duplicate tag
                print("=", end="")
                continue
            if stream_js["Future"]:
                # future tag not transferred by PItoOCS
                print(f"${attr}$", end="")
                continue
            node = create_or_update_node(
                "PIPoint",
                attr,
                map_webid(attr_info["WebId"]),
                elem_node["af_template"],
                elem_node["asset_db"],
            )
            print("+", end="")  # stream_name
            pattributes = {i["Name"]: i["Value"] for i in point_attrs}
            node.update(
                asset_id=element_name,
                column_name=attr,
                stream_name=stream_name,
                type=attr_info["Type"],
                uom=attr_info["DefaultUnitsName"],
                step=attr_info["Step"],
                categories=attr_info["CategoryNames"],
                description=attr_info["Description"],
                pointsource=pattributes["pointsource"],
                stream_id=f"PI_{config['piwebapi']['dataserver']}_{pattributes['pointid']}",
            )
            if (
                attr_info["Type"] == "EnumerationValue"
                or stream_js["PointType"] == "Digital"
            ):
                # print(f"node={node}, js={stream_js}")
                node.update(digital_set_name=stream_js["DigitalSetName"])
            rel = Relationship(elem_node, "HAS_DYNAMIC", node)
            lgraph.extend([node, rel])
        elem_node.update(static_attributes=static_attributes)
        lgraph.extend([elem_node, elem_rel])
    commit_graph(lgraph)

### Generate and commit graphs (one per anchor element)

In [None]:
async def build_graphs(roots):
    """Generate and commit one graph per anchor path, one path at a time."""
    for anchor_path in roots:
        await generate_leaf_element_attributes(anchor_path)

In [None]:
# Build and commit the graphs for every configured anchor path and report the
# elapsed wall-clock time.
# NOTE(review): uses top-level `await`, so this presumably only runs inside a
# notebook/IPython session with a running event loop.
print(element_roots)
start_time = time.perf_counter()
await build_graphs(element_roots)
print(f"> runtime {time.perf_counter() - start_time:.2f} secs")

In [None]:
# Pick up the nodes registered by elements_of() during the run above;
# raises KeyError if no database or anchor element was registered.
db_node = important_nodes["Database"]
elem_root = important_nodes["Element"]
root_webid = elem_root["id"]  # mapped WebId stored as the node's "id"
print(elem_root)

### (Optional) From a test element, produce the list of all attributes

Data Views are created from different subsets of this list

In [None]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


async def gather_func(f, items, ndiv):
    """Run coroutine function *f* concurrently over slices of *items*.

    ``items`` is split into at most ``ndiv`` consecutive chunks and ``f`` is
    called once per chunk; the calls are awaited together.

    Args:
        f: Coroutine function taking one chunk (a slice of ``items``).
        items: Sliceable sequence of work items.
        ndiv: Maximum number of chunks (must be non-zero).

    Prints "-OK-" when at least one chunk was processed, "@oops" otherwise,
    followed by the elapsed time.
    """
    # Ceiling division, floored at 1 so that empty input does not produce a
    # zero chunk size.
    size = max(1, -(-len(items) // ndiv))
    # One coroutine per chunk that actually exists: there may be fewer than
    # ndiv of them, so indexing 0..ndiv-1 would fail.
    coroutines = [f(chunk) for chunk in chunks(items, size)]
    start_time = time.perf_counter()
    results = await asyncio.gather(*coroutines)
    print("-OK-" if results else "@oops")
    print(f"> runtime {time.perf_counter() - start_time:.2f} secs")

In [None]:
# define reject_element function if needed in config yaml
# SECURITY NOTE: exec() runs the configured source with full privileges; only
# use configuration files from a trusted origin.
_reject_leaf_element_src = config.get("reject_leaf_element")
if _reject_leaf_element_src:
    try:
        exec(_reject_leaf_element_src)
    except Exception as exc:
        # Keep the current definition, but do not hide a broken snippet
        print(f"@warning: config 'reject_leaf_element' could not be applied: {exc!r}")

# Query returning the names of the leaf elements of one asset database
query_leaves = gql(
    """
query Database($asset_db: String) {
    Database(asset_db: $asset_db) {
        leaf_elements {
            name
        }
    }
}"""
)


# When enabled, turn every leaf element (capped at MAX_LEAF_ELEMENTS) into a
# new anchor path below the first configured root. The paths are only built
# and printed here: the call that would process them is commented out.
if EXPAND_LEAF_ELEMENTS:
    ndiv = 3
    root = element_roots[0]
    # query = f"MATCH (e:Element) WHERE NOT ((e)-[:HAS_ELEMENT]->()) and e.asset_db='{ocs_asset_db}' RETURN DISTINCT e.name AS name ORDER BY e.name"
    # leaf_elements = [i["name"] for i in graph.run(query).data()]
    result = client.execute(query_leaves, variable_values={"asset_db": ocs_asset_db})
    print(result)
    # Only the first database entry of the reply is used
    leaf_elements = [i["name"] for i in result["Database"][0]["leaf_elements"]]
    roots = [root + f"\\{element}" for element in leaf_elements[:MAX_LEAF_ELEMENTS]]
    print(len(roots), roots)
    # await gather_func(build_graphs, roots, ndiv)
In [None]:
# Locate the configured data server in PI Web API and fetch its enumeration
# sets. Requests carry the shared timeout so a stalled server cannot hang the
# run indefinitely.
dataserver_urls = requests.get(
    base_url + "/dataservers", auth=auth, timeout=timeout
).json()["Items"]
enums_url = None
for ds in dataserver_urls:
    if dataserver == ds["Name"]:
        print(ds)
        enums_url = ds["Links"]["EnumerationSets"]
        break
if enums_url is None:
    # Fail with a clear message instead of a NameError further down
    raise LookupError(
        f"data server {dataserver!r} not found at {base_url}/dataservers"
    )
print(enums_url)
enums = requests.get(enums_url, auth=auth, timeout=timeout)
enums

In [None]:
# Find all Digital Sets referred to by dataset PIPoint
query_pipoint = gql(
    """
query PIPoint($asset_db: String!) { 
    PIPoint(asset_db: $asset_db) {
        digital_set_name
        id
    }
}
"""
)

result = client.execute(
    query_pipoint, variable_values={"asset_db": ocs_asset_db}
)  # ocs_asset_db})
print(result)

# Keep only the points that carry a digital set name
digital_sets = [
    i["digital_set_name"]
    for i in result["PIPoint"]
    if i["digital_set_name"] is not None
]
# print(set(digital_sets))

# De-duplicated digital set names (order is not defined)
dataset_enums = list(set(digital_sets))
dataset_enums

In [None]:
# Build DigitalState node with all states, add relationship :STATE_FROM from node with values in that DigitalState set

# The GraphQL documents and the enumeration-set listing do not change between
# iterations, so they are prepared once.
digital_query = gql(
    """
    query PIPoint($asset_db: String, $digital_set: String) {
        PIPoint(asset_db: $asset_db, digital_set_name: $digital_set) {
            id
        }
    }
    """
)
digital_mutation = gql(
    """
    mutation Digital($from: _PIPointInput!, $to: _DigitalStateInput!) {
        MergePIPointState_from(from: $from, to: $to) {
            from {
                name
            }
            to {
                name
            }
        }
    }
    """
)
enum_items = enums.json()["Items"]

for digital_set in dataset_enums:
    # print("digital_set:", digital_set)
    d = {}
    for i in enum_items:
        if i["Name"] != digital_set:
            continue
        print(f"==> processing {i}...")
        vals = requests.get(i["Links"]["Values"], auth=auth, timeout=timeout).json()
        for v in vals["Items"]:
            if v["Name"] != "undefined":
                # print(f"{v['Value']} - {v['Name']}")
                d[str(v["Value"])] = v["Name"]
        node = create_or_update_node(
            "DigitalState",
            digital_set,
            map_webid(i["WebId"]),
            asset_db=ocs_asset_db,
        )
        # The state table is stored as the text form of the dict
        node.update(states=str(d))
        lgraph = [node]
        commit_graph(lgraph)
        result = client.execute(
            digital_query,
            variable_values={
                "asset_db": ocs_asset_db,
                "digital_set": digital_set,
            },  # ocs_asset_db
        )
        print(result)
        # Link every PIPoint that uses this digital set to the new node
        for p in result["PIPoint"]:
            reply = client.execute(
                digital_mutation,
                variable_values={"from": {"id": p["id"]}, "to": {"id": node["id"]}},
            )
            print(">>>>", reply)
        # NOTE: an earlier variant matched the points with a Cypher query and
        # appended Relationship(p, "STATE_FROM", node) to lgraph, then stopped
        # after the first matching enumeration set.