In [0]:
%pip install python-docx mlflow  --upgrade --pre
# NOTE(review): no versions are pinned and --pre permits pre-release builds,
# so the installed python-docx / mlflow versions can differ between runs.

# Restart the Python process after the install (presumably so the upgraded
# packages are the ones picked up by the cells below - confirm).
dbutils.library.restartPython()

In [0]:
#Python utilities
import os, json, uuid, requests, datetime, re
from docx import Document
from pathlib import Path

#Spark utilities
from pyspark.sql import functions as F, types as T

#Databricks utilities
from dbruntime.databricks_repl_context import get_context

In [0]:
# Databricks secret scope/key that hold the personal access token (PAT)
secret_scope_name = "general"
secret_key_name = "genie_access"

# Resolve and validate the workspace URL *before* building the host string.
# An `is not None` check on os.environ[...] can never fail (a missing key raises
# KeyError and stored values are always strings), and concatenating a None URL
# would raise an unhelpful TypeError, so the value is checked up front instead.
workspace_url = get_context().workspaceUrl
if not workspace_url:
    raise ValueError(
        "Could not determine the Databricks workspace URL from the notebook context"
    )

# Inject the serving host into the environment for use by the agent
os.environ["DB_MODEL_SERVING_HOST_URL"] = "https://" + workspace_url

# Fetch the PAT and reject a missing/empty secret before exporting it
genie_pat = dbutils.secrets.get(scope=secret_scope_name, key=secret_key_name)
if not genie_pat:
    raise ValueError(
        "The DATABRICKS_GENIE_PAT was not properly set to the PAT secret"
    )
os.environ["DATABRICKS_GENIE_PAT"] = genie_pat

In [0]:
# Catalog and schema (database) names that the table and volume paths below are
# built from. Kept in a dedicated cell because this is the setting most likely
# to need frequent updates.
catalog, db = "ademianczuk", "suncor_ehs"

In [0]:
# --- Operating environment -------------------------------------------------
# Host and token are read back from the environment variables populated by the
# secrets cell; either is None when the variable is absent.
DATABRICKS_HOST = os.getenv("DB_MODEL_SERVING_HOST_URL")
DATABRICKS_TOKEN = os.getenv("DATABRICKS_GENIE_PAT")
FM_ENDPOINT = "databricks-llama-4-maverick"  # your foundation model endpoint name

# --- Storage ----------------------------------------------------------------
# Generated documents live under <volume root>/docs; create the directory
# (and any missing parents) up front, tolerating an existing one.
VOL_ROOT = f"/Volumes/{catalog}/{db}/data"
DOC_OUT_DIR = f"{VOL_ROOT}/docs"
os.makedirs(DOC_OUT_DIR, exist_ok=True)

# --- Tables -----------------------------------------------------------------
# Fully qualified <catalog>.<schema>.<table> names.
SCENARIO_TABLE = f"{catalog}.{db}.scenarios"
DOCS_TABLE = f"{catalog}.{db}.docs"

In [0]:
import traceback
import tempfile
import shutil

def call_chat(messages, temperature=0.2, max_tokens=1200):
    """
    Calls the Databricks Foundation Model serving endpoint (chat-style).

    The request is sent to
    ``{DATABRICKS_HOST}/serving-endpoints/{FM_ENDPOINT}/invocations`` and is
    authenticated with ``DATABRICKS_TOKEN`` as a bearer token.

    Args:
        messages: List of chat messages, each a dict with "role" and "content".
        temperature: Sampling temperature forwarded to the endpoint.
        max_tokens: Maximum number of tokens forwarded to the endpoint.

    Returns:
        The ``content`` of the first choice's message when the response has the
        ``choices[0].message.content`` shape and that content is non-empty;
        otherwise ``str()`` of the whole parsed response.

    Raises:
        ValueError: If ``DATABRICKS_HOST`` is not configured.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    if not DATABRICKS_HOST:
        raise ValueError("DATABRICKS_HOST is not set; cannot build the serving endpoint URL")

    # Build the URL from the notebook configuration instead of a hard-coded
    # workspace/endpoint, so the request goes to the workspace the token was
    # issued for and follows FM_ENDPOINT when it is changed.
    url = f"{DATABRICKS_HOST.rstrip('/')}/serving-endpoints/{FM_ENDPOINT}/invocations"
    headers = {"Authorization": f"Bearer {DATABRICKS_TOKEN}", "Content-Type": "application/json"}
    payload = {
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens
    }
    r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=120)
    r.raise_for_status()
    resp = r.json()

    # Databricks FM APIs mirror an OpenAI-like schema; adjust if your endpoint
    # returns a different shape. Only the 'choices' structure is understood here.
    content = None
    if isinstance(resp, dict):
        choices = resp.get("choices")
        if choices:
            msg = choices[0].get("message") or {}
            content = msg.get("content")

    # Fall back to the stringified response so the caller always gets text back.
    return content or str(resp)

In [0]:
# System prompt: scoring rubric and the JSON shape the model is asked to return.
sum_system = """You are an expert maintenance prioritization analyst.
Given a corrective-action document, extract each action and score it using this rubric:
1) RiskReduction (0-5), 2) DowntimeAvoided (0-5), 3) CostEffectiveness (0-5), 4) TimeToImplement (0-5, invert score so faster=5), 5) Repeatability (0-5).
Compute ImpactScore = 0.35*RiskReduction + 0.25*DowntimeAvoided + 0.20*CostEffectiveness + 0.10*TimeToImplement + 0.10*Repeatability.
Return JSON with fields: actions: [{title, justification, scores:{...}, ImpactScore}], plus a brief summary."""

# Upper bound on the characters of document text sent per request
# (keeps the prompt under the model's context window).
MAX_DOC_CHARS = 120000


def _extract_json_blob(text):
    """Return the first substring of ``text`` that parses as a JSON object, else "{}"."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON document starting at `start` and reports
            # where it ended, so prose or code fences around the JSON are ignored.
            _, end = decoder.raw_decode(text, start)
            return text[start:end]
        except ValueError:
            start = text.find("{", start + 1)
    return "{}"


docs = spark.table(DOCS_TABLE).collect()
rank_rows = []
for d in docs:
    raw_text = d.raw_text or ""
    if not raw_text.strip():
        # Nothing to score: keep one row per document, but skip the model call
        # instead of failing on a NULL/empty raw_text.
        rank_rows.append((d.scenario_id, d.title, d.docx_path, "{}", ""))
        continue

    summary = call_chat([
        {"role":"system","content": sum_system},
        {"role":"user","content": raw_text[:MAX_DOC_CHARS]}
    ], temperature=0.5, max_tokens=5000)

    # Light validation: store only text that actually parses as a JSON object;
    # the full model reply is still kept in the summary column.
    json_blob = _extract_json_blob(summary)

    rank_rows.append((d.scenario_id, d.title, d.docx_path, json_blob, summary))

schema = "scenario_id string, title string, docx_path string, ranking_json string, summary string"
spark.createDataFrame(rank_rows, schema).write.mode("overwrite").saveAsTable(f"{catalog}.{db}.rankings")

print(f"Ranking complete. Inspect table {catalog}.{db}.rankings for top actions per doc.")

In [0]:
# Ask the model where its context for the AI4I 2020 data comes from and show
# the reply.
provenance_messages = [
    {
        "role": "system",
        "content": "You are a general purpose assistant. You will answer truthfully and completely regarding the source and origin of where your data comes from and what is available.",
    },
    {
        "role": "user",
        "content": "Where are you getting the remaining context for the AI4I 2020 data? When I look at the dataset I have it only consists of maybe 14 columns with no descriptors. How are you generating the context, corrective and actions based on this data?",
    },
]
content = call_chat(provenance_messages, temperature=0.2, max_tokens=5000)

print(content)