In [0]:
# %pip install -U -qqqq unitycatalog-ai[databricks] mlflow-skinny[databricks] langgraph==0.3.4 databricks-langchain databricks-agents python-dotenv uv
# %restart_python

# Setup


In [0]:
# Clear any widgets left over from a previous run so the defaults defined in the next cell apply.
# NOTE: kept in its own cell on purpose -- per the Databricks widgets documentation, widgets
# cannot be created in the same cell that calls removeAll().
dbutils.widgets.removeAll()

In [0]:
# Each widget is declared and immediately read back into a Python variable of the same name.
# The widgets are what the `:TARGET_CATALOG` / `:TARGET_SCHEMA` parameter markers in the
# %sql cells refer to, so Python and SQL cells share one pair of settings.
dbutils.widgets.text("TARGET_CATALOG", "workshop")
TARGET_CATALOG = dbutils.widgets.get("TARGET_CATALOG")

dbutils.widgets.text("TARGET_SCHEMA", "invoices")
TARGET_SCHEMA = dbutils.widgets.get("TARGET_SCHEMA")

# Echo the effective values so the run output records which catalog/schema was targeted.
print(
    f"TARGET_CATALOG: {TARGET_CATALOG}",
    f"TARGET_SCHEMA: {TARGET_SCHEMA}",
    sep="\n",
)

In [0]:
%sql

-- Creates the target catalog, schema and `data` volume when they do not exist yet, then
-- switches the session to the target schema. :TARGET_CATALOG and :TARGET_SCHEMA are
-- parameter markers that take their values from the notebook widgets of the same name.
--
-- Depending on your Unity Catalog setup you may not be allowed to create catalogs from a
-- notebook. In that case create the catalog manually first, comment out the CREATE CATALOG
-- statement below, and rerun this cell.
CREATE CATALOG IF NOT EXISTS IDENTIFIER(:TARGET_CATALOG);

-- String literals are single-quoted: double-quoted text is only a string literal under the
-- default settings and is read as an identifier when double-quoted identifiers are enabled.
CREATE SCHEMA IF NOT EXISTS IDENTIFIER(:TARGET_CATALOG || '.' || :TARGET_SCHEMA);
CREATE VOLUME IF NOT EXISTS IDENTIFIER(:TARGET_CATALOG || '.' || :TARGET_SCHEMA || '.data');
USE IDENTIFIER(:TARGET_CATALOG || '.' || :TARGET_SCHEMA);

In [0]:
# Grant every account user the privileges listed below on the target schema.
#
# TARGET_CATALOG / TARGET_SCHEMA come from free-text widgets and are expected to hold raw
# (unquoted) names. Each part is wrapped in backticks, with any embedded backtick doubled, so
# the value is always treated as a single identifier and cannot alter the statement.
schema_fqn = ".".join(
    f"`{part.replace('`', '``')}`" for part in (TARGET_CATALOG, TARGET_SCHEMA)
)

spark.sql(
    f"GRANT USE SCHEMA, SELECT, EXECUTE, READ VOLUME ON SCHEMA {schema_fqn} TO `account users`"
)

In [0]:
import os
import shutil
from pathlib import Path

DATA_PATH = os.getcwd() + '/data'  # invoices shipped next to this notebook
VOLUME_PATH = f"/Volumes/{TARGET_CATALOG}/{TARGET_SCHEMA}/data"


def _visible_files(directory):
    """Return the regular, non-hidden files directly inside `directory`, sorted by path.

    These are the entries the previous `rm -f dir/*` / `cp dir/*` shell commands acted on:
    a `*` glob skips dot-files, and without `-r` both commands leave sub-directories alone.

    Raises:
        FileNotFoundError: if `directory` does not exist.
    """
    return sorted(
        entry
        for entry in Path(directory).iterdir()
        if entry.is_file() and not entry.name.startswith(".")
    )


# Check the source BEFORE touching the volume, so a wrong working directory or an empty data
# folder cannot leave the volume wiped with nothing to replace its contents.
invoice_files = _visible_files(DATA_PATH)
if not invoice_files:
    raise FileNotFoundError(f"No invoice files found in {DATA_PATH}")

print(f"Removing existing Invoices from {VOLUME_PATH}...")
for stale_file in _visible_files(VOLUME_PATH):
    stale_file.unlink()

print(f"Copying Invoices from {DATA_PATH} to {VOLUME_PATH}...")
for invoice_file in invoice_files:
    # copyfile transfers file contents only (no permission bits or other metadata).
    # Unlike os.system, a failure here raises instead of being silently ignored.
    shutil.copyfile(invoice_file, Path(VOLUME_PATH) / invoice_file.name)

# 🧠 Databricks AI Functions

Databricks SQL includes built-in AI functions that let you call foundation models directly in SQL for parsing documents or extracting entities.  

---

## 📄 `ai_parse_document(content)`

**Purpose:**  
Parses binary documents (PDF, Word, PowerPoint, images) into structured JSON output — extracting text, tables, and figures.  

📘 Docs: [ai_parse_document](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_parse_document)

---

## 📄 `ai_extract(content, labels)`

**Purpose:**  
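Extracts specific entities, fields, or attributes (named by `labels`) from text content using LLMs — ideal for structured extraction tasks. To extract from a binary document, parse it with `ai_parse_document` first and pass the result as text.  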

📘 Docs: [ai_extract](https://docs.databricks.com/aws/en/sql/language-manual/functions/ai_extract)

In [0]:
%sql
-- Parse a single invoice image from the volume to inspect the raw ai_parse_document output.
-- The file is read with the binaryFile format and its `content` column is handed to the parser.
WITH invoice_file AS (
  SELECT content
  FROM read_files(
    '/Volumes/' || :TARGET_CATALOG || '/' || :TARGET_SCHEMA || '/data/HandwrittenInvoice1.jpg',
    format => 'binaryFile'
  )
)
SELECT ai_parse_document(content)
FROM invoice_file;

In [0]:
%sql
-- For every file in the volume: parse it, serialize the parse result to JSON, and ask
-- ai_extract for four labelled fields. The labels are then projected as separate columns.
WITH invoice_files AS (
  -- One row per file under the data volume, read as binary.
  SELECT path, content
  FROM read_files(
    '/Volumes/' || :TARGET_CATALOG || '/' || :TARGET_SCHEMA || '/data/',
    format => 'binaryFile'
  )
),
extracted AS (
  -- ai_extract is given the JSON text of the parsed document plus the labels to look for.
  SELECT
    path,
    ai_extract(
      to_json(ai_parse_document(content)),
      array('Supplier', 'PO #', 'Invoice Number', 'Total Amount')
    ) AS extracted_fields
  FROM invoice_files
)
SELECT
  path,
  extracted_fields['Supplier']       AS supplier,
  extracted_fields['PO #']           AS po_number,
  extracted_fields['Invoice Number'] AS invoice_number,
  extracted_fields['Total Amount']   AS amount
FROM extracted