In [1]:
# Both inputs sit in the same output folder, so the folder is spelled once.
_charges_output_dir = "output/2025-airport-charges-terms-and-conditions"

# Markdown file named after the charges document itself.
markdown_path = _charges_output_dir + "/2025-airport-charges-terms-and-conditions.md"
# Smaller markdown file; this is the one read into `content` further down.
tiny_markdown_path = _charges_output_dir + "/tinychargesmarkdown.md"

In [2]:
from typing import Optional, Literal, Union
from pydantic import BaseModel, Field, validator
from __future__ import annotations

# Operators an internal expression node may carry.
Op = Literal[
    "+", "-", "*", "/",
    "and", "or", "==",
]
# Concrete results an expression can evaluate to.
Scalar = Union[int, float, bool]
# Leaf payload: a scalar constant or a string (looked up in the evaluation env).
Value = Union[int, float, bool, str]

In [3]:
from pydantic import field_validator

class Node(BaseModel):
    childA  : Optional["Node"] = Field(None, description="left sub-expression")
    childB  : Optional["Node"] = Field(None, description="right sub-expression")
    operator: Optional[Op] = None
    value   : Optional[Value] = None


    @field_validator("operator", mode="after")
    @classmethod
    def _op_required(cls, v, info):
        if info.data.get("value") is None and v is None:
            raise ValueError("operator required when value is None")
        return v

    def is_leaf(self) -> bool:
        return self.value is not None

    def evaluate(self, env: dict[str, Scalar] | None = None) -> Scalar:
        if self.is_leaf():
            return env.get(self.value, self.value) if env else self.value
        a, b = self.childA.evaluate(env), self.childB.evaluate(env)
        match self.operator:
            case "+":   return a + b
            case "-":   return a - b
            case "*":   return a * b
            case "/":   return a / b
            case "and": return a and b
            case "or":  return a or b
            case "==":  return a == b
        raise ValueError(f"Unsupported operator {self.operator!r}")

Node.model_rebuild()


In [4]:

# Assemble 3 + (4 * 2) bottom-up: leaves first, then the product, then the sum.
three, four, two = Node(value=3), Node(value=4), Node(value=2)
product = Node(operator="*", childA=four, childB=two)
tree = Node(operator="+", childA=three, childB=product)

# The expression has no variables, so an empty environment is sufficient.
result = tree.evaluate({})
assert result == 11
print("✅  Expression tree returns:", result)


✅  Expression tree returns: 11


In [5]:
# Two sample charge records; only the "atm" and "mtow" fields differ.
charges = {
    charge_id: {"atm": atm, "period": "summer", "mtow": mtow}
    for charge_id, atm, mtow in (
        ("airport_charges1", "landing", 10),
        ("airport_charges2", "takeoff", 2),
    )
}

# Open question: does Outlines only guarantee valid JSON by discarding anything else the model generates, or does it fundamentally change the prompt so that the model is told to output in that format?

In [6]:
from __future__ import annotations
from typing import Literal, Optional, Union
from pydantic import BaseModel, Field, field_validator

# Allowed operators for internal nodes of the expression tree.
Op = Literal[
    "+",
    "-",
    "*",
    "/",
    "and",
    "or",
    "==",
]

# What an expression evaluates to.
Scalar = Union[int, float, bool]

# What a leaf may hold: a scalar, or a string resolved through the env mapping.
Value = Union[int, float, bool, str]

class Node(BaseModel):
    childA  : Optional["Node"] = Field(None, description="left operand / sub-expression")
    childB  : Optional["Node"] = Field(None, description="right operand / sub-expression")
    operator: Optional[Op] = None          # present on internal nodes
    value   : Optional[Value] = None       # present on leaves

    @field_validator("operator", mode="after")
    @classmethod
    def _operator_required(cls, v, info):
        if info.data.get("value") is None and v is None:
            raise ValueError("Non-leaf nodes must have an operator")
        return v

    def is_leaf(self):            return self.value is not None
    def evaluate(self, env=None):               # same logic you already tested
        if self.is_leaf():
            return env.get(self.value, self.value) if env else self.value
        a, b = self.childA.evaluate(env), self.childB.evaluate(env)
        return {"+": a+b, "-": a-b, "*": a*b, "/": a/b,
                "and": a and b, "or": a or b, "==": a == b}[self.operator]

Node.model_rebuild()                            # resolves forward refs (v2 syntax) :contentReference[oaicite:0]{index=0}
schema = Node.model_json_schema()               # one-liner to JSON-Schema :contentReference[oaicite:1]{index=1}


In [7]:
# Read the small markdown file in full; its text is interpolated into the prompts below.
with open(tiny_markdown_path, mode="r", encoding="utf-8") as markdown_file:
    content = markdown_file.read()

In [8]:
# First prompt draft: embeds the loaded markdown (`content`) and asks for a
# JSON `Node` tree only. The leading backslash suppresses the initial newline
# of the triple-quoted string. `prompt` is reassigned in a later cell.
prompt = f"""\
You are a deterministic parser.

**Input Markdown**
{content}


**Task**
1. Identify the arithmetic expression.
2. Emit JSON that is a valid `Node` tree *exactly* matching the schema you have been trained on.
3. Use numbers as `value` leaves; use '+' and '*' for internal `operator` keys.

Return ONLY that JSON.
"""


In [9]:
import outlines

In [None]:
# @outlines.prompt
# def take_order(content):
#     """You are an expert aviation cost analyst. Your task is to extract all aircraft charging rules from the provided document text.
#     Your job is to then convert the rules into a computation graph.

#     Document Text to Analyze:
#     ---
#     {{content}}
#     ---

#     # EXAMPLE

#     content: # 3.5. Transfer Passenger Charge  /n <html> <table><thead><tr><th>Charging Basis (€)</th><th>Summer Airline<br>Scheduling Season</th><th>Winter Airline<br>Scheduling Season</th></tr></thead><tbody><tr><td>Transfer Passenger Charge</td><td><span style="background-color: yellow;">3.90</span></td><td><span style="background-color: yellow;">2.80</span></td></tr></tbody></table></html> bullet Transfer Passenger information shall be provided via passenger transfer messages (PTM). Where valid information is provided in this manner the transfer rate will be charged to the Operator through the normal billing process. \bullet Airline positioning crews are not exempt from these charges. - A QRF will have its Transfer Passenger Charge exempted. This means that the QRF departing Transfer Passenger Charge will not be raised on the first departure. The subsequent second departure of that flight will attract the standard Transfer Passenger Charge. This Transfer Passenger Charge contributes to airport security, passenger screening and provision of infrastructure for hold baggage screening.  
 
#     RESULT: {"pizza": "Margherita", "number": 1}

#     # OUTPUT INSTRUCTIONS

#     Answer with a valid computation graph. Here are the objects relevant to the output:

#     Order:
#         pizza (str): name of the pizza
#         number (int): number of pizzas

#     Return a valid JSON of type "Order"

#     # OUTPUT

#     ORDER: {{ order }}
#     RESULT: """

In [11]:
from __future__ import annotations
from typing import Optional, Literal, Union
from pydantic import BaseModel, Field, field_validator

# Operator vocabulary for internal nodes.
Op = Literal["+", "-", "*", "/",
             "and", "or", "=="]
# Result type of an evaluated expression.
Scalar = Union[int, float, bool]
# Leaf payload: a scalar constant, or a string resolved through the env mapping.
Value = Union[int, float, bool, str]

class Node(BaseModel):
    childA  : Optional["Node"] = Field(None, description="left subtree / operand")
    childB  : Optional["Node"] = Field(None, description="right subtree / operand")
    operator: Optional[Op]     = None          # required on internal nodes
    value   : Optional[Value]  = None          # present on leaves

    @field_validator("operator", mode="after")
    @classmethod
    def _op_required(cls, v, info):
        if info.data.get("value") is None and v is None:
            raise ValueError("operator required when value is None")
        return v

    def evaluate(self, env: dict[str, Scalar] | None = None) -> Scalar:
        if self.value is not None:
            return env.get(self.value, self.value) if env else self.value
        a, b = self.childA.evaluate(env), self.childB.evaluate(env)
        return {"+": a+b, "-": a-b, "*": a*b, "/": a/b,
                "and": a and b, "or": a or b, "==": a == b}[self.operator]

Node.model_rebuild()                                  # resolves forward refs :contentReference[oaicite:1]{index=1}
schema = Node.model_json_schema()                     # 1-liner to JSON Schema :contentReference[oaicite:2]{index=2}


In [12]:
import torch
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen3-30B-A3B"

# Step 1: load the checkpoint with Hugging Face transformers.
hf_load_options = {
    "torch_dtype": torch.bfloat16,   # load the weights as bfloat16
    "device_map": "auto",            # let the library decide device placement
    "low_cpu_mem_usage": True,       # intended to limit peak host RAM while loading
}
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **hf_load_options)

# Step 2: pair the model with its tokenizer and wrap both for Outlines.
tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = outlines.from_transformers(hf_model, tok)
json_match = outlines.json_schema(schema)


Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

In [17]:
# Echo the loaded markdown so the exact text fed into the prompt can be inspected.
content

'# 3.5. Transfer Passenger Charge  \n\n<html>\n<table><thead><tr><th>Charging Basis (€)</th><th>Summer Airline<br>Scheduling Season</th><th>Winter Airline<br>Scheduling Season</th></tr></thead><tbody><tr><td>Transfer Passenger Charge</td><td><span style="background-color: yellow;">3.90</span></td><td><span style="background-color: yellow;">2.80</span></td></tr></tbody></table>\n</html>  \n\n\\bullet Transfer Passenger information shall be provided via passenger transfer messages (PTM). Where valid information is provided in this manner the transfer rate will be charged to the Operator through the normal billing process. \\bullet Airline positioning crews are not exempt from these charges.  \n\n- A QRF will have its Transfer Passenger Charge exempted. This means that the QRF departing Transfer Passenger Charge will not be raised on the first departure. The subsequent second departure of that flight will attract the standard Transfer Passenger Charge.  \n\nThis Transfer Passenger Charge 

In [31]:
# Free-text condition interpolated into the prompt under the "Conditions:" heading.
conditions = "season: winter airline scheduling season"


In [32]:
# Task description: interpolates `conditions` and the document text (`content`)
# and asks for a single JSON `Node` graph. Replaces the earlier `prompt` draft.
prompt = f"""You are an **aviation cost analyst**.  
Extract every aircraft-charging rule from the supplied *Document Text* based on the *Conditions* and
return **one JSON object** that encodes the rule as a recursive `Node`
computation graph **AND NOTHING ELSE**.

Conditions:
{ conditions }

Document Text:
---
{ content }
---
""" 
# Static output rules plus one worked example (3 + (4 × 2)). Kept as a plain
# (non-f) string, so the literal JSON braces below need no escaping.
instructions = """
# OUTPUT INSTRUCTIONS
1. Your output **must** be valid JSON that conforms to the `Node` schema you know.
2. The root object represents the final numeric expression for the charge.
3. Use numeric literals for constants; use lowercase snake-case strings for variables
   (e.g. `"transfer_pax"`).  
4. Do **not** wrap your JSON in markdown fences or additional keys.

# EXAMPLE (for guidance only)
Input snippet:  
“Fee formula is 3 + (4 × 2).”

Expected JSON:
{
  "operator": "+",
  "childA": { "value": 3 },
  "childB": {
      "operator": "*",
      "childA": { "value": 4 },
      "childB": { "value": 2 }
  }
}
"""


In [33]:
# Append the output instructions exactly once. A bare `prompt = prompt + instructions`
# is not idempotent: every re-run of this cell would add another copy of the
# instructions to the prompt sent to the model.
if not prompt.endswith(instructions):
    prompt = prompt + instructions
prompt

'You are an **aviation cost analyst**.  \nExtract every aircraft-charging rule from the supplied *Document Text* based on the *Conditions* and\nreturn **one JSON object** that encodes the rule as a recursive `Node`\ncomputation graph **AND NOTHING ELSE**.\n\nConditions:\nseason: winter airline scheduling season\n\nDocument Text:\n---\n# 3.5. Transfer Passenger Charge  \n\n<html>\n<table><thead><tr><th>Charging Basis (€)</th><th>Summer Airline<br>Scheduling Season</th><th>Winter Airline<br>Scheduling Season</th></tr></thead><tbody><tr><td>Transfer Passenger Charge</td><td><span style="background-color: yellow;">3.90</span></td><td><span style="background-color: yellow;">2.80</span></td></tr></tbody></table>\n</html>  \n\n\\bullet Transfer Passenger information shall be provided via passenger transfer messages (PTM). Where valid information is provided in this manner the transfer rate will be charged to the Operator through the normal billing process. \\bullet Airline positioning crews a

In [34]:
from outlines import Generator
from outlines.types import JsonSchema

# Wrap the Node JSON Schema so it can be passed to the generator as its output type.
to_node    = JsonSchema(schema)
generator  = Generator(model, to_node)   # FSM compiled only once
# Run the model on the assembled prompt, allowing at most 512 newly generated tokens.
graph_json = generator(prompt, max_new_tokens=512)

# Parse the generated JSON text into a validated Node tree (raises if it does not validate).
node_obj = Node.model_validate_json(graph_json)


In [35]:
# Display the parsed Node tree produced from the model output.
node_obj

Node(childA=None, childB=None, operator='+', value=3.9)