In [1]:
from llm_engine import Engine
from openai import AsyncOpenAI

# Base URL of the OpenAI-compatible server reached through an ngrok tunnel.
NGROK_LINK = "https://" + "fitting-shrimp-pleasantly.ngrok-free.app"

# model = "Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8"
model = "Qwen/Qwen3-14B"

# The API key is the literal string "None" (not Python's None); the requests
# go to the "/v1" route below the tunnel address.
client_vllm = AsyncOpenAI(base_url=str(f"{NGROK_LINK}/v1"), api_key="None")

engine = Engine(client_vllm, chat=True, model=model)

In [None]:
from llm_engine import Engine
from openai import AsyncOpenAI
from dotenv import load_dotenv
import os

# Alternative backend: running this cell rebinds `client_vllm`, `model` and
# `engine`, replacing whatever an earlier cell assigned to those names.
load_dotenv(".env")

# The key is looked up in the process environment after the .env file is loaded.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

model = "gpt-4o-mini"
client_vllm = AsyncOpenAI(api_key=OPENAI_API_KEY)
engine = Engine(client_vllm, chat=True, model=model)

In [8]:
import os
from task_tester import TaskRunner, TaskType
from pddl_translation.translator import StandardTranslator, NatLangTranslator

# Blocksworld domain description passed to the TaskRunner as `context`.
# Written as adjacent string literals so every rule sits on its own source line;
# the concatenated value is unchanged.
context = (
    "I am playing with a set of blocks where I need to arrange the blocks into stacks. "
    "Here are the actions I can do\n\n"
    "Pick up a block\n"
    "Unstack a block from on top of another block\n"
    "Put down a block\n"
    "Stack a block on top of another block\n\n"
    "I have the following restrictions on my actions:\n"
    "I can only pick up or unstack one block at a time.\n"
    "I can only pick up or unstack a block if my hand is empty.\n"
    "I can only pick up a block if the block is on the table and the block is clear. "
    "A block is clear if the block has no other blocks on top of it and if the block is not picked up.\n"
    "I can only unstack a block from on top of another block "
    "if the block I am unstacking was really on top of the other block.\n"
    "I can only unstack a block from on top of another block if the block I am unstacking is clear.\n"
    "Once I pick up or unstack a block, I am holding the block.\n"
    "I can only put down a block that I am holding.\n"
    "I can only stack a block on top of another block if I am holding the block being stacked.\n"
    "I can only stack a block on top of another block "
    "if the block onto which I am stacking the block is clear.\n"
    "Once I put down or stack a block, my hand becomes empty.\n"
    "Once you stack a block on top of a second block, the second block is no longer clear."
)
# Translator handed to the TaskRunner. The previous version first built a
# StandardTranslator-based runner and immediately replaced it; only the
# NatLangTranslator runner was ever used. Swap in StandardTranslator() here to
# run with the standard translator instead.
translator = NatLangTranslator()

# Flag used by the result-saving cells to choose between the "thinking" and
# "non_thinking" output directories.
thinking = False

# Location of the Fast Downward checkout. setdefault keeps a value that is
# already exported in the environment, so the notebook also works on machines
# where this absolute path does not exist.
# NOTE(review): presumably read by the planning helpers - confirm.
os.environ.setdefault("FAST_DOWNWARD", "/home/leon/bachelor_arbeit/fast-downward-24.06/")

runner = TaskRunner(None, translator, context)

#import logging
#import sys
#logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler(sys.stdout)], force=True)


In [9]:
import json
import os

# Print success-rate summaries for the result files saved on disk.
results_dir = "results"
model = "Qwen/Qwen3-14B"
tasks = [
    "action_gen",
    "action_gen_single",
    "succ_state_mapping",
    "succ_state_mapping_seperate",
    "goal_state_verification",
    "novelty",
    "novelty_no_duplicates",
    "novelty_estimation",
    "novelty_estimation_no_duplicates",
    "problem_width_estimation"
]


def _format_rate(successes, total):
    """Render ``successes/total (xx.xx%)``; an empty set renders ``(n/a)`` instead of dividing by zero."""
    if total == 0:
        return f"{successes}/{total} (n/a)"
    return f"{successes}/{total} ({(successes/total)*100:.2f}%)"


def summarize_results(results_dir, model, tasks,
                      translators=("standard", "nat_lang"),
                      thinking_modes=("non_thinking", "thinking")):
    """Print one success-rate line per (task, translator, thinking mode) result file.

    Files are looked up at
    ``<results_dir>/<model>/<thinking mode>/<translator>/results_<task>.json``;
    a missing file is reported and skipped. Each file must hold a JSON list of
    objects with a ``success`` field (and a ``task_id`` field for
    ``action_gen_single``).

    The loop runs inside a function so its variables do not overwrite the
    notebook-level ``translator`` object and ``thinking`` flag that the
    result-saving cells rely on.
    """
    for task in tasks:
        for translator_name in translators:
            for thinking_mode in thinking_modes:
                file_path = os.path.join(results_dir, model, thinking_mode, translator_name, f"results_{task}.json")
                if not os.path.exists(file_path):
                    print(f"File {file_path} does not exist.")
                    continue
                with open(file_path, "r") as f:
                    results = json.load(f)
                prefix = f"Model: {model}, Translator: {translator_name}, Thinking: {thinking_mode}, Task: {task} -> "
                if task == "action_gen_single":
                    # Separate the "valid" entries from the "optimal" ones via their task_id.
                    results_valid = [result for result in results if "optimal" not in result["task_id"]]
                    results_optimal = [result for result in results if "optimal" in result["task_id"]]
                    successes_valid = sum(1 for r in results_valid if r["success"])
                    successes_optimal = sum(1 for r in results_optimal if r["success"])
                    print(prefix + "Valid Success Rate: " + _format_rate(successes_valid, len(results_valid)))
                    print(prefix + "Optimal Success Rate: " + _format_rate(successes_optimal, len(results_optimal)))
                    continue
                if task == "succ_state_mapping_seperate":
                    # only use first 50 results
                    results = results[:50]
                successes = sum(1 for r in results if r["success"])
                print(prefix + "Success Rate: " + _format_rate(successes, len(results)))


summarize_results(results_dir, model, tasks)


Model: Qwen/Qwen3-14B, Translator: standard, Thinking: non_thinking, Task: action_gen -> Success Rate: 1/50 (2.00%)
Model: Qwen/Qwen3-14B, Translator: standard, Thinking: thinking, Task: action_gen -> Success Rate: 50/50 (100.00%)
Model: Qwen/Qwen3-14B, Translator: nat_lang, Thinking: non_thinking, Task: action_gen -> Success Rate: 1/50 (2.00%)
Model: Qwen/Qwen3-14B, Translator: nat_lang, Thinking: thinking, Task: action_gen -> Success Rate: 48/50 (96.00%)
Model: Qwen/Qwen3-14B, Translator: standard, Thinking: non_thinking, Task: action_gen_single -> Valid Success Rate: 39/50 (78.00%)
Model: Qwen/Qwen3-14B, Translator: standard, Thinking: non_thinking, Task: action_gen_single -> Optimal Success Rate: 27/50 (54.00%)
Model: Qwen/Qwen3-14B, Translator: standard, Thinking: thinking, Task: action_gen_single -> Valid Success Rate: 45/50 (90.00%)
Model: Qwen/Qwen3-14B, Translator: standard, Thinking: thinking, Task: action_gen_single -> Optimal Success Rate: 41/50 (82.00%)
Model: Qwen/Qwen3-1

In [10]:
import json
import os

# Print the engine statistics stored for each task, then their totals.
engine_stats_dir = "task_engine_stats"
model = "Qwen/Qwen3-14B"
tasks = [
    "action_gen",
    "action_gen_single",
    "succ_state_mapping",
    "succ_state_mapping_seperate",
    "goal_state_verification",
    "novelty",
    "novelty_no_duplicates",
    "novelty_estimation",
    "novelty_estimation_no_duplicates",
]


def load_engine_stats(stats_dir, model, tasks,
                      translators=("standard", "nat_lang"),
                      thinking_modes=("non_thinking", "thinking")):
    """Load and print every available stats file; return the loaded dicts in visit order.

    Files are looked up at
    ``<stats_dir>/<model>/<thinking mode>/<translator>/stats_<task>.json``;
    a missing file is reported and skipped.

    The loop runs inside a function so its variables do not overwrite the
    notebook-level ``translator`` object and ``thinking`` flag that the
    result-saving cells rely on.
    """
    collected = []
    for task in tasks:
        for translator_name in translators:
            for thinking_mode in thinking_modes:
                file_path = os.path.join(stats_dir, model, thinking_mode, translator_name, f"stats_{task}.json")
                if os.path.exists(file_path):
                    with open(file_path, "r") as f:
                        stats = json.load(f)
                    print(f"Model: {model}, Translator: {translator_name}, Thinking: {thinking_mode}, Task: {task} -> Stats:")
                    print(stats)
                    collected.append(stats)
                else:
                    print(f"File {file_path} does not exist.")
    return collected


def sum_numeric_stats(stats_dicts):
    """Add up, key by key, every int/float value found in the given dicts; other values are ignored."""
    totals = {}
    for stats in stats_dicts:
        for key, value in stats.items():
            if isinstance(value, (int, float)):
                totals[key] = totals.get(key, 0) + value
    return totals


stats_list = load_engine_stats(engine_stats_dir, model, tasks)
total_stats = sum_numeric_stats(stats_list)

print("Total stats across all tasks:")
print(total_stats)


Model: Qwen/Qwen3-14B, Translator: standard, Thinking: non_thinking, Task: action_gen -> Stats:
{'total_token_usage': 748233, 'num_early_stops': 0, 'num_max_context_exceeded': 0, 'num_max_retries_exceeded': 0, 'num_retries': 0, 'num_truncated_queries': 0}
Model: Qwen/Qwen3-14B, Translator: standard, Thinking: thinking, Task: action_gen -> Stats:
{'total_token_usage': 1145377, 'num_early_stops': 0, 'num_max_context_exceeded': 0, 'num_max_retries_exceeded': 0, 'num_retries': 0, 'num_truncated_queries': 0}
Model: Qwen/Qwen3-14B, Translator: nat_lang, Thinking: non_thinking, Task: action_gen -> Stats:
{'total_token_usage': 832060, 'num_early_stops': 0, 'num_max_context_exceeded': 0, 'num_max_retries_exceeded': 0, 'num_retries': 0, 'num_truncated_queries': 0}
Model: Qwen/Qwen3-14B, Translator: nat_lang, Thinking: thinking, Task: action_gen -> Stats:
{'total_token_usage': 1272552, 'num_early_stops': 0, 'num_max_context_exceeded': 0, 'num_max_retries_exceeded': 0, 'num_retries': 0, 'num_trunc

In [3]:
results_action_gen_single = await runner.run(TaskType.ACTION_GEN_SINGLE, num_tasks=50)

num_correct_action_gen_single_valid = sum([result[0].success for result in results_action_gen_single])
num_correct_action_gen_single_optimal = sum([result[1].success for result in results_action_gen_single])

print(f"Action Generation Single: {num_correct_action_gen_single_valid}/{len(results_action_gen_single)} valid, {num_correct_action_gen_single_optimal}/{len(results_action_gen_single)} optimal")
    

DEBUG:openai._base_client:Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'user', 'content': 'I am playing with a set of blocks where I need to arrange the blocks into stacks. Here are the actions I can do\n\nPick up a block\nUnstack a block from on top of another block\nPut down a block\nStack a block on top of another block\n\nI have the following restrictions on my actions:\nI can only pick up or unstack one block at a time.\nI can only pick up or unstack a block if my hand is empty.\nI can only pick up a block if the block is on the table and the block is clear. A block is clear if the block has no other blocks on top of it and if the block is not picked up.\nI can only unstack a block from on top of another block if the block I am unstacking was really on top of the other block.\nI can only unstack a block from on top of another block if the block I am unstacking is clear.\nOnce I pick up or unstack a block, I am h

DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.anyio.AnyIOStream object at 0x7feea6f97220>
DEBUG:httpcore.connection:start_tls.started ssl_context=<ssl.SSLContext object at 0x7feea74e7d40> server_hostname='fitting-shrimp-pleasantly.ngrok-free.app' timeout=5.0
DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.anyio.AnyIOStream object at 0x7feea6f39720>
DEBUG:httpcore.connection:start_tls.started ssl_context=<ssl.SSLContext object at 0x7feea74e7d40> server_hostname='fitting-shrimp-pleasantly.ngrok-free.app' timeout=5.0
DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.anyio.AnyIOStream object at 0x7feea6fa8490>
DEBUG:httpcore.connection:start_tls.started ssl_context=<ssl.SSLContext object at 0x7feea74e7d40> server_hostname='fitting-shrimp-pleasantly.ngrok-free.app' timeout=5.0
DEBUG:httpcore.connection:connect_tcp.complete return_value=<httpcore._backends.anyio.AnyIOStream object at 0x7feea6fa84c

In [4]:
from task_tester import Result
import json

def log_results(results: "list[Result]", model: str, task_type: "TaskType", thinking: bool = False, translator: "StandardTranslator | NatLangTranslator | None" = None):
    """Write ``results`` as JSON to ``results/<model>/<thinking|non_thinking>/<translator.type>/``.

    Args:
        results: Objects exposing ``model_dump()``; one JSON entry is written per object.
        model: Model identifier used as a path component (a ``/`` in it creates nested directories).
        task_type: Its lower-cased ``name`` becomes the file name ``results_<name>.json``.
        thinking: Selects the ``thinking`` or ``non_thinking`` sub-directory.
        translator: Its ``type`` attribute names the innermost directory. When omitted,
            a fresh ``StandardTranslator()`` is created per call rather than sharing
            one instance built at definition time.

    The annotations are quoted so that defining this function does not require
    the annotated classes to be bound yet.
    """
    if translator is None:
        translator = StandardTranslator()
    thinking_dir = "thinking" if thinking else "non_thinking"
    results_dir = os.path.join("results", model, thinking_dir, translator.type)
    # exist_ok avoids the check-then-create race and makes re-runs a no-op.
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, f"results_{task_type.name.lower()}.json"), "w") as f:
        json.dump([result.model_dump() for result in results], f, indent=4)

# Flatten the per-task groups into one list, keeping their order, before logging.
flat_results = []
for sublist in results_action_gen_single:
    flat_results.extend(sublist)

log_results(flat_results, model, TaskType.ACTION_GEN_SINGLE, thinking, translator)

In [None]:

# Run 10 ACTION_GEN tasks through the runner and keep what it returns for the
# saving and counting cells further down.
results_action_gen = await runner.run(TaskType.ACTION_GEN, num_tasks=10)

In [None]:
# Run a single SUCC_STATE_MAPPING_SEPERATE task. The next cell iterates over
# element 0 of the returned value, i.e. each task yields a group of results.
results_succ_state_mapping = await runner.run(TaskType.SUCC_STATE_MAPPING_SEPERATE, num_tasks=1)

In [None]:
# Show the outcome of every result belonging to the first task only.
first_task_results = results_succ_state_mapping[0]
for step_result in first_task_results:
    print(f"Successor State Mapping: {step_result.success}")

# Then dump the complete structure for inspection.
print(results_succ_state_mapping)

In [None]:
# Run 10 GOAL_STATE_VERIFICATION tasks and keep what the runner returns for the
# saving and counting cells further down.
results_goal_state_ver = await runner.run(TaskType.GOAL_STATE_VERIFICATION, num_tasks=10)

In [None]:
import json
import os
# Save every result set through log_results so the files land where the summary
# cell looks for them: results/<model>/<thinking dir>/<translator.type>/
# results_<task type name, lower-cased>.json.
# Previously this cell wrote "results_goal_state_ver.json" (never read by the
# summary) and stored the SUCC_STATE_MAPPING_SEPERATE run under the name of the
# plain succ_state_mapping task.
log_results(results_action_gen, model, TaskType.ACTION_GEN, thinking, translator)

# The SUCC_STATE_MAPPING_SEPERATE run returns one group of results per task
# (the inspection cell iterates over element 0), so flatten it before dumping;
# calling model_dump() on the groups themselves would fail.
flat_succ_state_mapping = [
    result for task_results in results_succ_state_mapping for result in task_results
]
log_results(flat_succ_state_mapping, model, TaskType.SUCC_STATE_MAPPING_SEPERATE, thinking, translator)

log_results(results_goal_state_ver, model, TaskType.GOAL_STATE_VERIFICATION, thinking, translator)


In [None]:
# The SUCC_STATE_MAPPING_SEPERATE run returns one group of results per task
# (the inspection cell iterates over element 0), so flatten it first; reading
# `.success` on the groups themselves would fail.
flat_succ_state_mapping = [
    result for task_results in results_succ_state_mapping for result in task_results
]

num_correct_action_gen = sum(result.success for result in results_action_gen)
num_correct_succ_state_mapping = sum(result.success for result in flat_succ_state_mapping)
num_correct_goal_state_ver = sum(result.success for result in results_goal_state_ver)

print(f"Action Generation: {num_correct_action_gen}/{len(results_action_gen)} correct")
# Numerator and denominator both count individual results; the label now
# matches the "Successor State Mapping" wording used by the inspection cell.
print(f"Successor State Mapping: {num_correct_succ_state_mapping}/{len(flat_succ_state_mapping)} correct")
print(f"Goal State Verification: {num_correct_goal_state_ver}/{len(results_goal_state_ver)} correct")

In [None]:
from plan_gen import compute_optimal_plan

# Compute and print a plan for the first Blocksworld instance.
# Note: despite its name, `instance_dir` holds the path of a single .pddl file.
blocksworld_root = 'test_data/PDDL/Blocksworld'
domain_path = f'{blocksworld_root}/domain.pddl'
instance_dir = f'{blocksworld_root}/instances/instance-1.pddl'

plan = compute_optimal_plan(domain_path, instance_dir)
print(plan)

In [None]:
import os
from task_tester import TaskRunner, TaskType
from pddl_translation.translator import StandardTranslator, NatLangTranslator

# Blocksworld domain description passed to the TaskRunner as `context`.
# Written as adjacent string literals so every rule sits on its own source line;
# the concatenated value is unchanged.
context = (
    "I am playing with a set of blocks where I need to arrange the blocks into stacks. "
    "Here are the actions I can do\n\n"
    "Pick up a block\n"
    "Unstack a block from on top of another block\n"
    "Put down a block\n"
    "Stack a block on top of another block\n\n"
    "I have the following restrictions on my actions:\n"
    "I can only pick up or unstack one block at a time.\n"
    "I can only pick up or unstack a block if my hand is empty.\n"
    "I can only pick up a block if the block is on the table and the block is clear. "
    "A block is clear if the block has no other blocks on top of it and if the block is not picked up.\n"
    "I can only unstack a block from on top of another block "
    "if the block I am unstacking was really on top of the other block.\n"
    "I can only unstack a block from on top of another block if the block I am unstacking is clear.\n"
    "Once I pick up or unstack a block, I am holding the block.\n"
    "I can only put down a block that I am holding.\n"
    "I can only stack a block on top of another block if I am holding the block being stacked.\n"
    "I can only stack a block on top of another block "
    "if the block onto which I am stacking the block is clear.\n"
    "Once I put down or stack a block, my hand becomes empty.\n"
    "Once you stack a block on top of a second block, the second block is no longer clear."
)
# Debugging variant of the runner setup: StandardTranslator plus DEBUG logging.
translator = StandardTranslator()

# Flag used by the result-saving cells to choose between the "thinking" and
# "non_thinking" output directories.
thinking = False

# Location of the Fast Downward checkout. setdefault keeps a value that is
# already exported in the environment, so the notebook also works on machines
# where this absolute path does not exist.
# NOTE(review): presumably read by the planning helpers - confirm.
os.environ.setdefault("FAST_DOWNWARD", "/home/leon/bachelor_arbeit/fast-downward-24.06/")
runner = TaskRunner(None, translator, context)

import logging
import sys
# force=True replaces handlers that are already installed on the root logger, so
# DEBUG records from every library are written to stdout (the cell output).
logging.basicConfig(level=logging.DEBUG, handlers=[logging.StreamHandler(sys.stdout)], force=True)
