Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# TARS API
TARS_URL=
TARS_API_KEY=

# Anthropic API
ANTHROPIC_API_KEY=
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ To get started, set the environment variables required to authenticate with your
</details>


### 3b. Test with 🤗 Hugging Face Model Demos
### 3b. Test with 🤗 Hugging Face Spaces API

You can test the Vision Agent with Hugging Face models via their Spaces API. Please note that the API is rate-limited, so for production use cases it is recommended to choose step 3a.

Expand All @@ -122,6 +122,18 @@ You can test the Vision Agent with Hugging Face models via their Spaces API. Ple
agent.click("search field", model_name="OS-Copilot/OS-Atlas-Base-7B")
```

### 3c. Custom Model Implementations

#### UI-TARS

You can use Vision Agent with UI-TARS if you provide your own UI-TARS API endpoint.

1. Host the model locally or in the cloud. More information about hosting UI-TARS can be found [here](https://github.com/bytedance/UI-TARS?tab=readme-ov-file#deployment).

2. Provide the `TARS_URL` and `TARS_API_KEY` environment variables to Vision Agent.

3. Use the `model_name="tars"` parameter in your `click()`, `get()` and `act()` commands.


## ▶️ Start Building

Expand Down
147 changes: 145 additions & 2 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 14 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "askui"
version = "0.1.11"
version = "0.1.12"
description = "Automate computer tasks in Python"
authors = [
{name = "askui GmbH", email = "info@askui.com"},
Expand All @@ -19,6 +19,7 @@ dependencies = [
"pydantic-settings>=2.7.0",
"tenacity>=9.0.0",
"python-dateutil>=2.9.0.post0",
"openai>=1.61.1",
]
requires-python = ">=3.10"
readme = "README.md"
Expand All @@ -31,10 +32,18 @@ build-backend = "hatchling.build"
[tool.pdm]
distribution = true

[tool.pdm.dev-dependencies]
[tool.pdm.scripts]
test = "pytest"
sort = "isort ."
format = "black ."
lint = "ruff check ."
checks = {composite = ["sort . -c", "format . --check"]}
fix = {composite = ["sort", "format", "lint"]}

[dependency-groups]
test = [
"pytest>=8.3.4",
"isort>=6.0.0",
"black>=25.1.0",
"ruff>=0.9.5",
]

[tool.pdm.scripts]
test = "pytest"
14 changes: 7 additions & 7 deletions src/askui/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
MODIFIER_KEY,
)
from .models.anthropic.claude import ClaudeHandler
from .models.anthropic.claude_agent import ClaudeComputerAgent
from .logging import logger, configure_logging
from .tools.toolbox import AgentToolbox
from .models.router import ModelRouter
from .reporting.report import SimpleReportGenerator
import time
from dotenv import load_dotenv


class InvalidParameterError(Exception):
Expand All @@ -30,6 +30,7 @@ def __init__(
enable_report: bool = False,
enable_askui_controller: bool = True,
):
load_dotenv()
configure_logging(level=log_level)
self.report = None
if enable_report:
Expand All @@ -42,7 +43,7 @@ def __init__(
self.client = AskUiControllerClient(display, self.report)
self.client.connect()
self.client.set_display(display)
self.model_router = ModelRouter(log_level)
self.model_router = ModelRouter(log_level, self.report)
self.claude = ClaudeHandler(log_level=log_level)
self.tools = AgentToolbox(os_controller=self.client)

Expand Down Expand Up @@ -101,13 +102,13 @@ def type(self, text: str) -> None:
logger.debug("VisionAgent received instruction to type '%s'", text)
self.client.type(text) # type: ignore

def get(self, instruction: str) -> str:
def get(self, instruction: str, model_name: Optional[str] = None) -> str:
self._check_askui_controller_enabled()
if self.report is not None:
self.report.add_message("User", f'get: "{instruction}"')
logger.debug("VisionAgent received instruction to get '%s'", instruction)
screenshot = self.client.screenshot() # type: ignore
response = self.claude.get_inference(screenshot, instruction)
response = self.model_router.get_inference(screenshot, instruction, model_name)
if self.report is not None:
self.report.add_message("Agent", response)
return response
Expand Down Expand Up @@ -162,15 +163,14 @@ def key_down(self, key: PC_AND_MODIFIER_KEY):
self.report.add_message("User", f'key_down "{key}"')
self.client.keyboard_pressed(key)

def act(self, goal: str) -> None:
def act(self, goal: str, model_name: Optional[str] = None) -> None:
self._check_askui_controller_enabled()
if self.report is not None:
self.report.add_message("User", f'act: "{goal}"')
logger.debug(
"VisionAgent received instruction to act towards the goal '%s'", goal
)
agent = ClaudeComputerAgent(self.client, self.report)
agent.run(goal)
self.model_router.act(self.client, goal, model_name)

def keyboard(
self, key: PC_AND_MODIFIER_KEY, modifier_keys: list[MODIFIER_KEY] | None = None
Expand Down
Loading