From 7988a24c87d582235ebe1773764febbe50a5fc7a Mon Sep 17 00:00:00 2001
From: Nikhil
Date: Thu, 16 Mar 2023 08:27:41 -0400
Subject: [PATCH 1/2] Add tokenizer utils

---
 .gitignore               |  3 +++
 README.md                |  1 +
 anthropic/__init__.py    |  1 +
 anthropic/tokenizer.py   | 20 ++++++++++++++++++++
 examples/count_tokens.py |  9 +++++++++
 5 files changed, 34 insertions(+)
 create mode 100644 anthropic/tokenizer.py
 create mode 100644 examples/count_tokens.py

diff --git a/.gitignore b/.gitignore
index 62fe86da..815aff97 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@
 __pycache__
 .DS_Store
 **/.DS_Store
+
+anthropic.egg-info/
+build/
diff --git a/README.md b/README.md
index de62730a..faff2565 100644
--- a/README.md
+++ b/README.md
@@ -11,4 +11,5 @@ pip install .
 export ANTHROPIC_API_KEY=
 python examples/basic_sync.py
 python examples/basic_stream.py
+python examples/count_tokens.py
 ```
diff --git a/anthropic/__init__.py b/anthropic/__init__.py
index 0b17d8ba..e26dbf4c 100644
--- a/anthropic/__init__.py
+++ b/anthropic/__init__.py
@@ -1,2 +1,3 @@
 from .api import Client
 from .constants import HUMAN_PROMPT, AI_PROMPT, ANTHROPIC_CLIENT_VERSION
+from .tokenizer import count_tokens, get_tokenizer
diff --git a/anthropic/tokenizer.py b/anthropic/tokenizer.py
new file mode 100644
index 00000000..db429cef
--- /dev/null
+++ b/anthropic/tokenizer.py
@@ -0,0 +1,20 @@
+import requests
+from tokenizers import Tokenizer
+
+CLAUDE_TOKENIZER_REMOTE_FILE = "https://public-json-tokenization-0d8763e8-0d7e-441b-a1e2-1c73b8e79dc3.storage.googleapis.com/claude-v1-tokenization.json"
+
+claude_tokenizer = None
+
+def get_tokenizer() -> Tokenizer:
+    global claude_tokenizer
+
+    if not claude_tokenizer:
+        tokenizer_data = requests.get(CLAUDE_TOKENIZER_REMOTE_FILE)
+        claude_tokenizer = Tokenizer.from_str(tokenizer_data.text)
+
+    return claude_tokenizer
+
+def count_tokens(text: str) -> int:
+    tokenizer = get_tokenizer()
+    encoded_text = tokenizer.encode(text)
+    return len(encoded_text.ids)
diff --git a/examples/count_tokens.py b/examples/count_tokens.py
new file mode 100644
index 00000000..17630a40
--- /dev/null
+++ b/examples/count_tokens.py
@@ -0,0 +1,9 @@
+import anthropic
+
+def main(sample_str: str = "Hello world!"):
+    num_tokens = anthropic.count_tokens(sample_str)
+    print(f"Number of tokens: {num_tokens}")
+
+
+if __name__ == "__main__":
+    main()

From f7f8933c2b9ba523d2a7be491ac595167578daba Mon Sep 17 00:00:00 2001
From: Nikhil
Date: Thu, 16 Mar 2023 11:12:43 -0400
Subject: [PATCH 2/2] Add file caching and use httpx

---
Review notes (below the three-dash line, so not part of the commit message):
- The download now calls raise_for_status(), so an HTTP error body is never
  cached as if it were the tokenizer definition.
- The cache file is written to a scratch file and renamed into place, so an
  interrupted download cannot leave a truncated cache behind.
- The cache file is written and read as UTF-8 explicitly.
- get_tokenizer() tests "is None" instead of truthiness; docstrings added.
- The commit id above and the post-image blob id on the tokenizer.py index
  line predate these amendments.

 anthropic/tokenizer.py | 47 +++++++++++++++++++++++++++++++++++++++++++----
 pyproject.toml         |  6 ++++--
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/anthropic/tokenizer.py b/anthropic/tokenizer.py
index db429cef..8b1fdcf8 100644
--- a/anthropic/tokenizer.py
+++ b/anthropic/tokenizer.py
@@ -1,20 +1,59 @@
-import requests
+import os
+import tempfile
+
+import httpx
 from tokenizers import Tokenizer
 
 CLAUDE_TOKENIZER_REMOTE_FILE = "https://public-json-tokenization-0d8763e8-0d7e-441b-a1e2-1c73b8e79dc3.storage.googleapis.com/claude-v1-tokenization.json"
 
 claude_tokenizer = None
 
+def _get_cached_tokenizer_file_as_str() -> str:
+    """Return the tokenizer JSON, downloading it into a local cache on first use.
+
+    The definition is stored under ``<system temp dir>/anthropic`` so later
+    processes can skip the network round trip.
+
+    Raises:
+        httpx.HTTPStatusError: if the download returns a 4xx/5xx response.
+    """
+    cache_dir = os.path.join(tempfile.gettempdir(), "anthropic")
+    tokenizer_file = os.path.join(cache_dir, 'claude_tokenizer_file.json')
+
+    if not os.path.exists(tokenizer_file):
+        os.makedirs(cache_dir, exist_ok=True)
+        response = httpx.get(CLAUDE_TOKENIZER_REMOTE_FILE)
+        # Fail loudly rather than caching an HTTP error page as the tokenizer.
+        response.raise_for_status()
+
+        # Write to a per-process scratch file and rename it into place so an
+        # interrupted download never leaves a truncated cache file behind.
+        scratch_file = f"{tokenizer_file}.{os.getpid()}.tmp"
+        try:
+            with open(scratch_file, 'w', encoding='utf-8') as f:
+                f.write(response.text)
+            os.replace(scratch_file, tokenizer_file)
+        finally:
+            # Only still present when the write or rename failed.
+            if os.path.exists(scratch_file):
+                os.remove(scratch_file)
+
+    # Explicit encoding: the platform default may not be UTF-8.
+    with open(tokenizer_file, 'r', encoding='utf-8') as f:
+        return f.read()
+
 def get_tokenizer() -> Tokenizer:
+    """Return the shared Claude tokenizer, building it on the first call."""
     global claude_tokenizer
 
-    if not claude_tokenizer:
-        tokenizer_data = requests.get(CLAUDE_TOKENIZER_REMOTE_FILE)
-        claude_tokenizer = Tokenizer.from_str(tokenizer_data.text)
+    if claude_tokenizer is None:
+        tokenizer_data = _get_cached_tokenizer_file_as_str()
+        claude_tokenizer = Tokenizer.from_str(tokenizer_data)
 
     return claude_tokenizer
 
 def count_tokens(text: str) -> int:
+    """Return the number of tokens the Claude tokenizer produces for ``text``."""
     tokenizer = get_tokenizer()
     encoded_text = tokenizer.encode(text)
     return len(encoded_text.ids)
diff --git a/pyproject.toml b/pyproject.toml
index 4de3bbb6..a0e9b09c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,8 +21,10 @@ classifiers = [
     "Operating System :: OS Independent"
 ]
 dependencies = [
-    "requests",
+    "httpx",
     'importlib-metadata; python_version<"3.8"',
+    "requests",
+    "tokenizers",
 ]
 
 [project.urls]
@@ -35,4 +37,4 @@ repository = "https://github.com/anthropics/anthropic-sdk-python.git"
 dev = [
     "black >= 22.3.0",
     "pytest",
-]
\ No newline at end of file
+]