# 演示 ByzerStorageCache 的使用

本演示展示了如何使用 ByzerStorageCache 进行文档的存储和检索。

In [1]:
import os
from autocoder.rag.cache.byzer_storage_cache import ByzerStorageCache
from autocoder.common import SourceCode

## 1. 初始化 ByzerStorageCache

首先我们需要创建一个 ByzerStorageCache 实例。

In [2]:
# Example document root and filter rules.
# The root can be overridden with the BYZER_DEMO_DOC_PATH environment variable,
# so the notebook is not tied to one machine; the fallback is the original
# author's local checkout of the Ray documentation sources.
path = os.environ.get(
    "BYZER_DEMO_DOC_PATH", "/Users/allwefantasy/projects/ray/doc/source"
)
ignore_spec = None  # an ignore rule can be created with pathspec
required_exts = [".md", ".rst"]  # only files with these extensions are processed

# Create the ByzerStorageCache instance
cache = ByzerStorageCache(path, ignore_spec, required_exts)

[32m2024-10-23 20:46:25.714[0m | [1mINFO    [0m | [36mbyzerllm.utils.connect_ray[0m:[36mconnect_cluster[0m:[36m48[0m - [1mJDK 21 will be used (/Users/allwefantasy/.auto-coder/jdk-21.0.2.jdk/Contents/Home)...[0m
2024-10-23 20:46:25,853	INFO worker.py:1564 -- Connecting to existing Ray cluster at address: 127.0.0.1:6379...
2024-10-23 20:46:25,945	INFO worker.py:1740 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


21


## 2. 构建缓存

In [3]:
# Build the cache for the documents under `path`; the log output below shows
# each file being loaded and processed.
cache.build_cache()


[32m2024-10-23 20:46:34.812[0m | [1mINFO    [0m | [36mautocoder.rag.cache.byzer_storage_cache[0m:[36mbuild_cache[0m:[36m110[0m - [1mBuilding cache for path: /Users/allwefantasy/projects/ray/doc/source[0m
[32m2024-10-23 20:46:34.949[0m | [31m[1mERROR   [0m | [36mautocoder.rag.token_counter[0m:[36mcount_tokens_worker[0m:[36m50[0m - [31m[1mError counting tokens: name 'tokenizer_model' is not defined[0m
[32m2024-10-23 20:46:34.950[0m | [1mINFO    [0m | [36mautocoder.rag.utils[0m:[36mprocess_file_in_multi_process[0m:[36m68[0m - [1mLoad file /Users/allwefantasy/projects/ray/doc/source/_includes/_help.rst in 0.001455068588256836[0m
[32m2024-10-23 20:46:34.951[0m | [1mINFO    [0m | [36mautocoder.rag.cache.byzer_storage_cache[0m:[36mbuild_cache[0m:[36m129[0m - [1mProcessing file: ##File: /Users/allwefantasy/projects/ray/doc/source/_includes/_help.rst[0m
[32m2024-10-23 20:46:34.953[0m | [31m[1mERROR   [0m | [36mautocoder.rag.token_counter[

[33m(raylet)[0m Oct 23, 2024 8:51:50 PM org.apache.lucene.internal.vectorization.PanamaVectorizationProvider <init>
[33m(raylet)[0m INFO: Java vector incubator API enabled; uses preferredBitSize=256


## 3. 搜索缓存内容

In [5]:
# Search options: the query text, with vector search and text search both enabled
search_options = dict(
    query="Ray Actor 创建",
    enable_vector_search=True,
    enable_text_search=True,
)

# Run the search and materialise everything it yields into a list
search_results = [*cache.get_cache(search_options)]

# For every hit, print its module name and the first 100 characters of its content
for doc in search_results:
    print(f"\nFound in: {doc.module_name}")
    print(f"Content preview: {doc.source_code[:100]}...")

[32m2024-10-23 20:52:15.996[0m | [1mINFO    [0m | [36mautocoder.rag.cache.byzer_storage_cache[0m:[36mupdate_cache[0m:[36m199[0m - [1mChecking for file updates...[0m
[32m2024-10-23 20:52:16.101[0m | [1mINFO    [0m | [36mautocoder.rag.cache.byzer_storage_cache[0m:[36mupdate_cache[0m:[36m266[0m - [1mNo file updates found[0m



Found in: ##File: /Users/allwefantasy/projects/ray/doc/source/cluster/kubernetes/user-guides/configuring-autoscaling.md
Content preview: (kuberay-autoscaling)=

# KubeRay Autoscaling

This guide explains how to configure the Ray Autoscal...

Found in: ##File: /Users/allwefantasy/projects/ray/doc/source/ray-observability/user-guides/debug-apps/debug-memory.rst
Content preview: .. _ray-core-mem-profiling:

Debugging Memory Issues


.. _troubleshooting-o...

Found in: ##File: /Users/allwefantasy/projects/ray/doc/source/ray-core/actors/terminating-actors.rst
Content preview: Terminating Actors

Actor processes will be terminated automatically when all cop...

Found in: ##File: /Users/allwefantasy/projects/ray/doc/source/ray-core/actors/task-orders.rst
Content preview: .. _actor-task-order:

Actor Task Execution Order

Synchronous, Single-Th...

Found in: ##File: /Users/allwefantasy/projects/ray/doc/source/ray-observability/user-guides/debug-apps/debug-failures.rst
Content preview: .. _obs

## 说明

1. ByzerStorageCache 会自动将文档分块并存储在 Byzer Storage 中
2. 支持向量搜索和全文检索
3. 文档会被自动向量化以支持语义搜索
4. 缓存会在第一次访问时自动构建