# BRICK 实体检索示例（模糊检索 + 混合检索）

这份 Notebook 演示如何通过 `entity_index` 包调用字符串通道的模糊检索与混合检索流程，方便图谱、Agent 等其他模块快速复用现有索引能力。

## 环境准备

- 运行前需完成实体索引的构建与写入，确保 Elasticsearch 中存在字符串/向量索引。
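- `.env` 或外部环境变量需提供 `ES_*`、`EMBEDDING_*`、`HYBRID_*` 等配置；特别注意 `HYBRID_TYPE_MIX` 应为 JSON 字典字符串。若设置了 `ES_CONFIG`，它同样必须是合法的 JSON 字符串，否则会出现 “Invalid ES_CONFIG JSON” 提示并回退到单独的环境变量。密码等凭据请通过环境变量提供，不要直接写入 Notebook。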
- Notebook 建议放置在 `entity_index/` 目录下执行，如在其他路径运行，请调整 `project_root`。

In [1]:

# -*- coding: utf-8 -*-
from __future__ import annotations

import json
import os
import sys
from pathlib import Path

# Put the parent of the current working directory on sys.path so the
# `entity_index` package can be imported directly from this notebook.
project_root = Path.cwd().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))


def load_env_file(env_file: Path) -> dict[str, str]:
    """Load ``KEY=VALUE`` lines from *env_file* into ``os.environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` (or with an
    empty key) are skipped. Whitespace around the key and the value is
    removed, and one pair of matching surrounding quotes is stripped from
    the value, so ``KEY = "v"`` and ``KEY='{"a": 1}'`` behave as expected.
    Variables that are already set in the environment are NOT overridden.

    Args:
        env_file: Path to the dotenv-style file; a missing file is a no-op.

    Returns:
        Every key/value pair parsed from the file, whether or not it
        ended up being applied to ``os.environ``.
    """
    parsed: dict[str, str] = {}
    env_file = Path(env_file)
    if not env_file.exists():
        return parsed

    for raw_line in env_file.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, value = line.partition("=")
        key = key.strip()
        if not separator or not key:
            continue
        value = value.strip()
        # Drop one pair of matching quotes: KEY="v" / KEY='v' -> v
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        # setdefault keeps externally provided variables authoritative.
        os.environ.setdefault(key, value)
        parsed[key] = value
    return parsed


# Optional: read variables from .env for local debugging (deployments can
# rely on the external environment instead).
env_path = project_root / ".env"
load_env_file(env_path)

# For the demo: when no type weights were configured explicitly, install a
# default JSON value. An already-set HYBRID_TYPE_MIX is left untouched.
if "HYBRID_TYPE_MIX" not in os.environ:
    default_type_mix = dict([
        ("Gene|Protein", 0.25),
        ("Disease|Phenotype", 0.25),
        ("Process|Function|Pathway|Cell_Component", 0.20),
        ("Chemical", 0.10),
        ("Species", 0.10),
        ("Cell|Tissue", 0.10),
        ("Mutation", 0.0),
    ])
    os.environ["HYBRID_TYPE_MIX"] = json.dumps(default_type_mix)

import pandas as pd

from entity_index.search.settings import get_search_config
from entity_index.search import adapters, string_client
from entity_index.search.hybrid_searcher import HybridEntitySearcher
from entity_index.search.schema import HYBRID_TYPE_KEYS

# Resolve the search configuration and create an Elasticsearch client from it.
search_config = get_search_config()
es_client = search_config.es.create_client()

# Report the index names in use; a falsy vector index name is shown as
# "已禁用" ("disabled").
string_index_label = search_config.string_index_name
print(f"字符串索引: {string_index_label}")
vector_index_label = search_config.vector_index_name or "已禁用"
print(f"向量索引: {vector_index_label}")


Invalid ES_CONFIG JSON, falling back to individual environment variables.


字符串索引: brick_index_*_string
向量索引: brick_index_*_vector


In [2]:
from entity_index.search.settings import get_search_config
# Re-read the configuration and show the effective ES host and string index.
cfg = get_search_config()
print(f"ES host: {cfg.es.host}")
print(f"string index: {cfg.string_index_name}")


Invalid ES_CONFIG JSON, falling back to individual environment variables.


ES host: http://localhost:9200
string index: brick_index_*_string


In [3]:
import getpass
import json
import os

# Never commit the cluster password: take it from the ES_PASSWORD environment
# variable, or prompt for it interactively when the variable is not set.
es_password = os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password: ")

# ES_CONFIG is passed as a JSON string describing the remote cluster.
os.environ["ES_CONFIG"] = json.dumps({
    "hosts": ["http://es-cn-uax3fxy8w000e2tuf.elasticsearch.aliyuncs.com:9200"],
    "username": "elastic",
    "password": es_password,
})
# NOTE(review): with this prefix the recorded output is `brick_index_string`,
# and the recorded search cells then fail with index_not_found (404) for that
# name. A wildcard prefix such as "brick_index_*" may be what is intended;
# confirm against the indices that actually exist on the cluster.
os.environ["HYBRID_ES_INDEX"] = "brick_index"

from entity_index.search.settings import get_search_config

# Rebuild the configuration and client so they pick up the variables set here.
search_config = get_search_config()
es_client = search_config.es.create_client()
print(search_config.string_index_name)


brick_index_string


In [4]:
# Smoke-test the remote connection directly with elasticsearch-py.
import getpass
import os

from elasticsearch import Elasticsearch

es_test = Elasticsearch(
    "http://es-cn-uax3fxy8w000e2tuf.elasticsearch.aliyuncs.com:9200",
    # The password comes from ES_PASSWORD (or an interactive prompt) so that
    # no credential is stored in the notebook.
    basic_auth=(
        "elastic",
        os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password: "),
    ),
    verify_certs=True,
)
print(es_test.info())

# The connection works, so the database configuration itself is fine.



{'name': 'es-cn-uax3fxy8w000e2tuf-data-f-2', 'cluster_name': 'es-cn-uax3fxy8w000e2tuf', 'cluster_uuid': 'm2lo3WFsQKO5L6SAQHeOFg', 'version': {'number': '8.9.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '652eb44fa71049c1fcdc08c967ef9f751d986ece', 'build_date': '2023-08-18T06:51:36.666651565Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [None]:
import getpass
import json
import os

# With this prefix the string and vector channels point at
# brick_index_*_string / brick_index_*_vector.
os.environ["HYBRID_ES_INDEX"] = "brick_index_*"

# Never commit the cluster password: take it from the ES_PASSWORD environment
# variable, or prompt for it interactively when the variable is not set.
es_password = os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password: ")
os.environ["ES_CONFIG"] = json.dumps({
    "hosts": ["http://es-cn-uax3fxy8w000e2tuf.elasticsearch.aliyuncs.com:9200"],
    "username": "elastic",
    "password": es_password,
})

from entity_index.search.settings import get_search_config
from entity_index.search.hybrid_searcher import HybridEntitySearcher
from entity_index.search import string_client

# Rebuild the configuration and client so they pick up the variables set here.
search_config = get_search_config()
es_client = search_config.es.create_client()
print(search_config.string_index_name)   # expected: brick_index_*_string
print(search_config.vector_index_name)   # expected: brick_index_*_vector


## 模糊检索（字符串通道）

此部分展示如何仅调用字符串索引完成模糊匹配，适用于别名、拼音等快速召回需求。

In [5]:

def redact_es_config(raw_config):
    """Return the ES_CONFIG JSON string with credential values masked.

    Printing the raw variable would write the cluster password into the
    notebook output, so values of credential-like keys are replaced by
    ``"***"`` before display.

    Args:
        raw_config: The raw ``ES_CONFIG`` value, or ``None`` when unset.

    Returns:
        ``None`` when *raw_config* is ``None``; a JSON string with masked
        credentials when it is a JSON object; otherwise a fixed placeholder
        (the raw text is not echoed because it could still hold a secret).
    """
    if raw_config is None:
        return None
    try:
        parsed = json.loads(raw_config)
    except (TypeError, ValueError):
        return "<unparseable ES_CONFIG>"
    if not isinstance(parsed, dict):
        return "<unparseable ES_CONFIG>"

    sensitive_markers = ("password", "secret", "token", "api_key")
    masked = {
        key: "***"
        if value and any(marker in str(key).lower() for marker in sensitive_markers)
        else value
        for key, value in parsed.items()
    }
    return json.dumps(masked)


# Show the effective ES_CONFIG without leaking the password.
print(redact_es_config(os.environ.get("ES_CONFIG")))
# Build the fuzzy-search payload and make sure every entity-type key from
# HYBRID_TYPE_KEYS is present.
payload_fuzzy = {
    "query_id": "demo-string-001",
    "options": {"top_k": 10, "return_diagnostics": True},
}
# Missing type keys get their own empty list; existing keys are not overwritten.
payload_fuzzy.update(
    {type_key: [] for type_key in HYBRID_TYPE_KEYS if type_key not in payload_fuzzy}
)

# Fill in candidate terms only for the types being queried; the rest stay empty.
payload_fuzzy.update({
    "Gene|Protein": ["EGFR", "表皮生长因子受体"],
    "Disease|Phenotype": ["非小细胞肺癌"],
})

payload_fuzzy


{"hosts": ["http://es-cn-uax3fxy8w000e2tuf.elasticsearch.aliyuncs.com:9200"], "username": "elastic", "password": "***"}


{'query_id': 'demo-string-001',
 'options': {'top_k': 10, 'return_diagnostics': True},
 'Gene|Protein': ['EGFR', '表皮生长因子受体'],
 'Mutation': [],
 'Chemical': [],
 'Disease|Phenotype': ['非小细胞肺癌'],
 'Process|Function|Pathway|Cell_Component': [],
 'Species': [],
 'Cell|Tissue': []}

In [6]:
import getpass
import os

import requests

# Plain HTTP probe of the cluster root endpoint using basic auth.
r = requests.get(
    "http://es-cn-uax3fxy8w000e2tuf.elasticsearch.aliyuncs.com:9200/",
    # The password comes from ES_PASSWORD (or an interactive prompt) so that
    # no credential is stored in the notebook.
    auth=(
        "elastic",
        os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password: "),
    ),
    # requests has no default timeout; without one an unreachable host can
    # block the cell indefinitely.
    timeout=10,
)
print(r.status_code, r.text)

200 {
  "name" : "es-cn-uax3fxy8w000e2tuf-data-f-2",
  "cluster_name" : "es-cn-uax3fxy8w000e2tuf",
  "cluster_uuid" : "m2lo3WFsQKO5L6SAQHeOFg",
  "version" : {
    "number" : "8.9.1",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "652eb44fa71049c1fcdc08c967ef9f751d986ece",
    "build_date" : "2023-08-18T06:51:36.666651565Z",
    "build_snapshot" : false,
    "lucene_version" : "9.7.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}



In [7]:

# Normalize the payload, then query the string channel only; per the original
# note the search returns a list of HybridHit objects.
normalized_fuzzy = adapters.normalize_payload(payload_fuzzy, search_config)
string_hits = string_client.search_string_channel(
    es_client, normalized_fuzzy, search_config.string_index_name
)

# Collect the main fields of each hit as one table row for display.
string_rows = []
for hit in string_hits:
    string_rows.append({
        "entity_id": hit.entity_id,
        "primary_name": hit.primary_name,
        "type_key": hit.type_key,
        "node_type": hit.node_type,
        # Falls back to 0.0 when the scores object has no string_score.
        "string_score": getattr(hit.scores, "string_score", 0.0),
        "matched_alias": hit.matched_alias,
    })
string_df = pd.DataFrame(string_rows)
string_df.head(search_config.top_k)


String search failed for type Gene|Protein on index brick_index_string: NotFoundError(404, 'index_not_found_exception', 'no such index [brick_index_string]', brick_index_string, index_or_alias)
String search failed for type Disease|Phenotype on index brick_index_string: NotFoundError(404, 'index_not_found_exception', 'no such index [brick_index_string]', brick_index_string, index_or_alias)
String search failed for type Disease|Phenotype on index brick_index_string: NotFoundError(404, 'index_not_found_exception', 'no such index [brick_index_string]', brick_index_string, index_or_alias)


字符串检索会返回命中实体的 ID、主名称、匹配别名与得分，上层可据此做筛选或继续走知识图谱问答。

## 混合检索（字符串 + 向量融合）

下面演示完整混合检索流程：在字符串与向量通道召回候选后，通过权重融合得到最终排序。

In [8]:

import copy

hybrid_searcher = HybridEntitySearcher(es_client, search_config)

# Work on a deep copy so the fuzzy-search payload (including its nested
# options dict) is left unchanged.
payload_hybrid = copy.deepcopy(payload_fuzzy)
payload_hybrid["query_id"] = "demo-hybrid-001"

# Per-request options, including an override of the per-type weights.
type_mix_override = {"Gene|Protein": 0.6, "Disease|Phenotype": 0.4}
hybrid_options = payload_hybrid["options"]
hybrid_options.update(
    top_k=5,
    return_diagnostics=True,
    debug=False,
    type_mix_override=type_mix_override,
)
payload_hybrid


{'query_id': 'demo-hybrid-001',
 'options': {'top_k': 5,
  'return_diagnostics': True,
  'debug': False,
  'type_mix_override': {'Gene|Protein': 0.6, 'Disease|Phenotype': 0.4}},
 'Gene|Protein': ['EGFR', '表皮生长因子受体'],
 'Mutation': [],
 'Chemical': [],
 'Disease|Phenotype': ['非小细胞肺癌'],
 'Process|Function|Pathway|Cell_Component': [],
 'Species': [],
 'Cell|Tissue': []}

In [9]:

# Run the hybrid search with the payload built for it; the result is a
# HybridResponse object.
hybrid_response = hybrid_searcher.search(payload_hybrid)
# Bare trailing expression so the notebook renders the response.
hybrid_response


String search failed for type Gene|Protein on index brick_index_string: NotFoundError(404, 'index_not_found_exception', 'no such index [brick_index_string]', brick_index_string, index_or_alias)
String search failed for type Disease|Phenotype on index brick_index_string: NotFoundError(404, 'index_not_found_exception', 'no such index [brick_index_string]', brick_index_string, index_or_alias)
String search failed for type Disease|Phenotype on index brick_index_string: NotFoundError(404, 'index_not_found_exception', 'no such index [brick_index_string]', brick_index_string, index_or_alias)
Vector knn search failed for type Gene|Protein on index brick_index_vector, fallback to script_score: NotFoundError(404, 'index_not_found_exception', 'no such index [brick_index_vector]', brick_index_vector, index_or_alias)
Vector knn search failed for type Gene|Protein on index brick_index_vector, fallback to script_score: NotFoundError(404, 'index_not_found_exception', 'no such index [brick_index_vector

HybridResponse(query_id='demo-hybrid-001', standardized={}, diagnostics=[], logs=None)

In [10]:

# Turn the standardized output and the diagnostics into DataFrames for analysis.

# One row per (type_key, entity_name) pair from the standardized mapping.
standardized_rows = [
    {"type_key": type_key, "entity_name": entity_name}
    for type_key, entities in hybrid_response.standardized.items()
    for entity_name in entities
]
standardized_df = pd.DataFrame(standardized_rows)
# Only the last expression of a cell is rendered, so a bare `standardized_df`
# here would show nothing; print it explicitly.
print(standardized_df)

# diagnostics may be None/empty; normalize to a list before iterating.
diagnostics = hybrid_response.diagnostics or []
diagnostics_df = pd.DataFrame([
    {
        "type_key": item.type_key,
        "entity_id": item.entity_id,
        "primary_name": item.primary_name,
        "final_score": item.final_score,
        # Channel scores default to 0.0 when the attribute is missing.
        "string_score": getattr(item.channel_scores, "string_score", 0.0),
        "vector_score": getattr(item.channel_scores, "vector_score", 0.0),
        "matched_alias": item.matched_alias,
    }
    for item in diagnostics
])
diagnostics_df


`HybridResponse.standardized` 提供面向业务的最终候选；当需要定位权重或问题时，可对照 `diagnostics_df` 中的通道得分与匹配信息。