## 读取文件

In [1]:
import os
import config.env

In [2]:
from util.file import load_file

# Bare expression: the cell echoes the function's repr, which confirms the
# import from util.file resolved and shows that it takes a single `path`.
load_file

<function util.file.load_file(path)>

## 文本分割

In [3]:
# (markdown marker, metadata key) pairs for the header splitter: a header of
# depth N is written as N '#' characters and is stored under "header<N>".
headers_to_split_on = [("#" * depth, f"header{depth}") for depth in (1, 2)]

In [4]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Splitter configured with the header levels declared in headers_to_split_on.
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)


In [5]:
# Load the first spec sheet, then split it at the configured headers.
honor_source = load_file("data/荣耀Magic V Flip.txt")
texts = text_splitter.split_text(honor_source)

In [6]:
# Load the second spec sheet, then split it at the configured headers.
xiaomi_source = load_file("data/Xiaomi MIX Flip.txt")
texts2 = text_splitter.split_text(xiaomi_source)

In [7]:
# Load the third spec sheet, then split it at the configured headers.
find_n3_source = load_file("data/Find N3 Flip.txt")
texts3 = text_splitter.split_text(find_n3_source)

In [8]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

# Pool the header-split chunks of all three spec sheets into one list and
# hand them to Chroma together with an OpenAI embedding model.
all_chunks = texts + texts2 + texts3
db = Chroma.from_documents(all_chunks, OpenAIEmbeddings())


### 元数据字段的信息

In [9]:
from langchain.chains.query_constructor.schema import AttributeInfo

# Describe each metadata key for the query constructor. Both keys are
# declared as strings; only the name and the description differ.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in (
        ("header1", "商品名称。这个产品可以是手机，电脑等"),
        ("header2", "商品详细参数的类别信息。包括但不限于：颜色，价格，影像参数，电池，cpu等信息"),
    )
]

## 构建查询构造器与自查询检索器

In [None]:
# Description of the chunk bodies that is passed to the query constructor.
# Written as two adjacent literals (statement, then example) that Python
# joins into a single string.
document_content_description = (
    "商品详细参数的类别信息的正文，里面不会有具体商品和类型信息，全部由metadata提供。"
    "例如：header1：小米6，header2：电池，正文：电池容量为4000mAh，支持快充。"
)

In [12]:
from langchain.chains.query_constructor.base import StructuredQueryOutputParser, get_query_constructor_prompt

# Build the query-constructor prompt from the content description and the
# metadata schema so that it can be inspected on its own.
prompt = get_query_constructor_prompt(document_content_description, metadata_field_info)
# Not built in this cell: the remaining chain pieces would be
#   output_parser = StructuredQueryOutputParser.from_components()
#   query_constructor = prompt | llm | output_parser

In [13]:
# Fill the prompt with a sample question; as the last expression of the cell,
# the formatted text is echoed as the cell output.
demo_query = "荣耀Magic V Flip的颜色"
prompt.format(query=demo_query)

'Your goal is to structure the user\'s query to match the request schema provided below.\n\n<< Structured Request Schema >>\nWhen responding use a markdown code snippet with a JSON object formatted in the following schema:\n\n```json\n{\n    "query": string \\ text string to compare to document contents\n    "filter": string \\ logical condition statement for filtering documents\n}\n```\n\nThe query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.\n\nA logical condition statement is composed of one or more comparison and logical operation statements.\n\nA comparison statement takes the form: `comp(attr, val)`:\n- `comp` (eq | ne | gt | gte | lt | lte | contain | like | in | nin): comparator\n- `attr` (string):  name of attribute to apply the comparison to\n- `val` (string): is the comparison value\n\nA logical operation statement takes the form `op(statement1, statement2, ...)

In [10]:
from langchain_openai import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever


# Chat model handed to the self-query retriever, created with temperature=0.
llm = ChatOpenAI(temperature=0)

# Wire together the LLM, the Chroma store and the two schema descriptions.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
)

In [11]:
# Run one sample question through the retriever; the returned documents are
# echoed as the cell output.
user_question = "荣耀Magic V Flip的颜色"
retriever.invoke(user_question)

[Document(metadata={'header1': 'Xiaomi MIX Flip', 'header2': '价格'}, page_content='RMB 5999 (12GB+256GB)\nRMB 6499 (12GB+512GB)\nRMB 7299 (16GB+1TB 幻影紫，白色)'),
 Document(metadata={'header1': '荣耀Magic V Flip', 'header2': '操作系统'}, page_content='MagicOS 8.0（基于Android 14）\n用户界面\nMagicOS 8.0'),
 Document(metadata={'header1': 'Find N3 Flip', 'header2': '颜色'}, page_content='月光缪斯 | 薄雾玫瑰 | 镜中之夜  \n入网型号：PHT110'),
 Document(metadata={'header1': '荣耀Magic V Flip', 'header2': '包装清单'}, page_content='外包装采用塑封，彩盒内标配：\n1）手机（含内置电池）x 1\n2）快速指南x 1\n3）HONOR SuperCharge充电器x 1\n4）USB Type-C 数据线x 1\n5）取卡针x 1\n6）保护壳x 1\n7）新机权益卡（含新机权益和电子三包凭证） x1\n8）内屏保护膜（出厂贴在屏幕上）x 1\n9）外屏保护膜（出厂贴在屏幕上）x 1\n高定款：\n外包装采用塑封，彩盒内标配：\n1）手机（含内置电池）x 1\n2）快速指南x 1\n3）HONOR SuperCharge充电器x 1\n4）USB Type-C 数据线x 1\n5）取卡针x 1\n6）保护壳x 2\n7）新机权益卡（含新机权益和电子三包凭证） x1\n8）内屏保护膜（出厂贴在屏幕上）x 1\n9）外屏保护膜（出厂贴在屏幕上）x 1\n备注：最终以实物为准。')]