In [1]:
import os

from dotenv import find_dotenv, load_dotenv

# Load the nearest .env file before the later cells construct the Zhipu clients;
# override=True lets values from .env replace variables already set in the environment.
load_dotenv(find_dotenv(), override=True)

In [2]:
from textlong import idea
from langchain_zhipu import ChatZhipuAI, ZhipuAIEmbeddings
from textlong.knowledge import LocalFilesLoader, collect_docs
from textlong.hub import load_prompt, find_resource_prompt

# 创建资料

## python语法摘要.md

In [None]:
# Ask the chat model for a short Python syntax summary and save it as a markdown
# file; the file listing later in this notebook shows it under the "python" folder.
idea(
    ChatZhipuAI(),
    "帮我摘要总结200字以内的python关键语法",
    base_folder="python",
    output_file="python语法摘要.md",
)

## pandas语法摘要.md

In [None]:
# Ask the chat model for a short pandas syntax summary and save it as a markdown
# file; the file listing later in this notebook shows it under the "python" folder.
idea(
    ChatZhipuAI(),
    "帮我摘要总结200字以内的pandas关键语法",
    base_folder="python",
    output_file="pandas语法摘要.md",
)

# 查询资料

## 按照 markdown 加载

In [3]:
# Show the kernel's working directory; the loaders below are given the relative
# folder "python" (presumably resolved against this directory — confirm).
%pwd

'/Users/xuehongwei/github/textlong/notes/03 knowledge'

In [5]:
# Point a loader at the relative "python" folder and list the files it finds there.
kg = LocalFilesLoader("python")
kg.get_files()

['python/pandas语法摘要.md', 'python/python语法摘要.md']

In [6]:
# Load every file as a document and print only where each one came from.
for doc in kg.load():
    source_path = doc.metadata['source']
    print(">->>>", source_path)

>->>> python/pandas语法摘要.md
>->>> python/python语法摘要.md


## 过滤文档

In [7]:
# Same folder, but restricted with path_regex; the recorded output lists only the pandas file.
fl = LocalFilesLoader("python", path_regex="pandas")
fl.get_files()

['python/pandas语法摘要.md']

## 加载文档

In [2]:
# Load the folder's files; the recorded output is a list of Document objects,
# each with page_content and a 'source' entry in its metadata.
kg = LocalFilesLoader("python")
kg.load()

[Document(page_content="Pandas是一个强大的Python数据分析库，提供了快速、灵活和表达力强的数据结构，用于处理结构化数据。以下是关键语法摘要：\n\n\n\n\n- **导入库**：\n  `import pandas as pd`\n\n- **创建数据结构**：\n  `df = pd.DataFrame()`\n  `s = pd.Series()`\n\n- **读取数据**：\n  `pd.read_csv('file.csv')`\n  `pd.read_excel('file.xlsx')`\n\n- **选择数据**：\n  `df['column']`\n  `df.loc[rows, columns]`\n  `df.iloc[rows, columns]`\n\n- **筛选数据**：\n  `df[df['column'] &gt; value]`\n\n- **数据操作**：\n  `df['new_column'] = df['column'] * value`\n  `df.drop('column', axis=1)`\n\n- **缺失值处理**：\n  `df.isnull()`\n  `df.dropna()`\n  `df.fillna(value)`\n\n- **描述统计**：\n  `df.describe()`\n  `df.mean()`\n  `df.corr()`\n\n- **分组聚合**：\n  `df.groupby('column').sum()`\n  `df.groupby('column').agg({'another_column': 'mean'})`\n\n- **排序**：\n  `df.sort_values(by='column')`\n\n- **索引重置**：\n  `df.reset_index(drop=True)`\n\n- **数据合并**：\n  `pd.concat([df1, df2])`\n  `df1.merge(df2, on='column')`\n\n\n以上语法涵盖了Pandas的基本操作，是进行数据分析的重要工具。", metadata={'source': 'python/pandas语法摘要.md'}),
 Docume

## 缓存文本嵌入

In [16]:
from langchain_zhipu import ZhipuAIEmbeddings
# Embedding model passed to kg.cache_embeddings and FAISS.from_embeddings in the cells below.
model = ZhipuAIEmbeddings()

In [17]:
# Cache embeddings for the loader's files using `model`. In the recorded run this
# printed "No embeddings to cached!" and returned False — presumably because the
# cache was already up to date (TODO confirm against textlong).
kg.cache_embeddings(model)

[32mNo embeddings to cached![0m


False

## 加载文本嵌入缓存

In [18]:
# Read the cached embeddings back as a pair: the embeddings and their per-file metadata.
emb, metadata = kg.load_embeddings()

In [19]:
# Inspect the metadata: the recorded output has one {'source': ...} dict per file.
metadata

[{'source': 'python/pandas语法摘要.md'}, {'source': 'python/python语法摘要.md'}]

## 基于文本嵌入缓存查询

In [20]:
from langchain_community.vectorstores import FAISS
# Build the FAISS store from the cached embeddings and their metadata;
# `model` is handed in as the embedding object.
db = FAISS.from_embeddings(emb, model, metadata)
# Retriever over that store, created with no search options.
retriever = db.as_retriever()

In [21]:
# Query the retriever; in the recorded output the pandas summary is the first hit.
retriever.invoke('PANDAS')

[Document(page_content="Pandas是一个强大的Python数据分析库，提供了快速、灵活和表达力强的数据结构，用于处理结构化数据。以下是关键语法摘要：\n\n\n\n\n- **导入库**：\n  `import pandas as pd`\n\n- **创建数据结构**：\n  `df = pd.DataFrame()`\n  `s = pd.Series()`\n\n- **读取数据**：\n  `pd.read_csv('file.csv')`\n  `pd.read_excel('file.xlsx')`\n\n- **选择数据**：\n  `df['column']`\n  `df.loc[rows, columns]`\n  `df.iloc[rows, columns]`\n\n- **筛选数据**：\n  `df[df['column'] &gt; value]`\n\n- **数据操作**：\n  `df['new_column'] = df['column'] * value`\n  `df.drop('column', axis=1)`\n\n- **缺失值处理**：\n  `df.isnull()`\n  `df.dropna()`\n  `df.fillna(value)`\n\n- **描述统计**：\n  `df.describe()`\n  `df.mean()`\n  `df.corr()`\n\n- **分组聚合**：\n  `df.groupby('column').sum()`\n  `df.groupby('column').agg({'another_column': 'mean'})`\n\n- **排序**：\n  `df.sort_values(by='column')`\n\n- **索引重置**：\n  `df.reset_index(drop=True)`\n\n- **数据合并**：\n  `pd.concat([df1, df2])`\n  `df1.merge(df2, on='column')`\n\n\n以上语法涵盖了Pandas的基本操作，是进行数据分析的重要工具。", metadata={'source': 'python/pandas语法摘要.md'}),
 Docume

# RAG 查询

## VectorStore 使用技巧可以参考文档

**`VectorStore.as_retriever` 参数：**

- Args:
    - **search_type** (Optional[str]): Defines the type of search that
        the Retriever should perform.
        Can be "similarity" (default), "mmr", or
        "similarity_score_threshold".
    - **search_kwargs** (Optional[Dict]): Keyword arguments to pass to the
        search function. Can include things like:
        - **k**: Amount of documents to return (Default: 4)
        - **score_threshold**: Minimum relevance threshold
            for similarity_score_threshold
        - **fetch_k**: Amount of documents to pass to MMR algorithm (Default: 20)
        - **lambda_mult**: Diversity of results returned by MMR;
            1 for minimum diversity and 0 for maximum. (Default: 0.5)
        - **filter**: Filter by document metadata

- Returns:
    **VectorStoreRetriever**: Retriever class for VectorStore.

**典例示范:**

```python
    # Retrieve more documents with higher diversity
    # Useful if your dataset has many similar documents
    docsearch.as_retriever(
        search_type="mmr",
        search_kwargs={'k': 6, 'lambda_mult': 0.25}
    )

    # Fetch more documents for the MMR algorithm to consider
    # But only return the top 5
    docsearch.as_retriever(
        search_type="mmr",
        search_kwargs={'k': 5, 'fetch_k': 50}
    )

    # Only retrieve documents that have a relevance score
    # Above a certain threshold
    docsearch.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={'score_threshold': 0.8}
    )

    # Only get the single most similar document from the dataset
    docsearch.as_retriever(search_kwargs={'k': 1})

    # Use a filter to only retrieve documents from a specific paper
    docsearch.as_retriever(
        search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}}
    )
```

## 提示语

In [7]:
# Load the chat-style RAG prompt and render it with placeholder values
# so the full template text can be inspected.
prompt = load_prompt("RAG", tag="chat")
print(
    prompt
    .partial(history="历史对话记录")
    .format(context="<我是查找到的资料>", question="请问...")
)

你是一名咨询专家，只负责根据资料回答相关提问，禁止回答与此无关的问题。

你在回答时必须遵循以下约束：
1. 如果你获得的参考例子无法回答问题，可以查询互联网，但务必注意资料的真实性，不要做任何编造
2. 请使用简洁的语言回答，不要啰嗦
3. 不要生成"根据提供的资料..."等字眼

>>>>>>>>>>>>>> 你在回答问题时可以参考这些资料：
<我是查找到的资料>

>>>>>>>>>>>>>> 你必须按照如下格式输出：
问题答案：xxx。

相关规范解释：xxxxxxxx。

>>>>>>>>>>>>>> 之前的对话如下：
历史对话记录

我的问题是：请问...

你的回答：


## 构建RAG应用

In [2]:
from textlong.hub import load_prompt
from textlong.memory import MemoryManager, WithMemoryBinding
from textlong.knowledge import LocalFilesLoader, collect_docs
from langchain_community.vectorstores import FAISS
from langchain_zhipu import ChatZhipuAI, ZhipuAIEmbeddings

# Rebuild the vector store from the embeddings cached for the "python" folder.
emb, metadata = LocalFilesLoader("python").load_embeddings()
db = FAISS.from_embeddings(emb, ZhipuAIEmbeddings(), metadata)

# Chat-style RAG prompt; the mapping below supplies its context, question and history variables.
prompt = load_prompt("RAG", tag="chat")

# Map the incoming dict onto the prompt variables, then pipe prompt -> chat model.
chain = {
    # Retrieve documents for the user input, then pass them through collect_docs.
    "context":  (lambda x: x['input']) | db.as_retriever() | collect_docs,
    "question": lambda x: x['input'],
    # NOTE(review): the dict passed to stream() has no 'history' key; presumably
    # WithMemoryBinding injects it — confirm.
    "history":  lambda x: x['history'],
} | prompt | ChatZhipuAI()

# Wrap the chain with conversation memory; the session_id in config presumably
# selects which stored history is used — confirm against textlong.memory.
memory = MemoryManager()
withMemoryChain = WithMemoryBinding(chain, memory)
config = {"configurable": {"session_id": "1"}}

In [3]:
# Stream the answer and print each chunk's text as it arrives,
# with no line break between chunks.
reply_stream = withMemoryChain.stream({"input": "字典有什么用？"}, config)
for chunk in reply_stream:
    print(chunk.content, end="")

问题答案：字典用于存储键值对数据。

相关规范解释：在Python中，字典是一种可变容器模型，它可以存储任意类型对象，其中每个对象都存储为一个键值对。键用于唯一地标识数据，值是与键关联的数据。由于键是唯一的，可以通过键快速检索、插入或删除对应的值，这使得字典成为在Python中进行数据查找的理想数据结构。