In [1]:
# Load settings from the .env file located by find_dotenv() before any client is created.
# override=True is passed so that values from the file replace variables already
# present in the process environment.
# NOTE(review): presumably this supplies the credentials the Zhipu clients need — confirm.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)
# NOTE(review): `os` is not referenced by any cell visible in this notebook.
import os

In [2]:
from textlong import idea
from langchain_zhipu import ChatZhipuAI, ZhipuAIEmbeddings
from textlong.knowledge import LocalFilesLoader

# 创建资料

## python语法摘要.md

In [None]:
# Generate the first sample document: prompt the Zhipu chat model for a summary
# (200 characters or fewer) of key Python syntax.
# NOTE(review): presumably idea() writes the result to python/python语法摘要.md
# (output_file inside base_folder) — that is the path get_files() lists later; confirm.
idea(
    ChatZhipuAI(),
    "帮我摘要总结200字以内的python关键语法",
    output_file="python语法摘要.md",
    base_folder="python"
)

## pandas语法摘要.md

In [None]:
# Generate the second sample document: prompt the Zhipu chat model for a summary
# (200 characters or fewer) of key pandas syntax.
# NOTE(review): presumably idea() writes the result to python/pandas语法摘要.md
# (output_file inside base_folder) — that is the path get_files() lists later; confirm.
idea(
    ChatZhipuAI(),
    "帮我摘要总结200字以内的pandas关键语法",
    output_file="pandas语法摘要.md",
    base_folder="python"
)

# 加载资料文档

## 按照 markdown 加载

In [3]:
# Show the kernel's working directory.
# NOTE(review): presumably the relative "python" folder used by the loaders is resolved against it — confirm.
%pwd

'/Users/xuehongwei/github/textlong/notes/03 knowledge'

In [5]:
# Point a loader at the "python" folder and list the files it discovers there.
kg = LocalFilesLoader("python")
kg.get_files()

['python/pandas语法摘要.md', 'python/python语法摘要.md']

In [6]:
# Load every discovered file and print only the source path of each Document.
for x in kg.load():
    print(">->>>", x.metadata['source'])
    # Debug aid, left disabled: also dump each document body.
    # print(x.page_content)

>->>> python/pandas语法摘要.md
>->>> python/python语法摘要.md


## 过滤文档

In [7]:
# path_regex narrows the loader to matching paths; with "pandas" the recorded
# result lists only the pandas summary file.
fl = LocalFilesLoader("python", path_regex="pandas")
fl.get_files()

['python/pandas语法摘要.md']

## 按一般切分规则加载文档

In [3]:
# Re-create the loader for the whole "python" folder and load its files as Documents.
# The bare kg.load() displays the full list, so the cell output contains the
# complete text of every document.
kg = LocalFilesLoader("python")
kg.load()

[Document(page_content="Pandas是一个强大的Python数据分析库，提供了快速、灵活和表达力强的数据结构，用于处理结构化数据。以下是关键语法摘要：\n\n\n\n\n- **导入库**：\n  `import pandas as pd`\n\n- **创建数据结构**：\n  `df = pd.DataFrame()`\n  `s = pd.Series()`\n\n- **读取数据**：\n  `pd.read_csv('file.csv')`\n  `pd.read_excel('file.xlsx')`\n\n- **选择数据**：\n  `df['column']`\n  `df.loc[rows, columns]`\n  `df.iloc[rows, columns]`\n\n- **筛选数据**：\n  `df[df['column'] &gt; value]`\n\n- **数据操作**：\n  `df['new_column'] = df['column'] * value`\n  `df.drop('column', axis=1)`\n\n- **缺失值处理**：\n  `df.isnull()`\n  `df.dropna()`\n  `df.fillna(value)`\n\n- **描述统计**：\n  `df.describe()`\n  `df.mean()`\n  `df.corr()`\n\n- **分组聚合**：\n  `df.groupby('column').sum()`\n  `df.groupby('column').agg({'another_column': 'mean'})`\n\n- **排序**：\n  `df.sort_values(by='column')`\n\n- **索引重置**：\n  `df.reset_index(drop=True)`\n\n- **数据合并**：\n  `pd.concat([df1, df2])`\n  `df1.merge(df2, on='column')`\n\n\n以上语法涵盖了Pandas的基本操作，是进行数据分析的重要工具。", metadata={'source': 'python/pandas语法摘要.md'}),
 Docume

## 获得向量编码

In [4]:
# Embedding model handed to the loader's embedding cache and to FAISS below.
# ZhipuAIEmbeddings is already imported in the notebook's import cell, so this
# repeated import is redundant but harmless.
from langchain_zhipu import ZhipuAIEmbeddings
model = ZhipuAIEmbeddings()

In [6]:
# Embed the loader's documents with `model` and cache the result; in the recorded
# run the log reports the cache location as python/__CACHE_EMBEDDINGS__/ and the
# call returns True.
kg.cache_embeddings(model)

[32m<python_python语法摘要_md> Pandas是一个强大的Python数据分析库，提供了快速、灵活和表达力强的数据结构，用于处理结构化...[0m
[32mCached 1 embeddings to python/__CACHE_EMBEDDINGS__/ ![0m


True

In [7]:
# Read the cached embeddings back.
# NOTE(review): presumably an iterable of (text, vector) pairs, since it is passed
# straight to FAISS.from_embeddings — confirm.
emb = kg.load_embeddings()

In [8]:
# Build a FAISS vector store from the precomputed embeddings and expose it as a retriever.
# `model` is passed alongside the vectors; no metadatas are supplied to from_embeddings.
# NOTE(review): presumably `model` is what embeds incoming queries at search time — confirm.
from langchain_community.vectorstores import FAISS
db = FAISS.from_embeddings(emb, model)
retriever = db.as_retriever()

In [9]:
# Query the retriever; in the recorded run the pandas summary is returned first.
retriever.invoke('PANDAS')

[Document(page_content="Pandas是一个强大的Python数据分析库，提供了快速、灵活和表达力强的数据结构，用于处理结构化数据。以下是关键语法摘要：\n\n\n\n\n- **导入库**：\n  `import pandas as pd`\n\n- **创建数据结构**：\n  `df = pd.DataFrame()`\n  `s = pd.Series()`\n\n- **读取数据**：\n  `pd.read_csv('file.csv')`\n  `pd.read_excel('file.xlsx')`\n\n- **选择数据**：\n  `df['column']`\n  `df.loc[rows, columns]`\n  `df.iloc[rows, columns]`\n\n- **筛选数据**：\n  `df[df['column'] &gt; value]`\n\n- **数据操作**：\n  `df['new_column'] = df['column'] * value`\n  `df.drop('column', axis=1)`\n\n- **缺失值处理**：\n  `df.isnull()`\n  `df.dropna()`\n  `df.fillna(value)`\n\n- **描述统计**：\n  `df.describe()`\n  `df.mean()`\n  `df.corr()`\n\n- **分组聚合**：\n  `df.groupby('column').sum()`\n  `df.groupby('column').agg({'another_column': 'mean'})`\n\n- **排序**：\n  `df.sort_values(by='column')`\n\n- **索引重置**：\n  `df.reset_index(drop=True)`\n\n- **数据合并**：\n  `pd.concat([df1, df2])`\n  `df1.merge(df2, on='column')`\n\n\n以上语法涵盖了Pandas的基本操作，是进行数据分析的重要工具。"),
 Document(page_content='- Python是一种高级、解释型、交互式、面向对象的编

In [33]:
import re

def clean_filename(filename):
    """Turn an arbitrary string into a conservative, filesystem-friendly name.

    Word characters (letters, digits and underscore; CJK characters count as
    word characters for ``str`` patterns) and dots are kept. Every other
    non-whitespace character is replaced by an underscore, and each run of
    hyphens, underscores and whitespace is then collapsed into a single
    underscore.

    Args:
        filename: The raw name to clean.

    Returns:
        The cleaned name. Leading or trailing underscores produced by the
        substitution are not stripped, and an empty input yields an empty
        string.
    """
    # Pass 1: neutralise everything except word characters, whitespace, dots
    # and hyphens. Whitespace and hyphens are let through here only so that
    # pass 2 can fold them into the neighbouring separator run.
    cleaned_filename = re.sub(r'[^\w\s.-]', '_', filename)
    # Pass 2: collapse separator runs. The class uses \s rather than a literal
    # space so that tabs, newlines and full-width spaces, which pass 1 keeps,
    # cannot survive into the final name.
    cleaned_filename = re.sub(r'[-_\s]+', '_', cleaned_filename)
    return cleaned_filename

# Sample input mixing CJK text, spaces, punctuation, a hyphen and dots
filename = "中文.名字的This is a test!@#$%^&*()_+{}|[]\\:;\"'<>,.?/file-名.txt"

# Clean the file name
cleaned_filename = clean_filename(filename)

# Print the cleaned file name
print(cleaned_filename)


中文.名字的This_is_a_test_._file_名.txt


# 2 使用QA文档做RAG应用

In [50]:
# Load the QA documents found under the supplier-material prefix and preview
# the first ten: source path plus length of the text.
# NOTE(review): LocalFilesQALoader is not imported by any cell visible in this
# notebook — confirm which module provides it before a fresh-kernel run.
local_qa = LocalFilesQALoader(
    "./notes/documents",
    included_prefixes=["docs/供应商资料/"],
    answer_filenames=["answer.txt", "example.txt"],
)
all_docs = local_qa.load()

for chunk in all_docs[:10]:
    print(chunk.metadata['source'], len(chunk.page_content))

/Users/xuehongwei/github/langchain_chinese/notes/documents/docs/供应商资料/001/input.txt 8
/Users/xuehongwei/github/langchain_chinese/notes/documents/docs/供应商资料/003/input.txt 10
/Users/xuehongwei/github/langchain_chinese/notes/documents/docs/供应商资料/002/input.txt 8


In [51]:
# Inspect the first loaded document (its text and source metadata).
all_docs[0]

Document(page_content='北京科技有限公司', metadata={'source': '/Users/xuehongwei/github/langchain_chinese/notes/documents/docs/供应商资料/001/input.txt'})

## 切分文本

In [52]:
# Split the loaded documents into chunks of at most 2000 characters, with a
# 200-character overlap between neighbouring chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,        # chunk size is measured in characters
    is_separator_regex=False,   # separators are treated as literal strings
)
chunks = text_splitter.split_documents(all_docs)
chunks

[Document(page_content='北京科技有限公司', metadata={'source': '/Users/xuehongwei/github/langchain_chinese/notes/documents/docs/供应商资料/001/input.txt'}),
 Document(page_content='深圳创新科技有限公司', metadata={'source': '/Users/xuehongwei/github/langchain_chinese/notes/documents/docs/供应商资料/003/input.txt'}),
 Document(page_content='上海音响有限公司', metadata={'source': '/Users/xuehongwei/github/langchain_chinese/notes/documents/docs/供应商资料/002/input.txt'})]