# RAG

## 环境准备

In [1]:
from Utils import *

RUNNABLE_BASE_URL:  http://localhost:8000


In [None]:
gpt35("""我在jupyterlab中无法加载修改过的python代码，是有缓存吗？""")

## 建立langchain知识库

<div class="alert alert-warning">
<b>兼容性问题：</b><br/>
    较新的BeautifulSoup版本是4.12.3，与python3.10兼容性较好，无法适应3.9或3.12，否则无法找到lxml或html5lib。
</div>

In [2]:
from bs4 import BeautifulSoup, SoupStrainer
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_core.utils.html import PREFIXES_TO_IGNORE_REGEX, SUFFIXES_TO_IGNORE_REGEX
import re

In [3]:
# 仅在jupyter中需要
import nest_asyncio
nest_asyncio.apply()

### 提取langchain文档

#### 提取langchain的Docs文档

In [7]:
def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
    title = soup.find("title")
    description = soup.find("meta", attrs={"name": "description"})
    html = soup.find("html")
    return {
        "source": meta["loc"],
        "title": title.get_text() if title else "",
        "description": description.get("content", "") if description else "",
        "language": html.get("lang", "") if html else "",
        **meta,
    }

def load_langchain_docs():
    return SitemapLoader(
        "https://python.langchain.com/sitemap.xml",
        filter_urls = ["https://python.langchain.com/"],
        parsing_function = web_page_extractor,
        default_parser = "lxml",
        bs_kwargs = {
            "parse_only": SoupStrainer(
                name = ("article", "title", "html", "lang", "content")
            ),
        },
        meta_function = metadata_extractor,
    ).load()

In [5]:
langchain_docs = load_langchain_docs()

Fetching pages: 100%|##########| 1180/1180 [07:51<00:00,  2.50it/s]


#### 提取langchain的API文档

In [8]:
def simple_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

def load_api_docs():
    return RecursiveUrlLoader(
        url = "https://api.python.langchain.com/en/stable/langchain_api_reference.html",
        max_depth = 8,
        extractor = simple_extractor,
        prevent_outside = True,
        use_async = True,
        timeout = 600,
        # Drop trailing / to avoid duplicate pages.
        link_regex = (
            f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
            r"(?:[\#'\"]|\/[\#'\"])"
        ),
        check_response_status = True,
        exclude_dirs = (
            "https://api.python.langchain.com/en/latest/_sources",
            "https://api.python.langchain.com/en/latest/_modules",
        ),
    ).load()

In [9]:
api_docs = load_api_docs()

#### 提取langsmith的docs文档

In [8]:
def load_langsmith_docs():
    return RecursiveUrlLoader(
        url = "https://docs.smith.langchain.com/",
        max_depth = 8,
        extractor = simple_extractor,
        prevent_outside = True,
        use_async = True,
        timeout = 600,
        # Drop trailing / to avoid duplicate pages.
        link_regex = (
            f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
            r"(?:[\#'\"]|\/[\#'\"])"
        ),
        check_response_status = True,
    ).load()

In [13]:
langsmith_docs = load_langsmith_docs()

  soup = BeautifulSoup(html, "lxml")
  k = self.parse_starttag(i)


### 将文档入库到duckdb

#### 连接duckdb

In [2]:
web_store = WebPageDataset(db_name = "data/langchain.duckdb")

#### 保存到duckdb

In [10]:
# https://python.langchain.com/
for d in langchain_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langchain_docs")

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [12]:
web_store.upsert(api_docs[0], topic = "langchain_api_docs")

In [17]:
# https://docs.smith.langchain.com/
for d in langsmith_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langsmith_docs")

.....................................................................

#### 从duckdb查询

In [3]:
import re

In [4]:
docs = web_store.read_data(topic = "langchain_docs")

In [5]:
result = [obj for obj in docs if re.compile('lancedb', re.IGNORECASE).search(obj.source)]
for obj in result:
    print(obj.source)

https://python.langchain.com/docs/integrations/providers/lancedb
https://python.langchain.com/docs/integrations/vectorstores/lancedb


### 拆分文本块

#### 加载文本向量

In [2]:
langchain_ds = WebPageDataset(db_name = "data/langchain.duckdb")
docs = langchain_ds.read_data(topic = None)

#### 剔除对RAG无实质作用的文本

<div class="alert alert-success">
<b>观察文本大小：</b><br/>
    有很多文档的尺寸超过了50K，最大的达到200K。<br>
    其中，有些是包含了图片的base64编码，有些是包含了示范代码执行时的打印内容，对RAG的支持没有太多帮助。
</div>

In [3]:
dict_list = [{ "source": obj.source, "len": len(obj.page_content) } for obj in docs]
sorted_dict_list = sorted(dict_list, key = lambda x: x['len'], reverse = True)
for obj in sorted_dict_list:
    print(obj['len'], " >> ", obj['source'])

195919  >>  https://python.langchain.com/docs/integrations/retrievers/activeloop
150150  >>  https://python.langchain.com/docs/use_cases/question_answering/citations
74417  >>  https://python.langchain.com/docs/integrations/document_loaders/dropbox
73698  >>  https://python.langchain.com/docs/integrations/vectorstores/timescalevector
66009  >>  https://python.langchain.com/docs/integrations/document_loaders/docugami
65474  >>  https://python.langchain.com/docs/use_cases/code_understanding
63441  >>  https://python.langchain.com/docs/integrations/tools/google_lens
62074  >>  https://python.langchain.com/docs/integrations/chat/ollama
60867  >>  https://python.langchain.com/docs/expression_language/cookbook/prompt_size
59026  >>  https://python.langchain.com/docs/guides/debugging
57438  >>  https://python.langchain.com/docs/integrations/llms/ollama
54666  >>  https://api.python.langchain.com/en/stable/langchain_api_reference.html
53959  >>  https://python.langchain.com/docs/modules/agents

剔除输出的文字块和图像base64部份：

In [4]:
langchain_new_docs = [{
        "content": remove_text_blocks(remove_base64(obj.page_content)), 
        "source": obj.source, 
        "title": obj.title, 
        "description": obj.description
    } for obj in docs]
newDocs = sort_list_by_len(langchain_new_docs, "content")
for obj in newDocs:
    print(obj[1], " >> ", obj[0]['source'])

54666  >>  https://api.python.langchain.com/en/stable/langchain_api_reference.html
30920  >>  https://python.langchain.com/docs/integrations/vectorstores/timescalevector
29616  >>  https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory
28836  >>  https://docs.smith.langchain.com/tracing/tracing-faq
28812  >>  https://python.langchain.com/docs/integrations/tools/google_lens
26134  >>  https://python.langchain.com/docs/langgraph
24125  >>  https://python.langchain.com/docs/langserve
23943  >>  https://docs.smith.langchain.com/evaluation/quickstart
23617  >>  https://python.langchain.com/docs/integrations/toolkits/github
22722  >>  https://python.langchain.com/docs/integrations/vectorstores/redis
22675  >>  https://python.langchain.com/docs/guides/safety/amazon_comprehend_chain
22587  >>  https://docs.smith.langchain.com/cookbook/testing-examples/comparing-runs
22480  >>  https://python.langchain.com/docs/get_started/quickstart
22014  >>  https://python.

#### 切分字符串：默认按Markdown段落，如果仍然超长就截断切分

In [5]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

In [6]:
headers_to_split_on = [
    ("#", "H1"),
    ("##", "H2"),
    ("###", "H3"),
]

In [7]:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on = headers_to_split_on)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 200,
    length_function = len,
    is_separator_regex = False,
)

md_header_splits = []
page_index = 0

for obj in newDocs:
    final_texts = []
    # step1: 按markdown标题切分
    first_texts = markdown_splitter.split_text(obj[0]["content"])
    # step2: 超长的继续按文本长度切分
    #
    chunk_index = 0
    for x in first_texts:
        page_index += 1
        # print(x.metadata)
        
        if(len(x.page_content) < 2000):
            final_texts.append({
                "page_index": page_index,
                "chunk_index": -1,
                "H1": x.metadata.get("H1", ""),
                "H2": x.metadata.get("H2", ""),
                "H3": x.metadata.get("H3", ""),
                "content": x.page_content
            })
            # print(" < 2000, 直接加入final_texts", end = ": ")
            # print(len(final_texts))
        else:
            texts_split = [{
                "page_index": page_index,
                "chunk_index": index + 1,
                "H1": x.metadata.get("H1", ""),
                "H2": x.metadata.get("H2", ""),
                "H3": x.metadata.get("H3", ""),
                "content": obj.page_content
            } for index, obj in enumerate(text_splitter.create_documents([x.page_content]))]
            final_texts += texts_split
            # print(" >= 2000, 切分后加入final_texts", end = ": ")
            # print(len(final_texts))
    #
    md_header_splits += [{
        "page_index": chunk["page_index"],
        "chunk_index": chunk["chunk_index"],
        "title": obj[0]["title"],
        "description": obj[0]["description"],
        "source": obj[0]["source"],
        "H1": chunk["H1"],
        "H2": chunk["H2"],
        "H3": chunk["H3"],
        "content": chunk["content"]} for chunk in final_texts]
    #
    print(len(md_header_splits), end = " ")
    

31 55 78 98 116 150 182 198 221 242 260 275 299 328 344 357 375 388 424 441 452 466 481 492 504 520 535 548 559 571 582 592 603 613 623 635 654 665 674 685 705 720 731 743 751 770 786 795 807 821 837 849 865 875 882 900 913 926 935 945 954 961 970 983 996 1007 1016 1027 1035 1042 1051 1063 1079 1093 1101 1113 1120 1129 1138 1146 1153 1167 1176 1191 1199 1211 1222 1239 1248 1258 1265 1275 1288 1297 1305 1314 1321 1329 1336 1342 1350 1357 1365 1372 1377 1398 1408 1413 1421 1430 1438 1450 1456 1467 1472 1478 1487 1495 1502 1507 1516 1525 1531 1537 1547 1554 1560 1570 1577 1583 1588 1596 1602 1611 1618 1626 1633 1638 1643 1655 1661 1669 1674 1682 1692 1698 1704 1711 1716 1724 1730 1735 1743 1752 1762 1768 1775 1781 1786 1794 1799 1806 1811 1817 1824 1835 1840 1850 1858 1863 1873 1878 1882 1888 1895 1902 1910 1917 1924 1932 1939 1949 1956 1961 1969 1980 1986 1994 2003 2009 2015 2024 2028 2035 2045 2052 2057 2064 2072 2079 2088 2095 2101 2106 2113 2120 2129 2136 2142 2150 2162 2167 2173 2183

#### 保存到duckdb

In [22]:
text_block_ds = TextBlockDataset(db_name = "data/langchain.duckdb", drop = False)

In [23]:
for obj in md_header_splits:
    print(obj["chunk_index"], ":", obj["page_index"], ":", obj["source"]) 

1 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
2 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
3 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
4 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
5 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
6 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
7 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
8 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
9 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
10 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
11 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
12 : 1 : https://api.python.langchain.com/en/stable/langchain_api_reference.html
13 : 1 : https://api.python.langchain

In [24]:
def check_duplicates(data):
    keys = set()
    for item in data:
        key = (item['source'], item['page_index'], item['chunk_index'])
        if key in keys:
            print(f'Duplicate key found: {key}')
            return True
        else:
            keys.add(key)
    print('No duplicate keys found.')
    return False

In [25]:
import duckdb
conn = duckdb.connect("data/langchain.duckdb") 
def check_primary_key(conn, table_name):
    cursor = conn.cursor()
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = cursor.fetchall()
    for column in columns:
        if column[5]:  # 如果该列是主键，column[5]的值为1
            print(f"Primary key is set on column: {column[1]}")
check_primary_key(conn, "text_blocks")

Primary key is set on column: source
Primary key is set on column: page_index
Primary key is set on column: chunk_index


In [26]:
check_duplicates(md_header_splits)

No duplicate keys found.


False

In [27]:
for d in md_header_splits:
    print(".", end = "")
    text_block_ds.upsert(d)
print("finish!")

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [28]:
len(text_block_ds.read_data())

5602