# RAG

## 环境准备

In [1]:
from Utils import *

RUNNABLE_BASE_URL:  http://localhost:8000


In [None]:
gpt35("""我在jupyterlab中无法加载修改过的python代码，是有缓存吗？""")

## 建立langchain知识库

<div class="alert alert-warning">
<b>兼容性问题：</b><br/>
    较新的BeautifulSoup版本是4.12.3，与python3.10兼容性较好，无法适应3.9或3.12，否则无法找到lxml或html5lib。
</div>

In [2]:
from bs4 import BeautifulSoup, SoupStrainer
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_core.utils.html import PREFIXES_TO_IGNORE_REGEX, SUFFIXES_TO_IGNORE_REGEX
import re

In [3]:
# 仅在jupyter中需要
import nest_asyncio
nest_asyncio.apply()

### 提取langchain文档

#### 提取langchain的Docs文档

In [7]:
def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
    title = soup.find("title")
    description = soup.find("meta", attrs={"name": "description"})
    html = soup.find("html")
    return {
        "source": meta["loc"],
        "title": title.get_text() if title else "",
        "description": description.get("content", "") if description else "",
        "language": html.get("lang", "") if html else "",
        **meta,
    }

def load_langchain_docs():
    return SitemapLoader(
        "https://python.langchain.com/sitemap.xml",
        filter_urls = ["https://python.langchain.com/"],
        parsing_function = web_page_extractor,
        default_parser = "lxml",
        bs_kwargs = {
            "parse_only": SoupStrainer(
                name = ("article", "title", "html", "lang", "content")
            ),
        },
        meta_function = metadata_extractor,
    ).load()

In [5]:
langchain_docs = load_langchain_docs()

Fetching pages: 100%|##########| 1180/1180 [07:51<00:00,  2.50it/s]


#### 提取langchain的API文档

In [8]:
def simple_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

def load_api_docs():
    return RecursiveUrlLoader(
        url = "https://api.python.langchain.com/en/stable/langchain_api_reference.html",
        max_depth = 8,
        extractor = simple_extractor,
        prevent_outside = True,
        use_async = True,
        timeout = 600,
        # Drop trailing / to avoid duplicate pages.
        link_regex = (
            f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
            r"(?:[\#'\"]|\/[\#'\"])"
        ),
        check_response_status = True,
        exclude_dirs = (
            "https://api.python.langchain.com/en/latest/_sources",
            "https://api.python.langchain.com/en/latest/_modules",
        ),
    ).load()

In [9]:
api_docs = load_api_docs()

#### 提取langsmith的docs文档

In [8]:
def load_langsmith_docs():
    return RecursiveUrlLoader(
        url = "https://docs.smith.langchain.com/",
        max_depth = 8,
        extractor = simple_extractor,
        prevent_outside = True,
        use_async = True,
        timeout = 600,
        # Drop trailing / to avoid duplicate pages.
        link_regex = (
            f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
            r"(?:[\#'\"]|\/[\#'\"])"
        ),
        check_response_status = True,
    ).load()

In [13]:
langsmith_docs = load_langsmith_docs()

  soup = BeautifulSoup(html, "lxml")
  k = self.parse_starttag(i)


### 将文档入库到duckdb

#### 连接duckdb

In [2]:
web_store = WebPageObj(db_name = "data/langchain.duckdb")

#### 保存到duckdb

In [10]:
# https://python.langchain.com/
for d in langchain_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langchain_docs")

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [12]:
web_store.upsert(api_docs[0], topic = "langchain_api_docs")

In [17]:
# https://docs.smith.langchain.com/
for d in langsmith_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langsmith_docs")

.....................................................................

#### 从duckdb查询

In [3]:
import re

In [4]:
docs = web_store.read_data(topic = "langchain_docs")

In [5]:
result = [obj for obj in docs if re.compile('lancedb', re.IGNORECASE).search(obj.source)]
for obj in result:
    print(obj.source)

https://python.langchain.com/docs/integrations/providers/lancedb
https://python.langchain.com/docs/integrations/vectorstores/lancedb


### 拆分文本块

#### 加载文本向量

In [6]:
langchain_db = WebPageObj(db_name = "data/langchain.duckdb")
docs = langchain_db.read_data(topic = None)

#### 剔除对RAG无实质作用的文本

<div class="alert alert-success">
<b>观察文本大小：</b><br/>
    有很多文档的尺寸超过了50K，最大的达到200K。<br>
    其中，有些是包含了图片的base64编码，有些是包含了示范代码执行时的打印内容，对RAG的支持没有太多帮助。
</div>

In [7]:
dict_list = [{ "source": obj.source, "len": len(obj.page_content) } for obj in docs]
sorted_dict_list = sorted(dict_list, key = lambda x: x['len'], reverse = True)
for obj in sorted_dict_list:
    print(obj['len'], " >> ", obj['source'])

195919  >>  https://python.langchain.com/docs/integrations/retrievers/activeloop
150150  >>  https://python.langchain.com/docs/use_cases/question_answering/citations
74417  >>  https://python.langchain.com/docs/integrations/document_loaders/dropbox
73698  >>  https://python.langchain.com/docs/integrations/vectorstores/timescalevector
66009  >>  https://python.langchain.com/docs/integrations/document_loaders/docugami
65474  >>  https://python.langchain.com/docs/use_cases/code_understanding
63441  >>  https://python.langchain.com/docs/integrations/tools/google_lens
62074  >>  https://python.langchain.com/docs/integrations/chat/ollama
60867  >>  https://python.langchain.com/docs/expression_language/cookbook/prompt_size
59026  >>  https://python.langchain.com/docs/guides/debugging
57438  >>  https://python.langchain.com/docs/integrations/llms/ollama
54666  >>  https://api.python.langchain.com/en/stable/langchain_api_reference.html
53959  >>  https://python.langchain.com/docs/modules/agents

剔除输出的文字块和图像base64部份：

In [8]:
langchain_new_docs = [{
        "content": remove_text_blocks(remove_base64(obj.page_content)), 
        "source": obj.source, 
        "title": obj.title, 
        "description": obj.description
    } for obj in docs]
newDocs = sort_list_by_len(langchain_new_docs, "content")
for obj in newDocs:
    print(obj[1], " >> ", obj[0]['source'])

54666  >>  https://api.python.langchain.com/en/stable/langchain_api_reference.html
30920  >>  https://python.langchain.com/docs/integrations/vectorstores/timescalevector
29616  >>  https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory
28836  >>  https://docs.smith.langchain.com/tracing/tracing-faq
28812  >>  https://python.langchain.com/docs/integrations/tools/google_lens
26134  >>  https://python.langchain.com/docs/langgraph
24125  >>  https://python.langchain.com/docs/langserve
23943  >>  https://docs.smith.langchain.com/evaluation/quickstart
23617  >>  https://python.langchain.com/docs/integrations/toolkits/github
22722  >>  https://python.langchain.com/docs/integrations/vectorstores/redis
22675  >>  https://python.langchain.com/docs/guides/safety/amazon_comprehend_chain
22587  >>  https://docs.smith.langchain.com/cookbook/testing-examples/comparing-runs
22480  >>  https://python.langchain.com/docs/get_started/quickstart
22014  >>  https://python.

In [9]:
content = search_from_list(langchain_new_docs, 'question_answering/citations', 'source')
for obj in content:
    print("length:", len(obj["content"]), ">> \n\n")
    print(obj["content"])

length: 15343 >> 


Citations | 🦜️🔗 Langchain

[Skip to main content](#__docusaurus_skipToContent_fallback)# Citations

How can we get a model to cite which parts of the source documents it
referenced in its response?

To explore some techniques for extracting citations, let’s first create
a simple RAG chain. To start we’ll just retrieve from Wikipedia using
the
[WikipediaRetriever](https://api.python.langchain.com/en/latest/retrievers/langchain_community.retrievers.wikipedia.WikipediaRetriever.html).

## Setup​

First we’ll need to install some dependencies and set environment vars
for the models we’ll be using.

```python
%pip install -qU langchain langchain-openai langchain-anthropic langchain-community wikipedia
```

```python
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()
os.environ["ANTHROPIC_API_KEY"] = getpass.getpass()

# Uncomment if you want to log to LangSmith
# os.environ["LANGCHAIN_TRACING_V2"] = "true
# os.environ["LANGCHAIN_API_KEY"] = getpas

#### 切分字符串：按大小截断

In [107]:
texts = [obj["content"] for obj in newDocs]

In [109]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
documents = text_splitter.create_documents(texts)
len(documents)

5253

In [129]:
sorted_documents = sort_list_by_len(documents, "page_content")
#sorted_documents[1000]
for obj in sorted_documents:
    print(obj[1])

1200
1200
1200
1200
1200
1200
1200
1200
1200
1200
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1196
1196
1196
1196


#### 切分字符串：默认按Markdown段落，如果仍然超长就截断切分

In [111]:
texts = [obj["content"] for obj in newDocs]

In [10]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

In [48]:
headers_to_split_on = [
    ("#", "H1"),
    ("##", "H2"),
    ("###", "H3"),
]

In [49]:
# [('content': 'xxxx', 'source': 'yyyy', ...},  __len__)]
newDocs[1000][0]

{'content': 'SerpAPI | 🦜️🔗 Langchain\n\n[Skip to main content](#__docusaurus_skipToContent_fallback)# SerpAPI\n\nThis page covers how to use the SerpAPI search APIs within LangChain.\nIt is broken into two parts: installation and setup, and then references to the specific SerpAPI wrapper.\n\n## Installation and Setup\u200b\n\n- Install requirements with `pip install google-search-results`\n\n- Get a SerpAPI api key and either set it as an environment variable (`SERPAPI_API_KEY`)\n\n## Wrappers\u200b\n\n### Utility\u200b\n\nThere exists a SerpAPI utility which wraps this API. To import this utility:\n\n```python\nfrom langchain_community.utilities import SerpAPIWrapper\n```\n\nFor a more detailed walkthrough of this wrapper, see [this notebook](/docs/integrations/tools/serpapi).\n\n### Tool\u200b\n\nYou can also easily load this wrapper as a Tool (to use with an Agent).\nYou can do this with:\n\n```python\nfrom langchain.agents import load_tools\ntools = load_tools(["serpapi"])\n```\n\n

In [83]:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on = headers_to_split_on)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 200,
    length_function = len,
    is_separator_regex = False,
)

md_header_splits = []
page_index = 0

for obj in newDocs:
    final_texts = []
    # step1: 按markdown标题切分
    first_texts = markdown_splitter.split_text(obj[0]["content"])
    # step2: 超长的继续按文本长度切分
    #
    for x in first_texts:
        page_index += 1
        # print(x.metadata)
        
        if(len(x.page_content) < 2000):
            final_texts.append({
                "page_index": page_index,
                "H1": x.metadata.get("H1", ""),
                "H2": x.metadata.get("H2", ""),
                "H3": x.metadata.get("H3", ""),
                "content": x.page_content
            })
            # print(" < 2000, 直接加入final_texts", end = ": ")
            # print(len(final_texts))
        else:
            texts_split = [{
                "page_index": page_index,
                "H1": x.metadata.get("H1", ""),
                "H2": x.metadata.get("H2", ""),
                "H3": x.metadata.get("H3", ""),
                "content": obj.page_content
            } for obj in text_splitter.create_documents([x.page_content])]
            final_texts += texts_split
            # print(" >= 2000, 切分后加入final_texts", end = ": ")
            # print(len(final_texts))
    #
    md_header_splits += [{
        "page_index": chunk["page_index"],
        "title": obj[0]["title"],
        "description": obj[0]["description"],
        "source": obj[0]["source"],
        "H1": chunk["H1"],
        "H2": chunk["H2"],
        "H3": chunk["H3"],
        "content": chunk["content"]} for chunk in final_texts]
    #
    print(len(md_header_splits), end = " ")
    

31 55 78 98 116 150 182 198 221 242 260 275 299 328 344 357 375 388 424 441 452 466 481 492 504 520 535 548 559 571 582 592 603 613 623 635 654 665 674 685 705 720 731 743 751 770 786 795 807 821 837 849 865 875 882 900 913 926 935 945 954 961 970 983 996 1007 1016 1027 1035 1042 1051 1063 1079 1093 1101 1113 1120 1129 1138 1146 1153 1167 1176 1191 1199 1211 1222 1239 1248 1258 1265 1275 1288 1297 1305 1314 1321 1329 1336 1342 1350 1357 1365 1372 1377 1398 1408 1413 1421 1430 1438 1450 1456 1467 1472 1478 1487 1495 1502 1507 1516 1525 1531 1537 1547 1554 1560 1570 1577 1583 1588 1596 1602 1611 1618 1626 1633 1638 1643 1655 1661 1669 1674 1682 1692 1698 1704 1711 1716 1724 1730 1735 1743 1752 1762 1768 1775 1781 1786 1794 1799 1806 1811 1817 1824 1835 1840 1850 1858 1863 1873 1878 1882 1888 1895 1902 1910 1917 1924 1932 1939 1949 1956 1961 1969 1980 1986 1994 2003 2009 2015 2024 2028 2035 2045 2052 2057 2064 2072 2079 2088 2095 2101 2106 2113 2120 2129 2136 2142 2150 2162 2167 2173 2183

In [84]:
md_header_splits[25]

{'page_index': 1,
 'title': 'langchain 0.1.4 — 🦜🔗 LangChain 0.1.4',
 'description': '',
 'source': 'https://api.python.langchain.com/en/stable/langchain_api_reference.html',
 'H1': '',
 'H2': '',
 'H3': '',
 'content': "Retriever that combines embedding similarity with recency in retrieving values.  \nretrievers.web_research.LineList\nList of questions.  \nretrievers.web_research.QuestionListOutputParser\nOutput parser for a list of numbered questions.  \nretrievers.web_research.SearchQueries\nSearch queries to research for the user's goal.  \nretrievers.web_research.WebResearchRetriever\nGoogle Search API retriever.  \nFunctions¶  \nretrievers.document_compressors.chain_extract.default_get_input(...)\nReturn the compression chain input.  \nretrievers.document_compressors.chain_filter.default_get_input(...)\nReturn the compression chain input.  \nretrievers.self_query.deeplake.can_cast_to_float(string)\nCheck if a string can be cast to a float.  \nretrievers.self_query.milvus.process_v

In [85]:
sorted_documents = sort_list_by_len(md_header_splits, "content")
for obj in sorted_documents:
    print(obj[1])

2000
2000
2000
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1999
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1998
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1997
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1996
1995
1995
1995
1995
1995
1995
1995
1995
1995
1995
1995
1995
1995
1995
1995
1995
1995
1994
1994
1994
1994
1994
1994
1994
1994
1994
1994
1994
1993
1993
1993
1993
1993
1993
1993
1993
1993
1993
1993
1993
1992
1992
1992
1992
1992
1992
1992
1992
1992
1992
1992
1992
1992
1992
1991
1991
1991
1991
1991
1991
1991
1990
1990
1990
1990
1990
1990
1990
1990
1990
1990
1990
1989
1989
1989
1989
1989
1989
1989
1989
1989
1989
1989
1989
1989
1989
1988
1988
1988
1988
1987
1987


In [96]:
print(sorted_documents[5000][0]["page_index"])

388


In [156]:
#print(md_header_splits[1007].page_content)
for obj in (search_from_dict_list(md_header_splits, "lancedb", "page_content")):
    print(obj.page_content)

Dependents | 🦜️🔗 Langchain  
[Skip to main content](#__docusaurus_skipToContent_fallback)# Dependents  
Dependents stats for `langchain-ai/langchain`  
[](https://github.com/langchain-ai/langchain/network/dependents)
[](https://github.com/langchain-ai/langchain/network/dependents)
[](https://github.com/langchain-ai/langchain/network/dependents)  
[update: `2023-12-08`; only dependent repositories with Stars > 100]  
| Repository | Stars |
| ---- | ---- |
| AntonOsika/gpt-engineer | 46514 |
| imartinez/privateGPT | 44439 |
| LAION-AI/Open-Assistant | 35906 |
| hpcaitech/ColossalAI | 35528 |
| moymix/TaskMatrix | 34342 |
| geekan/MetaGPT | 31126 |
| streamlit/streamlit | 28911 |
| reworkd/AgentGPT | 27833 |
| StanGirard/quivr | 26032 |
| OpenBB-finance/OpenBBTerminal | 24946 |
| run-llama/llama_index | 24859 |
| jmorganca/ollama | 20849 |
| openai/chatgpt-retrieval-plugin | 20249 |
| chatchat-space/Langchain-Chatchat | 19305 |
| mindsdb/mindsdb | 19172 |
| PromtEngineer/localGPT | 17528 