# RAG

## 环境准备

In [1]:
from Utils import gpt35, gpt4, tongyi, langchain_docs_extractor, WebPageObj

RUNNABLE_BASE_URL:  http://localhost:8000


In [None]:
# Ad-hoc query to the gpt35 helper imported from Utils; it is not part of the
# document ingestion that follows. The prompt (in Chinese) asks whether
# JupyterLab caches Python code, because edited modules were not being reloaded.
gpt35("""我在jupyterlab中无法加载修改过的python代码，是有缓存吗？""")

## 从langchain官网收集文档

<div class="alert alert-warning">
<b>兼容性问题：</b><br/>
    较新的 BeautifulSoup 版本是 4.12.3，与 Python 3.10 的兼容性较好；在 Python 3.9 或 3.12 环境下可能出现找不到 lxml 或 html5lib 解析器的问题。
</div>

In [5]:
from bs4 import BeautifulSoup, SoupStrainer
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_core.utils.html import PREFIXES_TO_IGNORE_REGEX, SUFFIXES_TO_IGNORE_REGEX
import re

In [6]:
# Only needed when running inside Jupyter.
# NOTE(review): presumably required because the kernel already runs an event
# loop and the async loaders used in this notebook need to run inside it — confirm.
import nest_asyncio
nest_asyncio.apply()

### 提取langchain的Docs文档

In [7]:
def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
    """Build the metadata dict for one sitemap page.

    Args:
        meta: Sitemap entry for the page; must contain the key ``"loc"``.
        soup: Parsed page, queried for its ``<title>``, its
            ``<meta name="description">`` and its ``<html>`` tag.

    Returns:
        A dict with ``source``, ``title``, ``description`` and ``language``,
        followed by every entry of ``meta``. Because ``meta`` is merged last,
        a key in ``meta`` with one of those four names replaces the value
        derived from the page.

    Raises:
        KeyError: if ``meta`` has no ``"loc"`` entry.
    """
    title_tag = soup.find("title")
    description_tag = soup.find("meta", attrs={"name": "description"})
    html_tag = soup.find("html")

    page_info = {"source": meta["loc"]}

    # Each field falls back to an empty string when its tag is absent.
    if title_tag:
        page_info["title"] = title_tag.get_text()
    else:
        page_info["title"] = ""

    if description_tag:
        page_info["description"] = description_tag.get("content", "")
    else:
        page_info["description"] = ""

    if html_tag:
        page_info["language"] = html_tag.get("lang", "")
    else:
        page_info["language"] = ""

    # Merged last so the sitemap entry wins on any key collision.
    return {**page_info, **meta}

def load_langchain_docs():
    """Load the pages listed in the LangChain Python docs sitemap.

    Page text is produced by ``langchain_docs_extractor`` and page metadata by
    ``metadata_extractor``.

    Returns:
        The result of ``SitemapLoader.load()`` for the configured sitemap.
    """
    # Restrict HTML parsing to these tag names.
    tag_filter = SoupStrainer(
        name=("article", "title", "html", "lang", "content")
    )
    sitemap_loader = SitemapLoader(
        "https://python.langchain.com/sitemap.xml",
        filter_urls=["https://python.langchain.com/"],
        parsing_function=langchain_docs_extractor,
        default_parser="lxml",
        bs_kwargs={"parse_only": tag_filter},
        meta_function=metadata_extractor,
    )
    return sitemap_loader.load()

In [5]:
# Crawl the LangChain docs sitemap. This is network-bound: the run recorded
# below fetched 1180 pages in just under 8 minutes.
langchain_docs = load_langchain_docs()

Fetching pages: 100%|##########| 1180/1180 [07:51<00:00,  2.50it/s]


### 提取langchain的API文档

In [8]:
def simple_extractor(html: str) -> str:
    """Convert an HTML document to plain text.

    Args:
        html: Raw HTML markup of a page.

    Returns:
        The text of the page as parsed with the ``lxml`` parser, with every run
        of two or more newlines collapsed to exactly two and with leading and
        trailing whitespace stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

def load_api_docs():
    """Crawl the LangChain API reference and return the loaded pages.

    The crawl starts at the ``stable`` API reference index, follows links
    matched by ``link_regex`` up to ``max_depth=8``, and converts each page to
    plain text with ``simple_extractor``.

    Returns:
        The result of ``RecursiveUrlLoader.load()`` for the crawl.
    """
    # NOTE(review): the root URL is a single .html page and prevent_outside=True;
    # depending on the loader version this may limit the crawl to very few
    # pages — verify the number of documents returned.
    return RecursiveUrlLoader(
        url="https://api.python.langchain.com/en/stable/langchain_api_reference.html",
        max_depth=8,
        extractor=simple_extractor,
        prevent_outside=True,
        use_async=True,
        timeout=600,
        # Drop trailing / to avoid duplicate pages.
        link_regex=(
            f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
            r"(?:[\#'\"]|\/[\#'\"])"
        ),
        check_response_status=True,
        # The crawl root lives under /en/stable/, so the exclusions must name
        # that tree to have any effect. The /en/latest/ prefixes are kept so
        # that nothing excluded before becomes crawlable now.
        exclude_dirs=(
            "https://api.python.langchain.com/en/stable/_sources",
            "https://api.python.langchain.com/en/stable/_modules",
            "https://api.python.langchain.com/en/latest/_sources",
            "https://api.python.langchain.com/en/latest/_modules",
        ),
    ).load()

In [9]:
# Crawl the LangChain API reference (network-bound; crawl settings live in load_api_docs).
api_docs = load_api_docs()

### 提取langsmith的docs文档

In [8]:
def load_langsmith_docs():
    """Crawl the LangSmith documentation site and return the loaded pages.

    The crawl starts at the site root, follows links matched by the href
    pattern below up to ``max_depth=8``, and converts each page to plain text
    with ``simple_extractor``.

    Returns:
        The result of ``RecursiveUrlLoader.load()`` for the crawl.
    """
    # Drop trailing / to avoid duplicate pages.
    href_pattern = (
        f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
        r"(?:[\#'\"]|\/[\#'\"])"
    )
    crawl_options = {
        "url": "https://docs.smith.langchain.com/",
        "max_depth": 8,
        "extractor": simple_extractor,
        "prevent_outside": True,
        "use_async": True,
        "timeout": 600,
        "link_regex": href_pattern,
        "check_response_status": True,
    }
    site_loader = RecursiveUrlLoader(**crawl_options)
    return site_loader.load()

In [13]:
# Crawl the LangSmith docs site (network-bound). The warning output recorded
# below points at the BeautifulSoup(html, "lxml") call in simple_extractor.
langsmith_docs = load_langsmith_docs()

  soup = BeautifulSoup(html, "lxml")
  k = self.parse_starttag(i)


### 入库到duckdb

In [10]:
# Open the page store at data/web_pages.db (path is relative to the working directory).
# NOTE(review): the storage engine and schema are decided inside Utils.WebPageObj;
# the section heading says DuckDB — confirm there.
web_store = WebPageObj(db_name = "data/web_pages.db")

In [10]:
# Source site: https://python.langchain.com/
# Store each crawled docs page under the "langchain_docs" topic;
# one dot is printed per page as a lightweight progress indicator.
for d in langchain_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langchain_docs")

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [12]:
# Source site: https://api.python.langchain.com/
# Store every crawled API reference page under the "langchain_api_docs" topic,
# not just the first one, so this cell matches the other ingestion cells.
# One dot is printed per page as a lightweight progress indicator.
for d in api_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langchain_api_docs")

In [17]:
# Source site: https://docs.smith.langchain.com/
# Store each crawled LangSmith page under the "langsmith_docs" topic;
# one dot is printed per page as a lightweight progress indicator.
for d in langsmith_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langsmith_docs")

.....................................................................

In [13]:
# Read back what is stored under the API-docs topic as a sanity check.
# NOTE(review): the result is displayed with each row's full page_content;
# consider slicing or summarising it to keep the notebook output small.
web_store.read_data(topic = "langchain_api_docs")

[WebPage(source='https://api.python.langchain.com/en/stable/langchain_api_reference.html', topic='langchain_api_docs', title='langchain 0.1.4 — 🦜🔗 LangChain 0.1.4', description='', language='en', loc='', changefreq='', priority='', page_content='langchain 0.1.4 — 🦜🔗 LangChain 0.1.4\n\nLangChain\n\nCore\n\nCommunity\n\nExperimental\n\ngoogle-vertexai\n\nrobocorp\n\ngoogle-genai\n\nanthropic\n\nnvidia-trt\n\nopenai\n\nmistralai\n\ntogether\n\nnvidia-ai-endpoints\n\nexa\n\nPartner libs\n\ngoogle-vertexai\nrobocorp\ngoogle-genai\nanthropic\nnvidia-trt\nopenai\nmistralai\ntogether\nnvidia-ai-endpoints\nexa\n\nDocs\n\nToggle Menu\n\nPrev\nUp\nNext\n\nlangchain 0.1.4\nlangchain.agents\nClasses\nFunctions\n\nlangchain.callbacks\nClasses\n\nlangchain.chains\nClasses\nFunctions\n\nlangchain.embeddings\nClasses\nFunctions\n\nlangchain.evaluation\nClasses\nFunctions\n\nlangchain.hub\nFunctions\n\nlangchain.indexes\nClasses\nFunctions\n\nlangchain.memory\nClasses\nFunctions\n\nlangchain.model_labor