In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Load environment variables from the nearest .env file found by find_dotenv();
# override=True lets values from the file replace variables that are already set.
# (Presumably this is where the OpenAI credentials used below come from; confirm.)
load_dotenv(find_dotenv(), override=True)

True

# 提取单个网页

## 纯静态页面

In [5]:
from langchain.document_loaders import WebBaseLoader

In [6]:
# Automatically extract only the plain-text portion of the page.
docs = WebBaseLoader("http://www.hongmeng-info.com/").load()

创建RAG：

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [8]:
# Cut the loaded page into chunks of up to 500 characters, with 200 characters
# of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings, index the vectors in FAISS, and
# expose the index through the retriever interface.
embeddings = OpenAIEmbeddings()
vector = FAISS.from_documents(documents, embeddings)
retriever = vector.as_retriever()

In [9]:
from langchain.tools.retriever import create_retriever_tool

# Wrap the retriever as a tool. The name and description are the text an agent
# sees when deciding whether to call it (both are Chinese: "Hongmeng" and an
# instruction to always use this tool for questions about Hongmeng).
retriever_tool = create_retriever_tool(
    retriever,
    name="鸿蒙",
    description="搜索关于鸿蒙的信息。询问任何关于鸿蒙的信息，你都必须使用这个工具!",
)

In [10]:
# Call the tool directly, without an agent, to inspect what it retrieves.
# The query asks: "What projects has Hongmeng done?"
retriever_tool.invoke("鸿蒙做过哪些项目")

'广州鸿蒙信息科技有限公司\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nToggle navigation\n\n\n\n\n\n\n\n\n\n\n首页\n互联网应用\n信息化服务\n电子税务\n招聘\n联系我们\n\n\n\n\n\n\n\n\n\n\n\n\n\n稳健、高效\r\n                        \t人性化的电子竞价系统,\r\n                        \t千亿级股权交易平台实践检验\n鸿蒙在线竞价系统，微信，APP，智能终端，多媒体控制.\n\n了解更多>>\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPrevious\n\n\n\nNext\n\n\n\n\n\n会员平台及CRM管理\n社群运营的基础架构系统，支持复杂权益，积分管理，“会员卡”系统，权益/积分商城应用，社群用户关系管理，多种智能行为数据模型，面向客户群/社群运营者提供有效的解决方案.\n详情  »\n\n\n互联网运营平台\n核心组件系统支撑O2O类运营体系，订单系统，合作商/供应商/渠道商管理及结算系统，客服系统，营销支撑与分析，活动及传播系统，为运营提供有效灵活的支撑.\n详情  »\n\n行业客户总监\r\n                    \n\n职位要求\n\r\n                    \t1、税务/互联网/软件业从业经验，熟悉相关行业\r\n\t\t\t\t\t\t2、具备产品规划与设计视角\r\n\t\t\t\t\t\t3、行业市场开拓\r\n\t\t\t\t\t\t4、丰富的商业运作经验\r\n\t\t\t\t\t\t5、责任心与坦诚\r\n\t\t\t\t\t\t6、非常欢迎有创业经历的合作伙伴\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n联系我们\n\n\n\n\n\n\n广州鸿蒙信息科技有限公司\r\n                   \t地址： 广州市海珠区新港东路中洲中心一楼\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0东门A7、A8号                                       \r\n                  

## 有延迟加载的页面

### 直接使用 WebBaseLoader 无法获得内容

In [11]:
# Load a JavaScript-rendered page with the same static loader. As the output
# shows, only the page's "JavaScript is not enabled" fallback text comes back.
WebBaseLoader("https://maas.aminer.cn/dev/howuse/introduction").load()

[Document(page_content="智谱AI开放平台We're sorry but 智谱AI开放平台 doesn't work properly without JavaScript enabled. Please enable it to continue.", metadata={'source': 'https://maas.aminer.cn/dev/howuse/introduction', 'title': '智谱AI开放平台', 'description': '大模型开放平台-新一代国产自主通用AI开放平台，致力于将产品技术与行业场景双轮驱动的中国先进的认知智能技术和千行百业应用相结合，构建更高精度、高效率、通用化的AI开发新模式，实现智谱大模型的产业化，将AI的好处带给每个人。', 'language': 'zh-cn'})]

### 使用 PlayWrightBrowserToolkit

In [12]:
from langchain_community.agent_toolkits import PlayWrightBrowserToolkit
from langchain_community.tools.playwright.utils import (
    create_async_playwright_browser,  # A synchronous browser is available, though it isn't compatible with jupyter.
)

In [13]:
# Needed only inside Jupyter: the notebook already runs its own event loop, so
# asyncio is patched here to allow the nested use made by the async browser tools.
import nest_asyncio
nest_asyncio.apply()

In [14]:
# Start an async Playwright browser and build the toolkit around it.
async_browser = create_async_playwright_browser()
toolkit = PlayWrightBrowserToolkit.from_browser(async_browser=async_browser)
# All browser tools the toolkit offers; the bare name on the last line displays them.
tools = toolkit.get_tools()
tools

[ClickTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/Users/xuehongwei/Library/Caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium> version=121.0.6167.57>),
 NavigateTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/Users/xuehongwei/Library/Caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium> version=121.0.6167.57>),
 NavigateBackTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/Users/xuehongwei/Library/Caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium> version=121.0.6167.57>),
 ExtractTextTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/Users/xuehongwei/Library/Caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium> version=121.0.6167.57>),
 ExtractHyperlinksTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/Users/xuehongwei/Libr

In [15]:
# Index the toolkit's tools by their `name` attribute so individual tools can
# be picked out and called by hand.
tools_by_name = dict((browser_tool.name, browser_tool) for browser_tool in tools)

get_elements_tool = tools_by_name["get_elements"]
navigate_tool = tools_by_name["navigate_browser"]

In [16]:
# Display the tool's repr to confirm the lookup returned the expected tool.
get_elements_tool

GetElementsTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/Users/xuehongwei/Library/Caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium> version=121.0.6167.57>)

In [17]:
# Open the JavaScript-rendered page in the Playwright browser; the tool
# returns a status message string.
await navigate_tool.arun(
    {"url": "https://maas.aminer.cn/dev/howuse/introduction"}
)

'Navigating to https://maas.aminer.cn/dev/howuse/introduction returned status code 200'

In [18]:
# On the page opened by the navigate tool, select elements matching the CSS
# selector `.how-use-content` and request their `innerText` attribute.
await get_elements_tool.arun(
    {"selector": ".how-use-content", "attributes": ["innerText"]}
)

'[{"innerText": "介绍\\n\\n智谱AI 开放平台提供一系列具有不同功能和定价的大模型，包括通用大模型、超拟人大模型、图像大模型、向量大模型等，并且支持使用您的私有数据对模型进行微调。\\n\\n2024年01月16日，我们在「智谱AI技术开放日(ZHIPU DevDay)」推出新一代基座大模型 GLM-4。\\n\\n资源\\n查看模型接口文档\\n体验模型能力体验中心\\n查看您的 API Keys\\n构建知识库\\n创建大模型应用应用中心\\n关键概念\\nGLM\\nGLM 全名 General Language Model ，是一款基于自回归填空的预训练语言模型。ChatGLM 系列模型，支持相对复杂的自然语言指令，并且能够解决困难的推理类问题。该模型配备了易于使用的 API 接口，允许开发者轻松将其融入各类应用，广泛应用于智能客服、虚拟主播、聊天机器人等诸多领域。\\nEmbedding\\nEmbedding 是一种将数据（如文本）转化为向量形式的表示方法，这种表示方式确保了在某些特定方面相似的数据在向量空间中彼此接近，而与之不相关的数据则相距较远。通过将文本字符串转换为向量，使得数据能够有效用于搜索、聚类、推荐系统、异常检测和分类等应用场景。\\nToken\\nToken 是模型用来表示自然语言文本的基本单位，可以直观的理解为“字”或“词”；通常 1 个中文词语、1 个英文单词、1 个数字或 1 个符号计为 1 个token。 一般情况下 ChatGLM 系列模型中 token 和字数的换算比例约为 1:1.6 ，但因为不同模型的分词不同，所以换算比例也存在差异，每一次实际处理 token 数量以模型返回为准，您可以从返回结果的 usage 中查看。"}]'

In [19]:
# The `current_webpage` tool reports the URL the browser is currently on;
# an agent can use it to recall which page it is looking at. It takes no input.
await tools_by_name["current_webpage"].arun({})

'https://maas.aminer.cn/dev/howuse/introduction'

### 在智能体中使用动态读取的网页内容

In [30]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System message for the structured-chat agent.
# - `{tools}` and `{tool_names}` are template variables left for the prompt
#   template to fill in; `{{` / `}}` are escaped literal braces.
# - Every fenced example is closed, including the "Final Answer" one, so the
#   model is always shown a complete ```...``` blob to imitate.
system = '''
Respond to the human as helpfully and accurately as possible. 
You have access to the following tools:

{tools}

Use a json blob to specify a tool by providing an action key (tool name) and an action_input key (tool input).

Valid "action" values: "Final Answer" or {tool_names}

Provide only ONE action per $JSON_BLOB, as shown:

```
{{
  "action": $TOOL_NAME,
  "action_input": $INPUT
}}
```

Follow this format:

Question: input question to answer
Thought: consider previous and subsequent steps
Action:
```
$JSON_BLOB
```
Observation: action result
... (repeat Thought/Action/Observation N times)
Thought: I know what to respond
Action:
```
{{
  "action": "Final Answer",
  "action_input": "Final response to human"
}}
```

Begin! 
ALWAYS respond with a valid json blob of a single action;
Always think and respond with Chinese.

Use tools if necessary. 
Respond directly if appropriate. 
Format is Action:```$JSON_BLOB```then Observation.

'''

# Human message: the user's question (`{input}`) followed by the agent's
# running Thought/Action/Observation history (`{agent_scratchpad}`).
human = '''{input}

{agent_scratchpad}

(reminder to respond in a JSON blob no matter what)
'''

# Message sequence for the chat prompt: the system instructions, an optional
# slot for earlier conversation turns, then the human turn.
prompt_messages = [
    ("system", system),
    MessagesPlaceholder("chat_history", optional=True),
    ("human", human),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [31]:
from langchain.agents import AgentExecutor, create_structured_chat_agent
from langchain_openai import OpenAI
# Alternative kept for reference: pull the stock structured-chat prompt from
# LangChain Hub instead of using the custom `prompt` built in the previous cell.
# from langchain import hub

# prompt = hub.pull("hwchase17/structured-chat-agent")
# Stop generation as soon as the model writes "Observation", so it does not go
# on to write a tool result itself.
llm = OpenAI().bind(stop=["Observation"])
# The agent is given every Playwright browser tool from the toolkit.
agent = create_structured_chat_agent(llm, tools, prompt)
# verbose=True prints the Thought/Action/Observation trace while running.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [32]:
# Ask a question that points at a JavaScript-rendered page; the agent chooses
# which browser tools to call. The question asks, in Chinese, which models
# Zhipu offers according to the given URL.
await agent_executor.ainvoke({"input": "请参考 https://maas.aminer.cn/dev/howuse/model，智谱有哪些模型？"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m 
Thought: I should use the extract_hyperlinks tool to retrieve all hyperlinks on the given webpage, and then find the URLs for each model page.
Action:
```
{
  "action": "extract_hyperlinks",
  "action_input": {
    "absolute_urls": true
  }
}
```
[0m[36;1m[1;3m["http://maas.aminer.cn/", "http://maas.aminer.cn/", "http://maas.aminer.cn/pricing", "http://maas.aminer.cn/overview", "http://maas.aminer.cn/dev/howuse/introduction", "http://maas.aminer.cn/dev/openpower", "http://maas.aminer.cn/knowledge", "http://maas.aminer.cn/appcenter", "javascript:void(0)", "http://maas.aminer.cn/partner", "https://appon94c0bw9146.pc.xiaoe-tech.com/page/4807585", "https://appon94c0bw9146.pc.xiaoe-tech.com/page/4807581", "https://appon94c0bw9146.pc.xiaoe-tech.com/page/4807584", "https://www.zhipuai.cn/aboutus", "http://maas.aminer.cn/overview", "http://maas.aminer.cn/online-book", "http://maas.aminer.cn/dev/api#language", "http://maas.aminer

{'input': '请参考 https://maas.aminer.cn/dev/howuse/model，智谱有哪些模型？',
 'output': '智谱的模型有：ChatGLM-6B, GLM-130B, CodeGeeX, CogView, CogVideo。'}

## 获取 OpenAI 的 API

In [33]:
# Try the same navigate tool against the OpenAI models documentation page;
# the returned message includes the HTTP status code.
await navigate_tool.arun(
    {"url": "https://platform.openai.com/docs/models"}
)

'Navigating to https://platform.openai.com/docs/models returned status code 403'

<div class="alert alert-warning">
    <b>⚠️ 注意</b><br>
    访问页面时出现 <b>403</b> 错误，说明该页面所在的服务器拒绝机器访问，需要更强的网页爬取手段才能自动获得内容。
</div>

# 根据网页内的链接循环获取批量网页

In [49]:
from bs4 import BeautifulSoup, SoupStrainer
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_core.utils.html import PREFIXES_TO_IGNORE_REGEX, SUFFIXES_TO_IGNORE_REGEX
import re

def simple_extractor(html: "str | BeautifulSoup") -> str:
    """Return the text of an HTML page with runs of blank lines collapsed.

    This notebook uses the function in two places that pass different inputs:
    as ``RecursiveUrlLoader``'s ``extractor`` (raw HTML string) and as
    ``SitemapLoader``'s ``parsing_function`` (an already-parsed soup object).
    Both are accepted.

    Args:
        html: Raw HTML markup, or an already-parsed ``BeautifulSoup`` tree.

    Returns:
        The document text, with every run of two or more newlines reduced to
        exactly two and leading/trailing whitespace stripped.
    """
    # An already-parsed tree must not be fed back into BeautifulSoup(...):
    # bs4 would treat it as a file-like object and try to call a non-existent
    # `.read`, raising TypeError. Reuse the tree as-is instead.
    if isinstance(html, BeautifulSoup):
        soup = html
    else:
        soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

# Regex handed to the loader for finding links in each fetched page.
# Drop trailing / to avoid duplicate pages.
crawl_link_regex = (
    f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
    r"(?:[\#'\"]|\/[\#'\"])"
)

# Crawl outward from the tax Q&A index page, following links up to three
# levels deep and turning each page into text with simple_extractor.
loader = RecursiveUrlLoader(
    url="https://www.shui5.cn/article/ShuiWuWenDa/",
    max_depth=3,
    extractor=simple_extractor,
    prevent_outside=True,
    use_async=True,
    timeout=600,
    link_regex=crawl_link_regex,
    check_response_status=True,
)
docs = loader.load()

In [54]:
# Inspect the metadata attached to the first crawled document.
docs[0].metadata

{'source': 'https://www.shui5.cn/article/ShuiWuWenDa/',
 'title': '税务问答_税 屋——第一时间传递财税政策法规！ ',
 'description': '税务问答、税务疑难问题、税务答疑、税收征管、税务实务、税收问题、税收咨询、税务咨询',
 'language': None}

# 根据 SiteMap 获取批量网页

In [62]:
from langchain_community.document_loaders.sitemap import SitemapLoader

def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
    """Build a document's metadata from its sitemap entry and parsed page.

    Args:
        meta: The sitemap entry for the page; must contain a ``"loc"`` key.
        soup: The parsed HTML of the page.

    Returns:
        A dict with ``source`` (from ``meta["loc"]``), ``title``,
        ``description`` and ``language``, followed by every key of ``meta``.
        Page-derived fields fall back to ``""`` when the tag or attribute is
        missing. Keys in ``meta`` override the derived ones on a name clash.

    Raises:
        KeyError: If ``meta`` has no ``"loc"`` entry.
    """
    title_tag = soup.find("title")
    description_tag = soup.find("meta", attrs={"name": "description"})
    html_tag = soup.find("html")

    extracted = {"source": meta["loc"]}

    if title_tag:
        extracted["title"] = title_tag.get_text()
    else:
        extracted["title"] = ""

    if description_tag:
        extracted["description"] = description_tag.get("content", "")
    else:
        extracted["description"] = ""

    if html_tag:
        extracted["language"] = html_tag.get("lang", "")
    else:
        extracted["language"] = ""

    # Sitemap fields go in last, so they win over the derived values above.
    extracted.update(meta)
    return extracted

def load_sitemap(url, filter_urls):
    """Load the pages listed in a sitemap.

    Args:
        url: Location of the sitemap, passed to ``SitemapLoader``.
        filter_urls: Forwarded unchanged as ``SitemapLoader``'s
            ``filter_urls`` argument.

    Returns:
        The result of ``SitemapLoader(...).load()``.
    """
    # Restrict what BeautifulSoup parses to the listed element names.
    kept_elements = SoupStrainer(
        name=("article", "title", "html", "lang", "content")
    )
    sitemap_loader = SitemapLoader(
        url,
        filter_urls=filter_urls,
        parsing_function=simple_extractor,
        default_parser="lxml",
        bs_kwargs={"parse_only": kept_elements},
        meta_function=metadata_extractor,
    )
    return sitemap_loader.load()

In [None]:
# Fetch the LangChain documentation via its sitemap, keeping only URLs that
# match the given filter.
langchain_docs = load_sitemap(
    "https://python.langchain.com/sitemap.xml",
    ["https://python.langchain.com/"]
)