# RAG

## 环境准备

In [1]:
from Utils import gpt35, gpt4, tongyi, web_page_extractor, WebPageObj

RUNNABLE_BASE_URL:  http://localhost:8000


In [None]:
# Ad-hoc question sent through the gpt35 helper; the prompt asks (in Chinese) whether
# JupyterLab caches Python code, since modified code was not being reloaded.
gpt35("""我在jupyterlab中无法加载修改过的python代码，是有缓存吗？""")

## 建立langchain知识库

<div class="alert alert-warning">
<b>兼容性问题：</b><br/>
    较新的 BeautifulSoup 版本是 4.12.3，与 Python 3.10 的兼容性较好；在 Python 3.9 或 3.12 下使用时，可能出现找不到 lxml 或 html5lib 解析器的问题。
</div>

In [2]:
from bs4 import BeautifulSoup, SoupStrainer
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_core.utils.html import PREFIXES_TO_IGNORE_REGEX, SUFFIXES_TO_IGNORE_REGEX
import re

In [3]:
# Only needed when running inside Jupyter.
# NOTE(review): presumably this lets the loaders created with use_async=True run
# inside the notebook's already-running event loop — confirm.
import nest_asyncio
nest_asyncio.apply()

### 提取langchain文档

#### 提取langchain的Docs文档

In [7]:
def metadata_extractor(meta: dict, soup: BeautifulSoup) -> dict:
    """Build the metadata dict for one page.

    Args:
        meta: Metadata of the page entry; must contain the key ``"loc"``.
        soup: Parsed page that is searched for ``<title>``,
            ``<meta name="description">`` and ``<html>``.

    Returns:
        A dict holding ``source``, ``title``, ``description`` and ``language``
        (empty string when the matching tag is missing), followed by every
        key of ``meta``. On a name clash the value from ``meta`` wins.
    """
    title_tag = soup.find("title")
    description_tag = soup.find("meta", attrs={"name": "description"})
    html_tag = soup.find("html")

    extracted = {"source": meta["loc"]}
    extracted["title"] = title_tag.get_text() if title_tag else ""
    extracted["description"] = description_tag.get("content", "") if description_tag else ""
    extracted["language"] = html_tag.get("lang", "") if html_tag else ""

    # Merge the raw entry last so its values override the extracted ones.
    extracted.update(meta)
    return extracted

def load_langchain_docs():
    """Load the pages listed in the python.langchain.com sitemap.

    Returns:
        The result of ``SitemapLoader(...).load()`` for
        ``https://python.langchain.com/sitemap.xml``.
    """
    # Limit parsing to the names listed here.
    # NOTE(review): "meta" is not in this tuple, while metadata_extractor looks up
    # a <meta name="description"> tag — confirm the description is still found.
    strainer = SoupStrainer(
        name = ("article", "title", "html", "lang", "content")
    )
    loader = SitemapLoader(
        "https://python.langchain.com/sitemap.xml",
        filter_urls = ["https://python.langchain.com/"],
        parsing_function = web_page_extractor,
        default_parser = "lxml",
        bs_kwargs = {"parse_only": strainer},
        meta_function = metadata_extractor,
    )
    return loader.load()

In [5]:
# Load every page from the sitemap (slow: the recorded run fetched 1180 pages in about 8 minutes).
langchain_docs = load_langchain_docs()

Fetching pages: 100%|##########| 1180/1180 [07:51<00:00,  2.50it/s]


#### 提取langchain的API文档

In [8]:
def simple_extractor(html: str) -> str:
    """Return the text of an HTML page with blank-line runs collapsed.

    Args:
        html: Raw HTML markup.

    Returns:
        The text parsed with the ``lxml`` parser, where every run of two or
        more newlines is reduced to exactly two, stripped of leading and
        trailing whitespace.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()

def load_api_docs():
    """Crawl the LangChain API reference and return the loaded pages.

    The crawl starts at ``langchain_api_reference.html`` of the ``stable``
    docs and uses ``simple_extractor`` as the extractor.

    Returns:
        The result of ``RecursiveUrlLoader(...).load()``.
    """
    # One root for both the start page and the excluded directories, so the
    # exclusions always refer to the same docs version that is being crawled
    # (previously the start URL used "stable" while the exclusions used "latest").
    api_root = "https://api.python.langchain.com/en/stable"
    return RecursiveUrlLoader(
        url = f"{api_root}/langchain_api_reference.html",
        max_depth = 8,
        extractor = simple_extractor,
        prevent_outside = True,
        use_async = True,
        timeout = 600,
        # Drop trailing / to avoid duplicate pages.
        link_regex = (
            f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
            r"(?:[\#'\"]|\/[\#'\"])"
        ),
        check_response_status = True,
        exclude_dirs = (
            f"{api_root}/_sources",
            f"{api_root}/_modules",
        ),
    ).load()

In [9]:
# Run the API-reference crawl defined by load_api_docs.
api_docs = load_api_docs()

#### 提取langsmith的docs文档

In [8]:
def load_langsmith_docs():
    """Crawl https://docs.smith.langchain.com/ and return the loaded pages.

    The loader is configured with ``simple_extractor`` as its extractor.

    Returns:
        The result of ``RecursiveUrlLoader(...).load()``.
    """
    # Drop trailing / to avoid duplicate pages.
    href_pattern = (
        f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
        r"(?:[\#'\"]|\/[\#'\"])"
    )
    crawler = RecursiveUrlLoader(
        url = "https://docs.smith.langchain.com/",
        max_depth = 8,
        extractor = simple_extractor,
        prevent_outside = True,
        use_async = True,
        timeout = 600,
        link_regex = href_pattern,
        check_response_status = True,
    )
    return crawler.load()

In [13]:
# Run the LangSmith docs crawl defined by load_langsmith_docs.
langsmith_docs = load_langsmith_docs()

  soup = BeautifulSoup(html, "lxml")
  k = self.parse_starttag(i)


### 将文档入库到duckdb

#### 保存到duckdb

In [6]:
# WebPageObj store pointed at data/langchain.duckdb; used below via upsert() and read_data().
web_store = WebPageObj(db_name = "data/langchain.duckdb")

In [10]:
# https://python.langchain.com/
# Upsert each loaded page under the "langchain_docs" topic, printing one dot per page as progress.
for d in langchain_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langchain_docs")

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [12]:
# Upsert the first API page under the "langchain_api_docs" topic.
# NOTE(review): only api_docs[0] is stored, unlike the loops used for the other
# two sources — confirm that skipping the remaining pages is intentional.
web_store.upsert(api_docs[0], topic = "langchain_api_docs")

In [17]:
# https://docs.smith.langchain.com/
# Upsert each loaded page under the "langsmith_docs" topic, printing one dot per page as progress.
for d in langsmith_docs:
    print(".", end = "")
    web_store.upsert(d, topic = "langsmith_docs")

.....................................................................

#### 从duckdb查询

In [15]:
# Read back the pages that were stored under the "langchain_docs" topic.
docs = web_store.read_data(topic = "langchain_docs")

In [48]:
# Keep the pages whose source URL mentions "lancedb" (case-insensitive) and list their URLs.
lancedb_pattern = re.compile('lancedb', re.IGNORECASE)
result = [page for page in docs if lancedb_pattern.search(page.source)]
for obj in result:
    print(obj.source)

https://python.langchain.com/docs/integrations/providers/lancedb
https://python.langchain.com/docs/integrations/vectorstores/lancedb


### 拆分文本块

#### 加载文本向量

In [2]:
# Open the same DuckDB-backed store again and read the saved pages.
# NOTE(review): topic=None presumably means "all topics" — confirm against WebPageObj.read_data.
langchain_web = WebPageObj(db_name = "data/langchain.duckdb")
docs = langchain_web.read_data(topic = None)

#### 观察文本数据特征

<div class="alert alert-success">
<b>观察文本大小：</b><br/>
    有很多文档的尺寸超过了50K，最大的达到200K。<br>
    这主要是因为文档中包含了图片的base64编码。
</div>

In [3]:
# Record each page's source and content length, then print them from longest to shortest.
dict_list = [
    {"source": page.source, "len": len(page.page_content)}
    for page in docs
]
sorted_dict_list = sorted(
    dict_list,
    key = lambda entry: entry['len'],
    reverse = True,
)
for obj in sorted_dict_list:
    print(obj['len'], " >> ", obj['source'])

195919  >>  https://python.langchain.com/docs/integrations/retrievers/activeloop
150150  >>  https://python.langchain.com/docs/use_cases/question_answering/citations
74417  >>  https://python.langchain.com/docs/integrations/document_loaders/dropbox
73698  >>  https://python.langchain.com/docs/integrations/vectorstores/timescalevector
66009  >>  https://python.langchain.com/docs/integrations/document_loaders/docugami
65474  >>  https://python.langchain.com/docs/use_cases/code_understanding
63441  >>  https://python.langchain.com/docs/integrations/tools/google_lens
62074  >>  https://python.langchain.com/docs/integrations/chat/ollama
60867  >>  https://python.langchain.com/docs/expression_language/cookbook/prompt_size
59026  >>  https://python.langchain.com/docs/guides/debugging
57438  >>  https://python.langchain.com/docs/integrations/llms/ollama
54666  >>  https://api.python.langchain.com/en/stable/langchain_api_reference.html
53959  >>  https://python.langchain.com/docs/modules/agents

<div class="alert alert-success">
<b>下面对文本内容做瘦身：</b><br/>
    1、删除图像编码。<br>
    2、删除示例脚本中的输出内容。
</div>

In [123]:
import re

# Uniform accessor that works for both dicts and objects with attributes.
def getter(obj, key):
    """Return ``obj[key]`` for a dict or ``obj.key`` for any other object.

    Args:
        obj: A dict or an arbitrary object.
        key: Dict key or attribute name to look up.

    Returns:
        The value found, or ``""`` when the key/attribute is missing.
    """
    return obj.get(key, "") if isinstance(obj, dict) else getattr(obj, key, "")

# Strip base64 payloads.
def remove_base64(content):
    """Remove every ``base64,<payload>`` run from ``content``.

    The match covers the literal ``base64,`` followed by as many complete
    4-character base64 groups as possible and an optional padded final group.

    Args:
        content: Text to clean.

    Returns:
        ``content`` with all matches replaced by the empty string.
    """
    payload_re = re.compile(
        r'base64,(?:(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?)'
    )
    return payload_re.sub('', content)

# Strip the output blocks of example code.
def remove_text_blocks(content):
    """Remove every fenced block that opens with three backticks plus ``text``.

    A block is matched from the opening fence line up to the first closing
    fence that starts its own line; the body may span several lines.

    Args:
        content: Text to clean.

    Returns:
        ``content`` with all matched blocks replaced by the empty string.
    """
    fence_re = re.compile(r'```text\r?\n.*?\r?\n```', re.DOTALL)
    return fence_re.sub('', content)

# Regex search over a list of dicts / objects, matching on one key or attribute.
def search_from_list(dict_list, patten, key):
    """Return the items whose ``key`` value matches ``patten``.

    Args:
        dict_list: Iterable of dicts or objects.
        patten: Regular expression, applied case-insensitively with ``search``.
        key: Dict key or attribute name read through ``getter``.

    Returns:
        A new list with the matching items, in their original order.
    """
    return [
        item
        for item in dict_list
        if re.compile(patten, re.IGNORECASE).search(getter(item, key))
    ]

# Sort a list of dicts / objects by the length of one key or attribute.
def sort_list_by_len(dict_list, key):
    """Pair every item with the length of its ``key`` value, longest first.

    Args:
        dict_list: Iterable of dicts or objects.
        key: Dict key or attribute name read through ``getter``.

    Returns:
        A list of ``(item, length)`` tuples sorted by ``length`` in
        descending order.
    """
    sized = []
    for item in dict_list:
        sized.append((item, len(getter(item, key))))
    sized.sort(key = lambda pair: pair[1], reverse = True)
    return sized

In [124]:
# Build slimmed-down records: base64 payloads are removed first, then the ```text blocks.
langchain_new_docs = [
    {
        "content": remove_text_blocks(remove_base64(page.page_content)),
        "source": page.source,
        "title": page.title,
        "description": page.description,
    }
    for page in docs
]
# newDocs is a list of (record, content length) tuples, longest first.
newDocs = sort_list_by_len(langchain_new_docs, "content")
for obj in newDocs:
    print(obj[1], " >> ", obj[0]['source'])

54666  >>  https://api.python.langchain.com/en/stable/langchain_api_reference.html
30920  >>  https://python.langchain.com/docs/integrations/vectorstores/timescalevector
29616  >>  https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory
28836  >>  https://docs.smith.langchain.com/tracing/tracing-faq
28812  >>  https://python.langchain.com/docs/integrations/tools/google_lens
26134  >>  https://python.langchain.com/docs/langgraph
24125  >>  https://python.langchain.com/docs/langserve
23943  >>  https://docs.smith.langchain.com/evaluation/quickstart
23617  >>  https://python.langchain.com/docs/integrations/toolkits/github
22722  >>  https://python.langchain.com/docs/integrations/vectorstores/redis
22675  >>  https://python.langchain.com/docs/guides/safety/amazon_comprehend_chain
22587  >>  https://docs.smith.langchain.com/cookbook/testing-examples/comparing-runs
22480  >>  https://python.langchain.com/docs/get_started/quickstart
22014  >>  https://python.

In [125]:
# Show the slimmed-down page(s) whose source URL contains 'question_answering/citations'.
content = search_from_list(langchain_new_docs, 'question_answering/citations', 'source')
for obj in content:
    print("length:", len(obj["content"]), ">> \n\n")
    # Print the matched record's own text; the previous `res` was never defined in this notebook.
    print(obj["content"])

length: 15343 >> 


Citations | 🦜️🔗 Langchain

[Skip to main content](#__docusaurus_skipToContent_fallback)# Citations

How can we get a model to cite which parts of the source documents it
referenced in its response?

To explore some techniques for extracting citations, let’s first create
a simple RAG chain. To start we’ll just retrieve from Wikipedia using
the
[WikipediaRetriever](https://api.python.langchain.com/en/latest/retrievers/langchain_community.retrievers.wikipedia.WikipediaRetriever.html).

## Setup​

First we’ll need to install some dependencies and set environment vars
for the models we’ll be using.

```python
%pip install -qU langchain langchain-openai langchain-anthropic langchain-community wikipedia
```

```python
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()
os.environ["ANTHROPIC_API_KEY"] = getpass.getpass()

# Uncomment if you want to log to LangSmith
# os.environ["LANGCHAIN_TRACING_V2"] = "true
# os.environ["LANGCHAIN_API_KEY"] = getpas

#### 切分字符串：按大小截断

In [107]:
# newDocs holds (record, length) tuples as returned by sort_list_by_len, so unpack
# the record before reading its "content"; indexing the tuple with a string raises TypeError.
texts = [doc["content"] for doc, _ in newDocs]

In [109]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured for chunk_size=1200 with chunk_overlap=200, both measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
documents = text_splitter.create_documents(texts)
# Cell output: the number of chunks produced.
len(documents)

5253

In [129]:
# Pair each chunk with its page_content length (longest first) and print the lengths.
sorted_documents = sort_list_by_len(documents, "page_content")
# Spot-check a single entry with: sorted_documents[1000]
for obj in sorted_documents:
    print(obj[1])

1200
1200
1200
1200
1200
1200
1200
1200
1200
1200
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1199
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1198
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1197
1196
1196
1196
1196


#### 切分字符串：按Markdown段落

In [111]:
# newDocs holds (record, length) tuples as returned by sort_list_by_len, so unpack
# the record before reading its "content"; indexing the tuple with a string raises TypeError.
texts = [doc["content"] for doc, _ in newDocs]

In [112]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [113]:
# (markdown header prefix, label) pairs handed to MarkdownHeaderTextSplitter below.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [131]:
# Split every text on the configured markdown headers and collect all pieces in one flat list.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = []
for obj in texts:
    resp = markdown_splitter.split_text(obj)
    md_header_splits.extend(resp)
print(len(md_header_splits))

4756


In [135]:
# Pair each markdown split with its page_content length (longest first) and print the lengths.
sorted_documents = sort_list_by_len(md_header_splits, "page_content")
for obj in sorted_documents:
    print(obj[1])

47040
28837
28257
26102
23944
22588
20601
18843
17725
17473
16876
16357
15487
15092
15040
14715
13447
13446
12971
12875
12696
11753
11725
11047
10926
10145
10138
10102
10080
9396
8868
8673
8617
8564
8403
8136
8035
7891
7863
7685
7351
7333
7215
7193
7105
6968
6950
6868
6728
6602
6431
6271
6085
5969
5793
5587
5567
5477
5171
5139
5138
4971
4959
4847
4792
4740
4701
4506
4473
4468
4393
4384
4351
4298
4284
4274
4257
4255
4253
4239
4219
4219
4198
4182
4169
4169
4139
4098
4082
4076
4065
3994
3986
3969
3941
3926
3878
3833
3821
3813
3789
3787
3762
3761
3744
3712
3701
3685
3674
3655
3651
3633
3620
3609
3580
3573
3572
3543
3536
3522
3520
3489
3485
3483
3469
3432
3428
3428
3407
3407
3401
3393
3369
3359
3359
3358
3354
3353
3351
3338
3336
3335
3333
3328
3299
3283
3278
3274
3228
3211
3176
3171
3121
3112
3099
3092
3067
3065
3035
3014
3001
3001
3000
2975
2975
2971
2969
2966
2956
2950
2945
2937
2932
2932
2929
2901
2895
2891
2888
2884
2883
2881
2853
2848
2846
2840
2836
2831
2828
2801
2796
2795
2795
2786
2

In [153]:
# Inspect one split by its position in the length-sorted list; element [0] of each
# entry is the split itself, element [1] its length.
print(sorted_documents[1140][0].page_content)

Aleph Alpha | 🦜️🔗 Langchain  
[Skip to main content](#__docusaurus_skipToContent_fallback)# Aleph Alpha  
[The Luminous
series](https://docs.aleph-alpha.com/docs/introduction/luminous/) is a
family of large language models.  
This example goes over how to use LangChain to interact with Aleph Alpha
models  
```python
# Install the package
%pip install --upgrade --quiet  aleph-alpha-client
```  
```python
# create a new token: https://docs.aleph-alpha.com/docs/account/#create-a-new-token

from getpass import getpass

ALEPH_ALPHA_API_KEY = getpass()
```  
```python
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import AlephAlpha
```  
```python
template = """Q: {question}

A:"""

prompt = PromptTemplate(template=template, input_variables=["question"])
```  
```python
llm = AlephAlpha(
model="luminous-extended",
maximum_tokens=20,
stop_sequences=["Q:"],
aleph_alpha_api_key=ALEPH_ALPHA_API_KEY,
)
```  
```python
llm_chain = L

In [156]:
# Print every markdown split whose page_content mentions "lancedb" (case-insensitive).
# The helper defined in this notebook is search_from_list; search_from_dict_list does not exist.
for obj in search_from_list(md_header_splits, "lancedb", "page_content"):
    print(obj.page_content)

Dependents | 🦜️🔗 Langchain  
[Skip to main content](#__docusaurus_skipToContent_fallback)# Dependents  
Dependents stats for `langchain-ai/langchain`  
[](https://github.com/langchain-ai/langchain/network/dependents)
[](https://github.com/langchain-ai/langchain/network/dependents)
[](https://github.com/langchain-ai/langchain/network/dependents)  
[update: `2023-12-08`; only dependent repositories with Stars > 100]  
| Repository | Stars |
| ---- | ---- |
| AntonOsika/gpt-engineer | 46514 |
| imartinez/privateGPT | 44439 |
| LAION-AI/Open-Assistant | 35906 |
| hpcaitech/ColossalAI | 35528 |
| moymix/TaskMatrix | 34342 |
| geekan/MetaGPT | 31126 |
| streamlit/streamlit | 28911 |
| reworkd/AgentGPT | 27833 |
| StanGirard/quivr | 26032 |
| OpenBB-finance/OpenBBTerminal | 24946 |
| run-llama/llama_index | 24859 |
| jmorganca/ollama | 20849 |
| openai/chatgpt-retrieval-plugin | 20249 |
| chatchat-space/Langchain-Chatchat | 19305 |
| mindsdb/mindsdb | 19172 |
| PromtEngineer/localGPT | 17528 