# 基础配置

## LLM 初始化

In [1]:
from dotenv import load_dotenv
import os
from langchain_community.llms.cloudflare_workersai import CloudflareWorkersAI
from langchain_community.llms.tongyi import Tongyi
from langchain_openai import ChatOpenAI

# Load variables from the local .env file; override=True lets values from
# .env replace variables already present in the process environment.
load_dotenv(override=True)


def _report_env(name, value):
    """Print whether an environment variable is set, never its value.

    Cell outputs are saved inside the notebook file, so echoing a key or
    token would leak it to everyone who can read the notebook.
    """
    print(f"{name}: {'set' if value else 'MISSING'}")


account_id = os.getenv('CF_ACCOUNT_ID')
api_token = os.getenv('CF_API_TOKEN')
_report_env('CF_ACCOUNT_ID', account_id)
_report_env('CF_API_TOKEN', api_token)

# CloudflareWorkersAI
model = '@cf/meta/llama-3-8b-instruct'
cf_llm = CloudflareWorkersAI(
    account_id=account_id,
    api_token=api_token,
    model=model
)

DASHSCOPE_API_KEY = os.getenv('DASHSCOPE_API_KEY')
_report_env('DASHSCOPE_API_KEY', DASHSCOPE_API_KEY)

# qwen
# NOTE(review): the key is not passed explicitly; presumably Tongyi reads
# DASHSCOPE_API_KEY from the environment - confirm.
qwen_llm = Tongyi(
    model='qwen2-1.5b-instruct'
)

api_key = os.getenv('OPENAI_API_KEY')
base_url = os.getenv('OPENAI_API_BASE')
_report_env('OPENAI_API_KEY', api_key)
# The endpoint URL is printed as-is so the target service is visible.
print(base_url)

# openai/moonshot
ms_llm = ChatOpenAI(
    openai_api_base=base_url,
    openai_api_key=api_key,
    model_name="moonshot-v1-8k",
    temperature=0.7,
)

[REDACTED CF_ACCOUNT_ID]
[REDACTED CF_API_TOKEN]
[REDACTED DASHSCOPE_API_KEY]
[REDACTED OPENAI_API_KEY]
https://api.moonshot.cn/v1/


## embedding 初始化

In [2]:
# cloudflare_workersai
from langchain_community.embeddings.cloudflare_workersai import (
    CloudflareWorkersAIEmbeddings,
)

# Embedding model served through Cloudflare Workers AI, using the
# `account_id` / `api_token` values read from the environment.
# Embedding dimension: 384
embeddings = CloudflareWorkersAIEmbeddings(
    account_id=account_id,
    api_token=api_token,
    model_name="@cf/baai/bge-small-en-v1.5",
)

## vector store 初始化

In [3]:
import os
from dotenv import load_dotenv
from supabase.client import Client, create_client

load_dotenv(override=True)

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase_token = os.environ.get("SUPABASE_TOKEN")

# Fail early with a readable message instead of a TypeError from
# concatenating None into a string further down.
_missing_supabase_vars = [
    name
    for name, value in (
        ("SUPABASE_URL", supabase_url),
        ("SUPABASE_SERVICE_KEY", supabase_key),
    )
    if not value
]
if _missing_supabase_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_supabase_vars)
    )

# Only the URL is echoed. The key and token are secrets, and cell output is
# saved with the notebook, so just report whether they are present.
print(f"supabase_url: {supabase_url}")
print(f"supabase_token: {'set' if supabase_token else 'MISSING'}")
print("supabase_key: set")

# https://supabase.com/dashboard/project/infrxrfaftyrxvkwvncf/editor/29610
supabase: Client = create_client(supabase_url, supabase_key)

supabase_url: https://infrxrfaftyrxvkwvncf.supabase.co
supabase_token: [REDACTED]
supabase_key: [REDACTED]


# 核心过程

## 数据采集

In [74]:
from langchain_community.document_loaders import UnstructuredURLLoader

# Tour-detail pages on youxiake.com, kept in three separate lists.
# Only `urls_child` is handed to the loader at the bottom of this cell;
# `urls_1` and `urls_2` are defined here but not loaded.
# NOTE(review): presumably each list is a different tour category that was
# loaded in a separate run - confirm.
urls_1 = [
    "https://www.youxiake.com/lines.html?id=26370&spm=eyJmcm9tIjoyNzEsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=26373&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
]

urls_2 = [
    "https://www.youxiake.com/lines.html?id=26371&spm=eyJmcm9tIjoyNzEsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=48451&spm=eyJmcm9tIjoyNzEsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=51587&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=19531&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=52054&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=51644&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=48779&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=19725&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=41384&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ"
]

urls_child = [
    "https://www.youxiake.com/lines.html?id=26372&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
    "https://www.youxiake.com/lines.html?id=19546&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ",
]

# Fetch the pages in `urls_child` and load them into `docs`.
loader = UnstructuredURLLoader(urls=urls_child)
docs = loader.load()

## 数据分块

In [75]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks: chunk_size=800 with chunk_overlap=100.
# add_start_index=True is what puts the `start_index` field into each
# chunk's metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800, chunk_overlap=100, add_start_index=True
)
splits = text_splitter.split_documents(docs)

In [76]:
def _strip_whitespace(text):
    """Return ``text`` with every newline, carriage return, tab and space removed.

    Any other whitespace still left at the very start or end is trimmed
    afterwards, matching the original replace-then-strip sequence.
    """
    removed = str.maketrans('', '', '\n\r\t ')
    return text.translate(removed).strip()


# Clean each chunk's text in place and tag it with its tour category
# (English and Chinese labels) for later metadata-based filtering.
for split in splits:
    split.page_content = _strip_whitespace(split.page_content)
    split.metadata['category_en'] = 'Parent-child tour'
    split.metadata['category_cn'] = '亲子游'

# Show only a small sample; dumping every chunk bloats the saved notebook.
splits[:3]

[Document(page_content='登录注册我的订单帮助中心我的游侠客我的游侠客订单中心我的社区我的等级账户管理手机APP社群游侠社群旅行爱好者的聚集地扫码即刻加入游侠客-旅行者的社交网络杭州站杭州站上海站广州站成都站北京站武汉站苏州站南京站重庆站厦门站深圳站西安站综合综合线路签证游记攻略视频摄影游侠历史搜索清空热门搜索武功山花鸟岛国内海岛徒步恩施朝鲜吴哥迷城东欧徒步雨崩越南全景田园靖西厦门太行山小众印尼摩洛哥澳大利亚万塔之城-蒲甘国内四川云南北京东北贵州西藏湖南湖北广西西北华南福建华东秦晋华北华中重庆内蒙古新疆出境东南亚欧洲南亚日韩朝蒙尼泊尔越南东欧大洋洲俄罗斯泰国美洲斯里兰卡东非缅甸印度尼西亚摩洛哥服务热线9:00~21:004006706300首页周边游当地体验自由行国内游出境游摄影游户外游亲子游活动赛事主题游定制游游记攻略', metadata={'source': 'https://www.youxiake.com/lines.html?id=26372&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ', 'start_index': 0, 'category_en': 'Parent-child tour', 'category_cn': '亲子游'}),
 Document(page_content='服务热线9:00~21:004006706300首页周边游当地体验自由行国内游出境游摄影游户外游亲子游活动赛事主题游定制游游记攻略1日游2日游3日游花鸟岛武功山', metadata={'source': 'https://www.youxiake.com/lines.html?id=26372&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ', 'start_index': 535, 'category_en': 'Parent-child tour', 'category_cn': '亲子游'}),
 Document(page_content='旅拍写真民宿营地趣杭州·轻旅行酒店民宿', metadata={'source': 'https://www.youxiake.com/lines.html?id=26372&spm=eyJmcm9tIjoyNzIsIm

In [73]:
# Tag every chunk with its tour category (English and Chinese labels) and
# print the resulting metadata so the tagging can be checked by eye.
for split in splits:
    split.metadata['category_en'] = 'Parent-child tour'
    split.metadata['category_cn'] = '亲子游'
    print(split.metadata)

{'source': 'https://www.youxiake.com/lines.html?id=26370&spm=eyJmcm9tIjoyNzEsIm9yaWdpbmFsX2lkIjowfQ', 'start_index': 0, 'category_en': 'Parent-child tour', 'category_ch': '亲子游', 'category_cn': '亲子游'}
{'source': 'https://www.youxiake.com/lines.html?id=26370&spm=eyJmcm9tIjoyNzEsIm9yaWdpbmFsX2lkIjowfQ', 'start_index': 535, 'category_en': 'Parent-child tour', 'category_ch': '亲子游', 'category_cn': '亲子游'}
{'source': 'https://www.youxiake.com/lines.html?id=26370&spm=eyJmcm9tIjoyNzEsIm9yaWdpbmFsX2lkIjowfQ', 'start_index': 1282, 'category_en': 'Parent-child tour', 'category_ch': '亲子游', 'category_cn': '亲子游'}
{'source': 'https://www.youxiake.com/lines.html?id=26370&spm=eyJmcm9tIjoyNzEsIm9yaWdpbmFsX2lkIjowfQ', 'start_index': 1613, 'category_en': 'Parent-child tour', 'category_ch': '亲子游', 'category_cn': '亲子游'}
{'source': 'https://www.youxiake.com/lines.html?id=26370&spm=eyJmcm9tIjoyNzEsIm9yaWdpbmFsX2lkIjowfQ', 'start_index': 2418, 'category_en': 'Parent-child tour', 'category_ch': '亲子游', 'category_c

## 存入向量数据库

In [6]:
# Build a SupabaseVectorStore handle on the "documents" table and the
# "match_documents" query. No documents are passed in here, so this is meant
# for querying data that is already stored.
# NOTE(review): confirm the constructor itself performs no writes.
from langchain_community.vectorstores import SupabaseVectorStore

vectorstore_exist = SupabaseVectorStore(
    embedding=embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents",
)

In [77]:
# Insert the chunks above into the database. An embedding is generated
# automatically for each document.
# NOTE(review): re-running this cell presumably inserts the same chunks
# again - confirm before re-executing.
from langchain_community.vectorstores import SupabaseVectorStore

vectorstore = SupabaseVectorStore.from_documents(
    splits,
    embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents",
)

2024-06-25 15:09:19,873:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/documents?columns=%22id%22%2C%22metadata%22%2C%22embedding%22%2C%22content%22 "HTTP/1.1 201 Created"


## 基于向量数据库的提问

In [78]:
# Top-3 similarity retriever built on `vectorstore` (the store created by
# from_documents).
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [15]:
# Same retriever settings, but built on `vectorstore_exist`. This rebinds
# `retriever`, so whichever of the two retriever cells ran last wins.
retriever = vectorstore_exist.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [31]:
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Consecutive documents are separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))


In [63]:
# Test question: asks what the spend-threshold discount is for one named tour.
query = "这个团【亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城】的满减优惠是多少？"

In [64]:
# Fetch what the retriever returns for `query` and print it for inspection.
retrieved_docs = retriever.invoke(query)

pretty_print_docs(retrieved_docs)

2024-06-25 14:33:03,111:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


Document 1:

呼和浩特出发|6天5晚



亲子·草原沙漠行6日丨奇遇草原沙漠火山20+体验

￥3880元起
2775人已报名

北京出发|6天5晚



亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城

￥3980元起
13068人已报名
----------------------------------------------------------------------------------------------------
Document 2:

赤峰市出发|7天6晚



马背旅行·纵马乌兰布统草原，坝上秘境探索7日

￥8800元起
76人已报名

北京出发|6天5晚



坝上秋摄丨乌兰布统坝上草原+金山岭长城6日

￥3980元起
30人已报名
----------------------------------------------------------------------------------------------------
Document 3:

北京出发|6天5晚



坝上秋摄丨乌兰布统坝上草原+金山岭长城6日

￥3980元起
30人已报名

北京出发|7天6晚



越野派·坝上草原7日丨千里草原游牧无人区穿越

￥5180元起
2240人已报名


# 基于大模型的提问

## 定义 prompt

In [39]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

# Baseline RAG prompt with {question} and {context} placeholders.
# It is not referenced by the chains built in this notebook, which use
# `prompt_v2` instead.
prompt = '''
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
'''

# Structured variant of the prompt; this is the template handed to
# ChatPromptTemplate.from_template when `rag_chain` is built.
# Placeholders: {question} and {context}.
prompt_v2 = '''
# Character
You're a knowledgeable assistant capable of providing concise answers to a variety of questions, drawing from the context provided, and admitting when you don't know the answer.

## Skills
1. **Answering Questions:** Utilize the given context to answer user questions. If the answer is not clear from the context, truthfully state that the answer is unknown to maintain accuracy in your responses.
Question: {question}
Context: {context}    

### Answering Questions Format:
- Answer:  

## Constraints:
- Keep answers to a maximum of three sentences to maintain brevity.
- If the answer cannot be determined, simply confess that you do not know. Honesty is paramount in maintaining credibility.
- If the answer is not reflected in the context, please reply: Sorry, I don't know for the moment.
- Focus on gleaning answers from the context provided only.
'''


def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# RAG pipeline: the input string goes to the retriever, whose hits are joined
# by format_docs to fill "context", and is also passed through unchanged as
# "question". The filled prompt_v2 is sent to qwen_llm and the reply is
# parsed to a plain string.
rag_chain = (
        {"context": (retriever | format_docs), "question": RunnablePassthrough()}
        | ChatPromptTemplate.from_template(prompt_v2)
        | qwen_llm
        | StrOutputParser()
)

In [37]:
# Run the chain on `query`, with an instruction appended to answer in Chinese.
rag_chain.invoke(query + "请用中文回答")

2024-06-25 13:58:35,392:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


'Answer: 亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城的行程包括游览乌兰布统草原、锡林郭勒草原以及金山岭长城。该行程从北京出发，为期6天5晚，价格为3980元起。'

In [40]:
# Follow-up question: does this tour have any discount?
# The appended suffix asks the model to answer in Chinese.
query = '亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城 这个团有打折吗？'
rag_chain.invoke(query + "请用中文回答")

2024-06-25 14:04:38,872:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


'Answer: 抱歉，我目前不知道这个团是否有打折。提供的信息中没有提到具体的打折情况。'

In [42]:
# Follow-up question: what is the child price for this tour?
# The appended suffix asks the model to answer in Chinese.
query = '这个团【亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城】的儿童价格是多少？'
rag_chain.invoke(query + "请用中文回答")

2024-06-25 14:06:03,519:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


'Answer: 亲子·坝上双草原6日游的儿童价格没有直接给出，但提到了该行程的起始价格为￥3980元。通常，具体的儿童价格可能会根据年龄或其他因素有所不同，需要查看详细的套餐信息或直接咨询活动组织者以获取准确价格。'

### 01_multi_query

In [58]:
from langchain.chains import LLMChain
from langchain.retrievers.multi_query import LineListOutputParser
from langchain.prompts import PromptTemplate

# Prompt that asks the LLM for five rephrasings of the user question, one per
# line, so each rephrasing can be run against the vector store separately.
# The misspelling "seperated" in the instruction text has been corrected.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)
# LineListOutputParser is attached to the chain as its output parser; the
# chain itself runs QUERY_PROMPT against qwen_llm.
output_parser = LineListOutputParser()
llm_chain = LLMChain(llm=qwen_llm, prompt=QUERY_PROMPT, output_parser=output_parser)

In [81]:
from langchain.retrievers.multi_query import MultiQueryRetriever
# Show which question is currently bound to `query` before retrieving.
print(query)
multi_query_retriever = MultiQueryRetriever(
    retriever=retriever, llm_chain=llm_chain, parser_key="lines"
)

# Use the Runnable `invoke` entry point, as the plain-retriever cell does,
# instead of the deprecated `get_relevant_documents`.
# NOTE(review): the saved run of this cell ended in KeyError: 'data'. The
# cause is not visible in this notebook - investigate before relying on it.
unique_docs = multi_query_retriever.invoke(query)
unique_docs

请推荐一些亲子游的旅行团


2024-06-25 15:12:27,250:INFO - Generated queries: ['1. What are some family-friendly tour packages for vacationing with kids?', '2. Could you suggest travel groups that cater specifically to parent-child bonding experiences?', "3. I'm looking for organized trips that are ideal for parents and children to enjoy together, any recommendations?", '4. Are there any tour options designed for families with young children that promote quality time and education?', '5. In search of adventure! Can you recommend travel tours suitable for both adults and kids for a fun-filled family getaway?']
2024-06-25 15:12:29,420:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


KeyError: 'data'

In [82]:
# Rebuild the RAG pipeline with `multi_query_retriever` supplying the context
# in place of the plain retriever. This rebinds `rag_chain`.
rag_chain = (
        {"context": (multi_query_retriever | format_docs), "question": RunnablePassthrough()}
        | ChatPromptTemplate.from_template(prompt_v2)
        | qwen_llm
        | StrOutputParser()
)

In [83]:
# Run the multi-query chain on `query`, with an instruction appended to
# answer in Chinese.
rag_chain.invoke(query + "请用中文回答")

2024-06-25 15:13:11,568:INFO - Generated queries: ['1. 哪些旅行社提供适合家庭的旅游套餐？', '2. 有什么好的亲子度假旅行线路推荐吗？', '3. 寻找一些适合带孩子的旅行团，有什么建议？', '4. 有哪些包含亲子活动的旅游项目可以推荐？', '5. 如何找到适合全家出游的旅行团？']
2024-06-25 15:13:13,783:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"
2024-06-25 15:13:15,692:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"
2024-06-25 15:13:17,346:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"
2024-06-25 15:13:18,775:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"
2024-06-25 15:13:20,567:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


'Answer: 这里提到了两个旅行活动，一个是高端度假越野派瑜伽行，另一个是房车游骑游天下沙漠旅行。但都没有明确提到亲子游的旅行团。对于亲子游，我建议查看游侠客平台上的相关产品，他们可能有适合家庭的旅行套餐，例如草原之旅中提到的一些小朋友专属的活动地点，如国家博物馆等。不过，具体针对亲子游的团名和详情，我没有找到。'

### 压缩

In [65]:
from langchain.retrievers.document_compressors import LLMChainFilter
from langchain.retrievers import ContextualCompressionRetriever

print(query)

# Compression approach 2
# LLMChainFilter: a filter that drops documents unrelated to the query.
_filter = LLMChainFilter.from_llm(cf_llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=_filter, base_retriever=retriever)

# Use the Runnable `invoke` entry point, as the plain-retriever cell does,
# instead of the deprecated `get_relevant_documents`.
compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(compressed_docs)

这个团【亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城】的满减优惠是多少？


2024-06-25 14:41:18,217:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


Document 1:

呼和浩特出发|6天5晚



亲子·草原沙漠行6日丨奇遇草原沙漠火山20+体验

￥3880元起
2775人已报名

北京出发|6天5晚



亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城

￥3980元起
13068人已报名
----------------------------------------------------------------------------------------------------
Document 2:

赤峰市出发|7天6晚



马背旅行·纵马乌兰布统草原，坝上秘境探索7日

￥8800元起
76人已报名

北京出发|6天5晚



坝上秋摄丨乌兰布统坝上草原+金山岭长城6日

￥3980元起
30人已报名
----------------------------------------------------------------------------------------------------
Document 3:

北京出发|6天5晚



坝上秋摄丨乌兰布统坝上草原+金山岭长城6日

￥3980元起
30人已报名

北京出发|7天6晚



越野派·坝上草原7日丨千里草原游牧无人区穿越

￥5180元起
2240人已报名


In [66]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

print(query)

# Compression approach 3
# EmbeddingsFilter: a document compressor that uses embeddings to discard
# documents unrelated to the query (here with similarity_threshold=0.8).
embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.8)
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)

# Use the Runnable `invoke` entry point, as the plain-retriever cell does,
# instead of the deprecated `get_relevant_documents`.
compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(compressed_docs)

这个团【亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城】的满减优惠是多少？


2024-06-25 14:42:43,844:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


Document 1:

呼和浩特出发|6天5晚



亲子·草原沙漠行6日丨奇遇草原沙漠火山20+体验

￥3880元起
2775人已报名

北京出发|6天5晚



亲子·坝上双草原6日丨乌兰布统+锡林郭勒+长城

￥3980元起
13068人已报名
----------------------------------------------------------------------------------------------------
Document 2:

赤峰市出发|7天6晚



马背旅行·纵马乌兰布统草原，坝上秘境探索7日

￥8800元起
76人已报名

北京出发|6天5晚



坝上秋摄丨乌兰布统坝上草原+金山岭长城6日

￥3980元起
30人已报名
----------------------------------------------------------------------------------------------------
Document 3:

北京出发|6天5晚



坝上秋摄丨乌兰布统坝上草原+金山岭长城6日

￥3980元起
30人已报名

北京出发|7天6晚



越野派·坝上草原7日丨千里草原游牧无人区穿越

￥5180元起
2240人已报名


In [79]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata fields the self-query retriever may filter on. Both category
# fields hold text labels (e.g. 'Parent-child tour' / '亲子游'), so both are
# declared as strings; `category_en` was previously mis-declared as integer.
metadata_field_info = [
    AttributeInfo(
        name="category_cn",
        description="所属的中文分类",
        type="string",
    ),
    AttributeInfo(
        name="category_en",
        description="所属的英文分类",
        type="string",
    ),
]
document_content_description = "旅游团的详情信息"

# A retriever that uses the vector store and an LLM to generate the
# vector-store query.
self_query_retriever = SelfQueryRetriever.from_llm(cf_llm,
                                                   vectorstore,
                                                   document_content_description,
                                                   metadata_field_info,
                                                   verbose=True)

In [80]:
# Question: ask for recommendations of parent-child tours.
query = '请推荐一些亲子游的旅行团'
print(query)
# Use the Runnable `invoke` entry point, as the plain-retriever cell does,
# instead of the deprecated `get_relevant_documents`.
# NOTE(review): this queries the plain `retriever`, not the
# `self_query_retriever` built in the previous cell - confirm which one was
# meant to be exercised here.
retriever.invoke(query)

请推荐一些亲子游的旅行团


2024-06-25 15:11:35,748:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/rpc/match_documents?limit=3 "HTTP/1.1 200 OK"


[Document(page_content='房车游骑游天下沙漠旅行', metadata={'source': 'https://www.youxiake.com/lines.html?id=26372&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ', 'category_cn': '亲子游', 'category_en': 'Parent-child tour', 'start_index': 6939}),
 Document(page_content='房车游骑游天下沙漠旅行', metadata={'source': 'https://www.youxiake.com/lines.html?id=19546&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ', 'category_cn': '亲子游', 'category_en': 'Parent-child tour', 'start_index': 6939}),
 Document(page_content='成行可报名暂满', metadata={'source': 'https://www.youxiake.com/lines.html?id=26372&spm=eyJmcm9tIjoyNzIsIm9yaWdpbmFsX2lkIjowfQ', 'category_cn': '亲子游', 'category_en': 'Parent-child tour', 'start_index': 9427})]