In [3]:
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
from langchain_community.embeddings.cloudflare_workersai import CloudflareWorkersAIEmbeddings
from supabase.client import Client, create_client
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredMarkdownLoader,
    JSONLoader
)

In [8]:
load_dotenv(override=True)

qw_llm_openai = ChatOpenAI(
    openai_api_base=os.getenv('DASHSCOPE_API_BASE'),
    openai_api_key=os.getenv('DASHSCOPE_API_KEY'),
    model_name="qwen2-1.5b-instruct",
    temperature=0.7,
    streaming=True,
)
embeddings = CloudflareWorkersAIEmbeddings(
    account_id=os.getenv('CF_ACCOUNT_ID'),
    api_token=os.getenv('CF_API_TOKEN'),
    model_name="@cf/baai/bge-small-en-v1.5",
)

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

supabase: Client = create_client(supabase_url, supabase_key)

In [1]:
import json

# 读取文件内容
with open('../../../file/yxk-list.json', 'r', encoding='utf-8') as f:
    content = f.read()

# 手动解码Unicode字符
# decoded_content = content.encode('utf-8').decode('unicode_escape')

# 将解码后的内容转换为JSON对象
data = json.loads(content)

# 打印或进一步处理data
print(data)

[{'title': '【登峰造极·云中岭丨青青草原 高山草甸 强度可选】', 'introduce': '高山牧场，牛羊成群，中转车上山，徒步2km即可到达牧场，4-10公里AB线（A线休闲，B线户外）', 'theme': '户外游-徒步登山', 'positive_comment_rate': '好评率：99.1%', 'tags': '线路，收费，周边，旅游产品，跟团游-一日游，1天', 'duration': '1天', 'departure_city': '成都 四川省', 'product_features': '熊猫之乡;云中牧场;高山草甸;遥望雪山', 'price': '100元起', 'the_best_travel_date': '7月份'}, {'title': '【婺源景德镇4日 从古徽州到瓷都的发现之旅】', 'introduce': '婺源篁岭 | 梯田古村晒秋，瑶里古镇 | 探寻千年秘密，新平瓷宫 | 致敬瓷楼奶奶，陶溪川夜市 | 趣味淘宝，御窑博物馆 | 打卡宝藏机位', 'theme': '深度人文-文化历史；休闲度假-懒人慢游；美景探索-游山玩水', 'positive_comment_rate': '好评率：20.1%', 'tags': '线路，收费，国内,旅游产品,跟团游-多日游,4天3晚', 'duration': '4天3晚', 'departure_city': '杭州', 'product_features': '小众深度体验；陶瓷文化之旅；文艺惊艳之旅；精选舒适酒店', 'price': '2180元起', 'the_best_travel_date': '10月份'}, {'title': '【桂林阳朔4日 从山水画卷到民族风情的奇妙之旅】', 'introduce': '漓江 | 领略如画风光，西街 | 感受热闹夜生活，龙脊梯田 | 观赏壮美稻田，象山公园 | 打卡标志性景点', 'theme': '自然风光-山水如画；民俗风情-特色体验；休闲放松-轻松出行', 'positive_comment_rate': '好评率：18.5%', 'tags': '线路，收费，国内,旅游产品,跟团游-多日游,4天3晚', 'duration': '4天3晚', 'departure_city': '广州', 'pro

In [2]:
print(content)

[
  {
    "title": "【登峰造极·云中岭丨青青草原 高山草甸 强度可选】",
    "introduce": "高山牧场，牛羊成群，中转车上山，徒步2km即可到达牧场，4-10公里AB线（A线休闲，B线户外）",
    "theme": "户外游-徒步登山",
    "positive_comment_rate": "好评率：99.1%",
    "tags": "线路，收费，周边，旅游产品，跟团游-一日游，1天",
    "duration": "1天",
    "departure_city": "成都 四川省",
    "product_features": "熊猫之乡;云中牧场;高山草甸;遥望雪山",
    "price": "100元起",
    "the_best_travel_date": "7月份"
  },
  {
    "title": "【婺源景德镇4日 从古徽州到瓷都的发现之旅】",
    "introduce": "婺源篁岭 | 梯田古村晒秋，瑶里古镇 | 探寻千年秘密，新平瓷宫 | 致敬瓷楼奶奶，陶溪川夜市 | 趣味淘宝，御窑博物馆 | 打卡宝藏机位",
    "theme": "深度人文-文化历史；休闲度假-懒人慢游；美景探索-游山玩水",
    "positive_comment_rate": "好评率：20.1%",
    "tags": "线路，收费，国内,旅游产品,跟团游-多日游,4天3晚",
    "duration": "4天3晚",
    "departure_city": "杭州",
    "product_features": "小众深度体验；陶瓷文化之旅；文艺惊艳之旅；精选舒适酒店",
    "price": "2180元起",
    "the_best_travel_date": "10月份"
  },
  {
    "title": "【桂林阳朔4日 从山水画卷到民族风情的奇妙之旅】",
    "introduce": "漓江 | 领略如画风光，西街 | 感受热闹夜生活，龙脊梯田 | 观赏壮美稻田，象山公园 | 打卡标志性景点",
    "theme": "自然风光-山水如画；民俗风情-特色体验；休闲放松-轻松出行",
    "positive_co

In [6]:
import os
from langchain.document_loaders import TextLoader

root_dir = '../../../file'
documents = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        if file.endswith('.json'):
            try:
                loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
                documents.extend(loader.load_and_split())
            except Exception as e:
                pass
        
print(len(documents))

12


In [7]:
documents

[Document(page_content='[\n  {\n    "tour_name": "婺源景德镇4日从古徽州到瓷都的发现之旅",\n    "price": {\n      "adult": 2180,\n      "child": 880\n    },\n    "highlights": [\n      "文化历史",\n      "懒人慢游",\n      "游山玩水"\n    ],\n    "attractions": [\n      "婺源古村",\n      "古徽州",\n      "景德镇",\n      "千年古窑",\n      "瓷器集市"\n    ],\n    "departure_dates": [\n      "2024-10-03",\n      "2024-10-04"\n    ],\n    "departure_city": "杭州",\n    "duration": "4天3晚",\n    "flight_info": "支持航班变更调整",\n    "included_fees": [\n      "交通",\n      "住宿",\n      "餐食",\n      "景点门票"\n    ],\n    "excluded_fees": [\n      "私人消费",\n      "旅游保险"\n    ],\n    "notes": "建议提前3-5天报名",\n    "product_manager_speech": {\n      "title": "游客深入文旅行",\n      "description": "那些旅行中探知到的鲜活文化，触达内心，就是惊艳的风景",\n      "content": [\n        "一个中国极美符号：婆源墅峰，秋季梯田花海，东方小布达拉宫，全年都有的晒秋场景，徽派建筑的代表，享受慢生活。",\n        "一个枕源：瑶里古镇，被群山环抱，草木幽深，小桥流水，禽鸟啼鸣，飞瀑流泉，素有“瓷之源、茶之乡、林之海”的美称。",\n        "一个市集：陶溪川夜市，陶艺家聚集交流创意之地。一方创意，一方欣赏，闲逛创意市集，有别于其他市集，是一件有趣的事。",\n        "一座楼：七彩瓷

In [44]:
# 将读取的数据转换为JSONLoader所需的格式
# 注意  ⚠️ 不能有 空格符号 \n
documents = JSONLoader(
    file_path='../../../file/yxk-list.json',
    jq_schema='.',
    text_content=False
).load()

In [9]:
len(documents)
documents

[Document(page_content='[\n  {\n    "tour_name": "婺源景德镇4日从古徽州到瓷都的发现之旅",\n    "price": {\n      "adult": 2180,\n      "child": 880\n    },\n    "highlights": [\n      "文化历史",\n      "懒人慢游",\n      "游山玩水"\n    ],\n    "attractions": [\n      "婺源古村",\n      "古徽州",\n      "景德镇",\n      "千年古窑",\n      "瓷器集市"\n    ],\n    "departure_dates": [\n      "2024-10-03",\n      "2024-10-04"\n    ],\n    "departure_city": "杭州",\n    "duration": "4天3晚",\n    "flight_info": "支持航班变更调整",\n    "included_fees": [\n      "交通",\n      "住宿",\n      "餐食",\n      "景点门票"\n    ],\n    "excluded_fees": [\n      "私人消费",\n      "旅游保险"\n    ],\n    "notes": "建议提前3-5天报名",\n    "product_manager_speech": {\n      "title": "游客深入文旅行",\n      "description": "那些旅行中探知到的鲜活文化，触达内心，就是惊艳的风景",\n      "content": [\n        "一个中国极美符号：婆源墅峰，秋季梯田花海，东方小布达拉宫，全年都有的晒秋场景，徽派建筑的代表，享受慢生活。",\n        "一个枕源：瑶里古镇，被群山环抱，草木幽深，小桥流水，禽鸟啼鸣，飞瀑流泉，素有“瓷之源、茶之乡、林之海”的美称。",\n        "一个市集：陶溪川夜市，陶艺家聚集交流创意之地。一方创意，一方欣赏，闲逛创意市集，有别于其他市集，是一件有趣的事。",\n        "一座楼：七彩瓷

分割文档

In [10]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
splits = text_splitter.split_documents(documents)

In [11]:
print(len(splits))

52


向量存储

In [13]:
vectorstore = SupabaseVectorStore.from_documents(
    splits,
    embeddings,
    client=supabase,
    table_name="bge_small_vector",
    query_name="bge_small_match_documents",
)

2024-07-04 13:08:38,881:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/bge_small_vector?columns=%22metadata%22%2C%22embedding%22%2C%22content%22%2C%22id%22 "HTTP/1.1 201 Created"
