In [6]:
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
from langchain_community.embeddings.cloudflare_workersai import CloudflareWorkersAIEmbeddings
from supabase.client import Client, create_client
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredMarkdownLoader,
    JSONLoader
)

In [4]:
load_dotenv(override=True)

qw_llm_openai = ChatOpenAI(
    openai_api_base=os.getenv('DASHSCOPE_API_BASE'),
    openai_api_key=os.getenv('DASHSCOPE_API_KEY'),
    model_name="qwen2-1.5b-instruct",
    temperature=0.7,
    streaming=True,
)
embeddings = CloudflareWorkersAIEmbeddings(
    account_id=os.getenv('CF_ACCOUNT_ID'),
    api_token=os.getenv('CF_API_TOKEN'),
    model_name="@cf/baai/bge-small-en-v1.5",
)

supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")

supabase: Client = create_client(supabase_url, supabase_key)

In [37]:
import json

# 读取文件内容
with open('../../../file/yxk-list.json', 'r', encoding='utf-8') as f:
    content = f.read()

# 手动解码Unicode字符
decoded_content = content.encode('utf-8').decode('unicode_escape')

# 将解码后的内容转换为JSON对象
data = json.loads(decoded_content)

# 打印或进一步处理data
print(data)

[{'tour_name': 'å©ºæº\x90æ\x99¯å¾·é\x95\x874æ\x97¥ä»\x8eå\x8f¤å¾½å·\x9eå\x88°ç\x93·é\x83½ç\x9a\x84å\x8f\x91ç\x8e°ä¹\x8bæ\x97\x85', 'price': {'adult': 2180, 'child': 880}, 'highlights': ['æ\x96\x87å\x8c\x96å\x8e\x86å\x8f²', 'æ\x87\x92äººæ\x85¢æ¸¸', 'æ¸¸å±±ç\x8e©æ°´'], 'attractions': ['å©ºæº\x90å\x8f¤æ\x9d\x91', 'å\x8f¤å¾½å·\x9e', 'æ\x99¯å¾·é\x95\x87', 'å\x8d\x83å¹´å\x8f¤çª\x91', 'ç\x93·å\x99¨é\x9b\x86å¸\x82'], 'departure_dates': ['2024-10-03', '2024-10-04'], 'departure_city': 'æ\x9d\xadå·\x9e', 'duration': '4å¤©3æ\x99\x9a', 'flight_info': 'æ\x94¯æ\x8c\x81è\x88ªç\x8f\xadå\x8f\x98æ\x9b´è°\x83æ\x95´', 'included_fees': ['äº¤é\x80\x9a', 'ä½\x8få®¿', 'é¤\x90é£\x9f', 'æ\x99¯ç\x82¹é\x97¨ç¥¨'], 'excluded_fees': ['ç§\x81äººæ¶\x88è´¹', 'æ\x97\x85æ¸¸ä¿\x9dé\x99©'], 'notes': 'å»ºè®®æ\x8f\x90å\x89\x8d3-5å¤©æ\x8a¥å\x90\x8d', 'product_manager_speech': {'title': 'æ¸¸å®¢æ·±å\x85¥æ\x96\x87æ\x97\x85è¡\x8c', 'description': 'é\x82£äº\x9bæ\x97\x85è¡\x8cä¸\xadæ\x8e¢ç\x9f¥å\x88°ç\x9a\x84é²\x9cæ´»æ\x96\x87å\x8c\

In [38]:
print(content)

[
  {
    "tour_name": "婺源景德镇4日从古徽州到瓷都的发现之旅",
    "price": {
      "adult": 2180,
      "child": 880
    },
    "highlights": [
      "文化历史",
      "懒人慢游",
      "游山玩水"
    ],
    "attractions": [
      "婺源古村",
      "古徽州",
      "景德镇",
      "千年古窑",
      "瓷器集市"
    ],
    "departure_dates": [
      "2024-10-03",
      "2024-10-04"
    ],
    "departure_city": "杭州",
    "duration": "4天3晚",
    "flight_info": "支持航班变更调整",
    "included_fees": [
      "交通",
      "住宿",
      "餐食",
      "景点门票"
    ],
    "excluded_fees": [
      "私人消费",
      "旅游保险"
    ],
    "notes": "建议提前3-5天报名",
    "product_manager_speech": {
      "title": "游客深入文旅行",
      "description": "那些旅行中探知到的鲜活文化，触达内心，就是惊艳的风景",
      "content": [
        "一个中国极美符号：婆源墅峰，秋季梯田花海，东方小布达拉宫，全年都有的晒秋场景，徽派建筑的代表，享受慢生活。",
        "一个枕源：瑶里古镇，被群山环抱，草木幽深，小桥流水，禽鸟啼鸣，飞瀑流泉，素有“瓷之源、茶之乡、林之海”的美称。",
        "一个市集：陶溪川夜市，陶艺家聚集交流创意之地。一方创意，一方欣赏，闲逛创意市集，有别于其他市集，是一件有趣的事。",
        "一座楼：七彩瓷楼，由一位老人累蓄的6万多件瓷器打造，她给世人留下的这座楼，必将激励后来人。",
        "一位老人：12岁入行，一生为陶艺投入

In [44]:
# 将读取的数据转换为JSONLoader所需的格式
# 注意  ⚠️ 不能有 空格符号 \n
documents = JSONLoader(
    file_path='../../../file/yxk-list.json',
    jq_schema='.',
    text_content=False
).load()

In [45]:
len(documents)
documents

[Document(page_content="[{'title': '登峰造极·云中岭丨青青草原 高山草甸 强度可选', 'introduce': '高山牧场，牛羊成群，中转车上山，徒步2km即可到达牧场，4-10公里AB线（A线休闲，B线户外）', 'theme': '户外游-徒步登山', 'positive_comment_rate': '好评率：99.1%', 'tags': '线路，收费，周边，旅游产品，跟团游-一日游，1天', 'duration': '1天', 'departure_city': '成都 四川省', 'product_features': '熊猫之乡;云中牧场;高山草甸;遥望雪山', 'price': '100元起', 'the_best_travel_date': '7月份'}, {'title': '婺源景德镇4日 从古徽州到瓷都的发现之旅', 'introduce': '婺源篁岭 | 梯田古村晒秋，瑶里古镇 | 探寻千年秘密，新平瓷宫 | 致敬瓷楼奶奶，陶溪川夜市 | 趣味淘宝，御窑博物馆 | 打卡宝藏机位', 'theme': '深度人文-文化历史；休闲度假-懒人慢游；美景探索-游山玩水', 'positive_comment_rate': '好评率：20.1%', 'tags': '线路，收费，国内,旅游产品,跟团游-多日游,4天3晚', 'duration': '4天3晚', 'departure_city': '杭州', 'product_features': '小众深度体验；陶瓷文化之旅；文艺惊艳之旅；精选舒适酒店', 'price': '2180元起', 'the_best_travel_date': '10月份'}]", metadata={'source': '/Users/pangmengting/Documents/workspace/python-learning/file/yxk-list.json', 'seq_num': 1})]

In [46]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
splits = text_splitter.split_documents(documents)

In [47]:
print(len(splits))

1


In [48]:
vectorstore = SupabaseVectorStore.from_documents(
    splits,
    embeddings,
    client=supabase,
    table_name="bge_small_vector",
    query_name="bge_small_match_documents",
)

2024-07-03 18:19:48,863:INFO - HTTP Request: POST https://infrxrfaftyrxvkwvncf.supabase.co/rest/v1/bge_small_vector?columns=%22id%22%2C%22metadata%22%2C%22content%22%2C%22embedding%22 "HTTP/1.1 201 Created"
