In [25]:
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

model = 'doubao-1-5-pro-32k-250115'
embedding_model='ep-20250414112634-zq8gc'

client = OpenAI()

In [5]:
from langchain_core.documents import Document

documents = [
    Document( page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

print(documents)

[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'), Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')]


In [4]:
from langchain_community.document_loaders import PyPDFLoader

file_path = './data/s7开荒.pdf'
# 加载pdf文件
loader = PyPDFLoader(file_path)

docs = loader.load()

print(len(docs))
print(docs[0])

15
page_content='「汉祚黄天」赛季开荒白皮书 
目录 
1. 开服前准备 
2. 赛季商店活动介绍 
3. 赛季名将卡包活动介绍 
4. 开荒注意事项 
5. 开荒阵容推荐 
5-20级推荐阵容 
需20级后转型阵容 
6. 开荒节奏 
(1) 土地信息 
(2) 开荒流程' metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-04-10T17:06:32+08:00', 'moddate': '2025-04-10T17:06:54+08:00', 'source': './data/s7开荒.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}


In [46]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 拆分文档
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=10, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

print(len(all_splits))

print(all_splits[1].page_content)

18
1. 开服前准备 
1）赛季仓库 
「汉祚黄天」赛季开始前，提前选好合适的装备和坐骑放入拓印背包。 （神兵和神驹不用
放入赛季仓库。 ） 
赛季开始后，赛季仓库开放时间为创角后的 48 小时内，期间可取出装备和坐骑最多各
4件，赛季仓库中的装备和坐骑为消耗品，取出后无法进行养成，赛季结束后消失。 （未
取出的装备会保留，后续赛季仓库开放后可以正常取出使用） 
2）神兵阁


In [47]:
from langchain.embeddings.base import Embeddings
from typing import List

class ArkEmbedding(Embeddings):

    def __init__(self,client: OpenAI):
        self.client = client

    def embed_query(self,text: str) -> List[float]:
        resp = client.embeddings.create(
                model=embedding_model,
                input=[all_splits[1].page_content],
                encoding_format="float")
        return resp.data[0].embedding

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.embed_query(t) for t in texts]


embeddings = ArkEmbedding(client)
embeddings.embed_query(all_splits[0].page_content)
print(len(resp.data[0].embedding))

2560


In [48]:
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

Generated vectors of length 2560

[-6.09375, 3.046875, 1.4140625, -1.546875, 0.1259765625, 2.3125, -0.287109375, -0.83984375, 0.11376953125, 0.06787109375]


In [67]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

ids = vector_store.add_documents(documents=all_splits)

In [70]:
results = vector_store.similarity_search(
    "问鼎限定"
)
print(len(results))
print(results[0])


4
page_content='6. 开荒节奏 
(1) 土地信息' metadata={'producer': 'Microsoft® Word 2021', 'creator': 'Microsoft® Word 2021', 'creationdate': '2025-04-10T17:06:32+08:00', 'moddate': '2025-04-10T17:06:54+08:00', 'source': './data/s7开荒.pdf', 'total_pages': 15, 'page': 10, 'page_label': '11', 'start_index': 0}
