In [1]:
#1.加载数据
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("https://arxiv.org/pdf/2309.10305.pdf")

pages = loader.load_and_split()

In [2]:
#2.知识切片 将文档分割成均匀的块。每个块是一段原始文本
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50,
)

docs = text_splitter.split_documents(pages)

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os


embed_model = OpenAIEmbeddings(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    openai_api_base="https://api.chatanywhere.tech/v1"
)
vectorstore1 = Chroma.from_documents(documents=docs, embedding=embed_model , collection_name="openai_embed")

In [10]:
query = "How large is the baichuan2 vocabulary?"

In [11]:
result2 = vectorstore1.similarity_search(query ,k = 2)

In [12]:
print(result2)

[Document(page_content='languages, such as Chinese.\nIn this technical report, we introduce Baichuan\n2, a series of large-scale multilingual language\nmodels. Baichuan 2 has two separate models,\nBaichuan 2-7B with 7 billion parameters and\nBaichuan 2-13B with 13 billion parameters. Both\nmodels were trained on 2.6 trillion tokens, which\nto our knowledge is the largest to date, more than\ndouble that of Baichuan 1 (Baichuan, 2023b,a).\nWith such a massive amount of training data,', metadata={'page': 1, 'source': 'C:\\Users\\ye_kkk\\AppData\\Local\\Temp\\tmp7ictk9yx\\tmp.pdf'}), Document(page_content='Baichuan 2: Open Large-scale Language Models\nAiyuan Yang, Bin Xiao, Bingning Wang, Borong Zhang, Chao Yin, Chenxu Lv, Da Pan\nDian Wang, Dong Yan, Fan Yang, Fei Deng, Feng Wang, Feng Liu, Guangwei Ai\nGuosheng Dong, Haizhou Zhao, Hang Xu, Haoze Sun, Hongda Zhang, Hui Liu, Jiaming Ji\nJian Xie, Juntao Dai, Kun Fang, Lei Su, Liang Song, Lifeng Liu, Liyun Ru, Luyao Ma\nMang Wang, Mickel Li

In [3]:
#没有openaikey
#1.embeding模型
#2.chat模型
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

model_name = "sentence-transformers/sentence-t5-large"


In [4]:
embedding = HuggingFaceEmbeddings(model_name=model_name)

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [23]:
vectorstore_hf = Chroma.from_documents(documents=docs, embedding=embedding , collection_name="huggingface_embed",persist_directory="chroma_data")

In [5]:
query = "How large is the baichuan2 vocabulary?"

In [25]:
result = vectorstore_hf.similarity_search(query ,k = 2)

In [26]:
print(result)

[Document(page_content='languages, such as Chinese.\nIn this technical report, we introduce Baichuan\n2, a series of large-scale multilingual language\nmodels. Baichuan 2 has two separate models,\nBaichuan 2-7B with 7 billion parameters and\nBaichuan 2-13B with 13 billion parameters. Both\nmodels were trained on 2.6 trillion tokens, which\nto our knowledge is the largest to date, more than\ndouble that of Baichuan 1 (Baichuan, 2023b,a).\nWith such a massive amount of training data,', metadata={'page': 1, 'source': 'C:\\Users\\ye_kkk\\AppData\\Local\\Temp\\tmp_2u6nnkm\\tmp.pdf'}), Document(page_content='languages, such as Chinese.\nIn this technical report, we introduce Baichuan\n2, a series of large-scale multilingual language\nmodels. Baichuan 2 has two separate models,\nBaichuan 2-7B with 7 billion parameters and\nBaichuan 2-13B with 13 billion parameters. Both\nmodels were trained on 2.6 trillion tokens, which\nto our knowledge is the largest to date, more than\ndouble that of Baich

In [6]:
vectorstore = Chroma(persist_directory="chroma_data", embedding_function=embedding, collection_name="huggingface_embed")


In [34]:
vectorstore

<langchain.vectorstores.chroma.Chroma at 0x18c03685a90>

In [31]:
vectorstore_hf

<langchain.vectorstores.chroma.Chroma at 0x18c03685880>

In [7]:
result1 = vectorstore.similarity_search(query ,k = 2)

In [8]:
print(result1)

[Document(page_content='languages, such as Chinese.\nIn this technical report, we introduce Baichuan\n2, a series of large-scale multilingual language\nmodels. Baichuan 2 has two separate models,\nBaichuan 2-7B with 7 billion parameters and\nBaichuan 2-13B with 13 billion parameters. Both\nmodels were trained on 2.6 trillion tokens, which\nto our knowledge is the largest to date, more than\ndouble that of Baichuan 1 (Baichuan, 2023b,a).\nWith such a massive amount of training data,', metadata={'page': 1, 'source': 'C:\\Users\\ye_kkk\\AppData\\Local\\Temp\\tmp_2u6nnkm\\tmp.pdf'}), Document(page_content='languages, such as Chinese.\nIn this technical report, we introduce Baichuan\n2, a series of large-scale multilingual language\nmodels. Baichuan 2 has two separate models,\nBaichuan 2-7B with 7 billion parameters and\nBaichuan 2-13B with 13 billion parameters. Both\nmodels were trained on 2.6 trillion tokens, which\nto our knowledge is the largest to date, more than\ndouble that of Baich