In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
# os.environ['CURL_CA_BUNDLE'] = ''
# os.environ['CA_CERTS'] = '/Users/zhou/Dev_Work/ask_pdf/http_ca.crt'
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain import ElasticVectorSearch
from langchain.embeddings import OpenAIEmbeddings
from elasticsearch import Elasticsearch
import yaml

# Read service credentials and endpoints from a local YAML file so that no
# secret is hard-coded in the notebook. Layout implied by the lookups below:
#   openai:
#     OPENAI_API_KEY: ...
#   es:
#     elasticsearch_url: ...
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default locale encoding.
with open("./config.yml", "r", encoding="utf-8") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

OPENAI_API_KEY = cfg["openai"]["OPENAI_API_KEY"]
ELASTICSEARCH_URL = cfg["es"]["elasticsearch_url"]
# Embedding client reused by the indexing cells further down.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [2]:
# Collect every PDF below /content/pdf (searched recursively via the glob) and
# parse each one with PyMuPDFLoader. To ingest a single file instead, construct
# PyMuPDFLoader(<path to pdf>) directly; another loader class can be supplied
# through `loader_cls` for other file types.
loader = DirectoryLoader(
    path='/content/pdf',
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
)


In [3]:
# Run the loader configured in the previous cell and report what came back.
data = loader.load()
print(f'You have {len(data)} document(s) in your data')
if data:
    # Sanity check on the first loaded entry only, not on the whole corpus.
    print(f'There are {len(data[0].page_content)} characters in your document')
else:
    # Without this guard, data[0] raises IndexError when nothing matched.
    print('No documents were loaded - check the loader path and glob pattern')


You have 89 document(s) in your data
There are 4448 characters in your document


In [4]:
# Break the loaded documents into smaller, overlapping chunks before embedding.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) - confirm against the
# installed LangChain version.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
texts = text_splitter.split_documents(data)
print(f'Now you have {len(texts)} documents')


Now you have 374 documents


<!-- Create a new index (新建) -->

In [9]:
# Embed every chunk and store it, together with its metadata, in the
# Elasticsearch index "mmc2" reachable at ELASTICSEARCH_URL.
docsearch = ElasticVectorSearch.from_texts(
    texts=[chunk.page_content for chunk in texts],
    metadatas=[chunk.metadata for chunk in texts],
    embedding=embeddings,
    index_name="mmc2",
    elasticsearch_url=ELASTICSEARCH_URL,
)


Total Tokens: 0
Prompt Tokens: 0
Completion Tokens: 0
Total Cost (USD): $0.0


<!-- Add to an existing index (向原有索引添加) -->

In [None]:
# Alternative to the "create a new index" cell: attach to the existing "mmc2"
# index and append the current chunks to it.
# NOTE(review): running this after the from_texts cell with the same `texts`
# would presumably index every chunk a second time - run one or the other.
elastic_vector_search = ElasticVectorSearch(
    elasticsearch_url=ELASTICSEARCH_URL,
    index_name="mmc2",
    embedding=embeddings,
)

# The store already holds the embedding model and the Elasticsearch URL from
# its constructor, so add_texts only needs the payload. The `embedding=` and
# `elasticsearch_url=` keywords previously passed here were not add_texts
# parameters and have been dropped.
elastic_vector_search.add_texts(
    texts=[chunk.page_content for chunk in texts],
    metadatas=[chunk.metadata for chunk in texts],
    refresh_indices=True,  # make the new documents searchable right away
)
