# MultiVector

In [2]:
import os

# from langchain.storage import InMemoryStore
from langchain.storage import LocalFileStore
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

# Load the corpus: list the data directory, keep the ".txt" entries, and
# collect whatever TextLoader returns for each of them into `docs`.
text_dir = "./Korea info"
files = os.listdir(text_dir)
txt_files = [name for name in files if name.endswith(".txt")]

# Show the raw directory listing (this includes any non-.txt entries).
print(files)

docs = []
for txt_name in txt_files:
    loader = TextLoader(f"{text_dir}/{txt_name}")
    docs += loader.load()

# Create the embedding client, connect to the Pinecone index, and build the
# (initially empty) multi-vector retriever on top of both.

# Only fall back to the placeholder when no key is exported yet, so a real
# OPENAI_API_KEY already present in the environment is never overwritten.
os.environ.setdefault("OPENAI_API_KEY", "{YOUR_OPENAI_KEY}")
embedding = OpenAIEmbeddings()

# Connect database
# The Pinecone client is given only the API key; the embedding model is
# passed to PineconeVectorStore below, which is the object that uses it.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY", "{YOUR_PINECONE_APIKEY}"),
)
index = pc.Index("terry-multivector")
vectordb = PineconeVectorStore(index=index, embedding=embedding)

# Byte store for the full parent documents, rooted at ./store.
store = LocalFileStore("./store")
# Metadata key that links every indexed vector back to its parent document.
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectordb,
    byte_store=store,
    id_key=id_key,
)
import uuid

# Store chunks
# Give every loaded document a fresh UUID, cut it into small child chunks,
# tag each chunk with its parent's id, then index the chunks in the vector
# store and write the full documents to the docstore under the same ids.
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Splitter that produces the smaller child chunks.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs):
    pieces = child_text_splitter.split_documents([parent_doc])
    for piece in pieces:
        # "type" lets later queries filter chunks apart from other entries.
        piece.metadata.update({id_key: parent_id, "type": "chunk"})
    sub_docs += pieces

retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

# Summary
# Ask the chat model for one summary per loaded document, wrap each summary
# in a Document tagged with its parent's id, and index the summaries in the
# same vector store as the chunks.
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOpenAI(max_retries=0)
    | StrOutputParser()
)
# Run the chain over all documents, at most 5 requests in flight at a time.
summaries = chain.batch(docs, {"max_concurrency": 5})

# The link to the parent document lives in metadata[id_key]. The previous
# `ids=` keyword is not a Document field and never reached the object, so it
# is not passed any more.
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id, "type": "summary"})
    for doc_id, summary in zip(doc_ids, summaries)
]

retriever.vectorstore.add_documents(summary_docs)
# Writes the same id -> document pairs as the chunk step; kept so this
# section also works when it is run without the chunk section.
retriever.docstore.mset(list(zip(doc_ids, docs)))

print(summary_docs)

  from tqdm.autonotebook import tqdm


['Korea Transport.txt', 'Korean food.txt', 'Korean tourist destination.txt', 'Korea things todo.txt']
[Document(metadata={'doc_id': '577194cc-7584-45cc-b727-b8566be75cf6', 'type': 'summary', 'text': "South Korea has convenient, efficient, and affordable public transportation systems including subways, buses, trains, and taxis. Major cities like Seoul, Busan, Incheon, Daegu, and Daejeon have well-developed public transit networks. Seoul has a world-class subway system with 10 lines, while Busan has a 4-line metro system. Incheon has two light rail metro lines, Daegu has a simple two-line metro, and Daejeon has a single metro line. The KTX high-speed rail offers comfortable and fast travel between cities. The author is willing to provide more information to help travelers navigate South Korea's transportation systems."}, page_content="South Korea has convenient, efficient, and affordable public transportation systems including subways, buses, trains, and taxis. Major cities like Seoul, B

In [6]:
# Search the vector store for the 10 entries closest to the query,
# restricted to entries whose metadata "type" is "chunk".
query = "What I can enjoy in Korea ?"
docs = vectordb.similarity_search(
    query,
    k=10,
    filter={"$and": [{"type": "chunk"}]},
)

# Peek at a single hit (the sixth one).
print(docs[5])

# Parent-document id of every hit, in result order (duplicates are kept).
ids = [hit.metadata["doc_id"] for hit in docs]

page_content='5. Hike Around Jeju Island - South Korea's popular resort island has much natural beauty to discover like Hallasan Mountain, Manjanggul Lava Tube caves, Cheonjeyeon Falls cascading into the ocean, Sangumburi Crater and Seongsan Ilchulbong Peak sunrise views in addition to scenic coastal drives around the island's lush landscape.' metadata={'doc_id': 'e2b5ea33-a4d4-4e04-9a80-e45b731c341f', 'source': './Korea info/Korea things todo.txt', 'type': 'chunk'}


In [8]:
print(ids)
# ref https://docs.pinecone.io/reference/api/2024-07/data-plane/delete
# Delete by the ids collected by the query cell instead of a pasted copy of
# them, so the call stays correct when the notebook is re-run and new UUIDs
# are generated. dict.fromkeys drops repeated ids while keeping their order.
# NOTE(review): these values come from the "doc_id" metadata field; whether
# they are also the vector ids in the index depends on how the vectors were
# upserted -- confirm, otherwise this call matches nothing.
index.delete(ids=list(dict.fromkeys(ids)))

['e2b5ea33-a4d4-4e04-9a80-e45b731c341f', '913cc0f2-57d4-4c74-9c63-21ca29bde994', '577194cc-7584-45cc-b727-b8566be75cf6', 'e2b5ea33-a4d4-4e04-9a80-e45b731c341f', '913cc0f2-57d4-4c74-9c63-21ca29bde994', 'e2b5ea33-a4d4-4e04-9a80-e45b731c341f', 'e2b5ea33-a4d4-4e04-9a80-e45b731c341f', 'e2b5ea33-a4d4-4e04-9a80-e45b731c341f', 'e2b5ea33-a4d4-4e04-9a80-e45b731c341f', 'e2b5ea33-a4d4-4e04-9a80-e45b731c341f']


{}