# MultiVector

In [3]:
import os
from langchain_community.document_loaders import TextLoader
#from langchain.storage import InMemoryStore
from langchain.storage import LocalFileStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain.retrievers import ParentDocumentRetriever
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_vector import MultiVectorRetriever

import pinecone

# Load every .txt file found directly under the corpus directory.
text_dir = "./Korea info"
files = os.listdir(text_dir)
# Keep only the plain-text files; other directory entries are skipped.
txt_files = list(filter(lambda name: name.endswith(".txt"), files))

# Show the raw directory listing (including the entries that were skipped).
print(files)
docs = []
for txt_name in txt_files:
    loaded = TextLoader(f"{text_dir}/{txt_name}").load()
    docs += loaded
    
# Create the embedding client.
# NOTE: the key below is a placeholder and must be replaced before running;
# this assignment overwrites any OPENAI_API_KEY already set in the environment.
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()

# Connect to the Pinecone index (the API key is a placeholder as well).
pinecone.init(api_key="{YOUR_PINECONE_APIKEY}", environment="gcp-starter")
# One-time setup, to be run once before the index is first used:
#pinecone.create_index("terry-multivector",dimension=1536,metric="cosine")
index = pinecone.Index("terry-multivector")
# Metadata key handed to the vector store as its text key.
text_field = "text"
# Pass the Embeddings object itself rather than its bound `embed_query`
# method: handing the store a bare callable is the deprecated form.
vectordb = Pinecone(index, embedding, text_field)

# Parent documents are persisted on local disk under ./store.
store = LocalFileStore("./store")
# Metadata key that links each indexed vector back to its parent document.
id_key="doc_id"

# The retriever (empty to start): vectors live in Pinecone, parent documents
# in the local file store.
retriever = MultiVectorRetriever(
    vectorstore=vectordb,
    byte_store=store,
    id_key=id_key,
)
import uuid

# Give every source document a fresh random ID; its chunks carry the same ID
# in their metadata so they can be traced back to the parent.
doc_ids = [str(uuid.uuid4()) for _ in docs]
# Splitter that cuts each parent document into smaller child chunks.
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

sub_docs = []
for parent_id, parent_doc in zip(doc_ids, docs):
    for piece in child_text_splitter.split_documents([parent_doc]):
        # Tag the chunk with its parent's ID and mark it as a chunk record.
        piece.metadata.update({id_key: parent_id, 'type': 'chunk'})
        sub_docs.append(piece)
# Index the chunks in the vector store, then save the parents keyed by ID.
retriever.vectorstore.add_documents(sub_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))

# Summary: produce one LLM-written summary per source document and index it
# next to the chunks, tagged with the same parent ID.
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOpenAI(max_retries=0)
    | StrOutputParser()
)
# Summarize all documents, running at most five requests at a time.
summaries = chain.batch(docs, {"max_concurrency": 5})

# The parent link lives in the metadata. The former `ids=` keyword is not a
# Document field and never reached the stored document, so it is dropped.
summary_docs = [
    Document(page_content=summary, metadata={id_key: parent_id, 'type': 'summary'})
    for parent_id, summary in zip(doc_ids, summaries)
]

retriever.vectorstore.add_documents(summary_docs)
# Writes the same (ID, parent document) pairs that were stored when the
# chunks were indexed; kept so the parents are present after this step.
retriever.docstore.mset(list(zip(doc_ids, docs)))

print(summary_docs)

  from tqdm.autonotebook import tqdm


['Korea Transport.txt', 'Korean food.txt', 'Korean tourist destination.txt', '.ipynb_checkpoints', 'Korea things todo.txt']




[Document(page_content="This document provides an overview of South Korea's major public transportation systems, focusing on subway systems, buses, trains, and taxis. It highlights the convenience, efficiency, and affordability of public transportation in the country. The document then provides descriptions of the public transit systems in five major cities: Seoul, Busan, Incheon, Daegu, and Daejeon. It mentions the advanced subway system in Seoul, the 4-line metro system in Busan, the light rail metro lines in Incheon, the simple metro system in Daegu, and the single metro line in Daejeon. The document concludes by offering further assistance in providing transportation information and recommendations for visitors to South Korea.", metadata={'doc_id': '6920a86c-b753-48f7-adfe-e8e8db34d1f3', 'type': 'summary', 'text': "This document provides an overview of South Korea's major public transportation systems, focusing on subway systems, buses, trains, and taxis. It highlights the convenie

In [6]:
from IPython.display import JSON
query = "What I can enjoy in Korea ?"
# Search the vector store for the ten nearest records, restricted by the
# metadata filter to those tagged as chunks (summaries are excluded).
# NOTE: this rebinds `docs`, replacing the source-document list loaded earlier.
# NOTE: vectordb.similarity_search_with_score(query, k=10) is the variant to
# use when scores are wanted; each hit is then indexed as hit[0] for the
# document and hit[1] for the score.
docs = vectordb.similarity_search(
    query,
    k=10,
    filter={
        "$and":[
            {"type":"chunk"},
        ]
    },
)
# Show the sixth hit as a sample; guarded because the search can return
# fewer than six results, in which case indexing would raise IndexError.
if len(docs) > 5:
    print(docs[5])
# Left empty here (nothing is appended); a later cell prints it.
ids = []
for doc in docs:
    # Print each hit's record type.
    print(doc.metadata['type'])


page_content="20. Indulge in a Templestay - Often tucked among mountain terrain, Korea's serene Buddhist temples open their doors to visitors interested in experiencing meditation, finding inner calm and trying vegetarian temple fare during an extended few days focused on self reflection enhanced through tranquil natural environments promoting peace of mind." metadata={'doc_id': '99de20bc-96a6-48f6-b78a-957314a2a325', 'source': './Korea info/Korea things todo.txt', 'type': 'chunk'}
chunk
chunk
chunk
chunk
chunk
chunk
chunk
chunk
chunk
chunk


In [54]:
# Show the ID list defined by the search cell.
print(ids)
#ref https://medium.com/@james.li/cheatsheet-for-pinecone-crud-using-langchain-caa0a5f97fe0
# Delete a single vector from the store by its ID.
target_ids = ['27c0afe6-6f08-4b4b-95a7-f2b2c851b59f']
vectordb.delete(ids=target_ids)

[]
