## Loading Data

In [2]:
import os
from langchain.document_loaders import PyPDFLoader

In [13]:
pdf1_path = "D:/rag_assign/data/goog-10-k-2023 (1).pdf"

In [14]:
document1= []
loader = PyPDFLoader(pdf1_path)
pdf_document = loader.load()
document1.extend(pdf_document)

In [15]:
pdf2_path = "data/tsla-20231231-gen.pdf"
document2= []
loader = PyPDFLoader(pdf2_path)
pdf_document = loader.load()
document2.extend(pdf_document)

In [17]:
pdf3_path = "data/uber-10-k-2023.pdf"
document3= []
loader = PyPDFLoader(pdf3_path)
pdf_document = loader.load()
document3.extend(pdf_document)

## Chunking

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [20]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=100)

In [25]:
doc_goog = text_splitter.split_documents(document1)
print(len(doc_goog))

837


In [26]:
doc_tsla = text_splitter.split_documents(document2)
print(len(doc_tsla))

1086


In [27]:
doc_uber= text_splitter.split_documents(document3)
print(len(doc_uber))

2099


## Loading Embedding Model

In [28]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [38]:
hf_bge_embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en")

  from tqdm.autonotebook import tqdm, trange


In [30]:
from langchain.vectorstores import Chroma
import chromadb

In [49]:
db_dir = "D:/rag_assign/db"

In [50]:
client_settings = chromadb.config.Settings(
    is_persistent=True,
    anonymized_telemetry=False,
    persist_directory = db_dir
)

In [51]:
goog_vectorstore = Chroma.from_documents(doc_goog,
                                       hf_bge_embeddings,
                                       client_settings=client_settings,
                                       collection_name="google",
                                       collection_metadata={"hnsw":"cosine"},
                                       persist_directory="/db/google")

ValueError: An instance of Chroma already exists for /db/google with different settings

In [40]:
retriever_goog = goog_vectorstore.as_retriever(search_type="mmr",search_kwargs={"k": 5, "include_metadata": True})

In [42]:
tsla_vectorstore = Chroma.from_documents(doc_tsla,
                                       hf_bge_embeddings,
                                       client_settings=client_settings,
                                       collection_name="tsla",
                                       collection_metadata={"hnsw":"cosine"},
                                       persist_directory="/db/tsla")

In [43]:
retriever_tsla = tsla_vectorstore.as_retriever(search_type="mmr",search_kwargs={"k": 5, "include_metadata": True})

In [44]:
uber_vectorstore = Chroma.from_documents(doc_uber,
                                       hf_bge_embeddings,
                                       client_settings=client_settings,
                                       collection_name="uber",
                                       collection_metadata={"hnsw":"cosine"},
                                       persist_directory="/db/uber")

In [45]:
retriever_uber = uber_vectorstore.as_retriever(search_type="mmr",search_kwargs={"k": 5, "include_metadata": True})

In [41]:
retriever_goog.get_relevant_documents("What is google's revenue?")

  retriever_goog.get_relevant_documents("What is google's revenue?")


[Document(metadata={'page': 63, 'source': 'D:/rag_assign/data/goog-10-k-2023 (1).pdf'}, page_content='Note 2.    Revenues \nDisaggregated Revenues\nThe following table presents revenues disaggregated by type (in millions):\nYear Ended December 31,\n2021 2022 2023\nGoogle Search & other $ 148,951 $ 162,450 $ 175,033 \nYouTube ads  28,845  29,243  31,510 \nGoogle Network  31,701  32,780  31,312 \nGoogle advertising  209,497  224,473  237,855 \nGoogle subscriptions, platforms, and devices  28,032  29,055  34,688 \nGoogle Services total  237,529  253,528  272,543 \nGoogle Cloud  19,206  26,280  33,088'),
 Document(metadata={'page': 34, 'source': 'D:/rag_assign/data/goog-10-k-2023 (1).pdf'}, page_content='• Revenues were $307.4 billion, an increase of 9% year over year, primarily driven by an increase in Google \nServices revenues of $19.0 billion, or 8%, and an increase in Google Cloud revenues of $6.8 billion, or 26%. \n• Total constant currency revenues, which exclude the effect of hedgi

In [47]:
client = chromadb.config.Settings(
    is_persistent=True,
    anonymized_telemetry=False,
    persist_directory = db_dir
)

In [48]:
import os
print(os.getcwd())

d:\rag_assign


In [None]:
query = ""

In [None]:
retriever = []
if "google" in query.lower():
    retriever.append(retriever_goog)
if "tesla" in query.lower():
    retriever.append(retriever_tsla)
if "uber" in query.lower():
    retriever.append(retriever_uber)
    
if retriever is None:
    response = llm(query)
else:
    

NameError: name 'query' is not defined

In [None]:
def combined(list_retriever):
    combined_retriever = MergerRetriever(retrievers = list_retriever)
    return combined_retriever

# Set up document compression pipeline
    filter = EmbeddingsRedundantFilter(embeddings=hf_bge_embeddings)
    reordering = LongContextReorder()
    pipeline = DocumentCompressorPipeline(transformers=[filter, reordering])

# Set up compression retriever
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=pipeline,
        base_retriever=combined_retriever,
        search_kwargs={"k": 3, "include_metadata": True}
)