# Retrieval

In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load it (presumably populating os.environ so the
# lookup below succeeds — confirm against python-dotenv behaviour).
_ = load_dotenv(find_dotenv()) # read local .env file

# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key  = os.environ['OPENAI_API_KEY']

## Load persisted vectordb


In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

In [3]:
persist_directory = "../data/chroma/"

In [4]:
# Open the Chroma store located at `persist_directory`, supplying
# OpenAIEmbeddings as its embedding function.
# NOTE(review): presumably this must be the same embedding model the stored
# vectors were built with — confirm against the notebook that created the store.
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=OpenAIEmbeddings()
)

In [5]:
vectordb._collection.count()

209

## Small DB from texts

In [6]:
# Three toy sentences: the first two both describe the large fruiting body of
# Amanita phalloides (near-duplicates), while the third covers its toxicity.
texts = [
    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]

In [7]:
smalldb = Chroma.from_texts(texts, embedding=OpenAIEmbeddings())

In [8]:
question = "Tell me about all-white mushrooms with large fruiting bodies"

### SS: Similarity Search

Redundancy in the returned results: the top two hits convey nearly the same information.

In [9]:
smalldb.similarity_search(question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).', metadata={})]

### MMR: Maximum Marginal Relevance

In [10]:
smalldb.max_marginal_relevance_search(question, k=2)

Number of requested results 20 is greater than number of elements in index 3, updating n_results = 3


[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.', metadata={})]

## Addressing Diversity: Maximum Marginal Relevance

In [11]:
question = "What did they say about matlab?"

### SS

Duplicate documents fetched -> redundancy in the retrieved information

In [12]:
docs_ss = vectordb.similarity_search(question, k=3)

In [13]:
docs_ss[0].page_content[0:500]

"those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn't.  \nSo I guess for those of you that haven't s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it's sort of an extremely easy to  learn tool to u"

In [14]:
docs_ss[1].page_content[0:500]

"those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn't.  \nSo I guess for those of you that haven't s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it's sort of an extremely easy to  learn tool to u"

### MMR

Maximum diversity in retrieved information

In [15]:
docs_mmr = vectordb.max_marginal_relevance_search(question, k=3)

In [16]:
docs_mmr[0].page_content[0:500]

"those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn't.  \nSo I guess for those of you that haven't s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it's sort of an extremely easy to  learn tool to u"

In [17]:
docs_mmr[1].page_content[0:500]

'algorithm then? So what’s different? How come  I was making all that noise earlier about \nleast squares regression being a bad idea for classification problems and then I did a \nbunch of math and I skipped some steps, but I’m, sort of, claiming at the end they’re \nreally the same learning algorithm?  \nStudent: [Inaudible] constants?  \nInstructor (Andrew Ng) :Say that again.  \nStudent: [Inaudible]  \nInstructor (Andrew Ng) :Oh, right. Okay, cool.'

## Addressing Specificity: working with metadata

In [18]:
vdb_doc = vectordb._collection.peek(1)
type(vdb_doc)

dict

In [19]:
vdb_doc.keys()

dict_keys(['ids', 'embeddings', 'documents', 'metadatas'])

In [20]:
vdb_doc['metadatas']

[{'source': '../data/docs/cs229_lectures/MachineLearning-Lecture02.pdf',
  'page': 11}]

In [21]:
question = "What did they say about regression in lecture 3?"


In [22]:
# Restrict the search to chunks whose `source` metadata equals the Lecture 3
# PDF path, so the "lecture 3" constraint in the question is enforced by the
# filter rather than left to embedding similarity alone.
docs = vectordb.similarity_search(
    question,
    k=3,
    filter={'source': '../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf'}
)

In [23]:
for doc in docs:
    print(doc.metadata)

{'source': '../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}
{'source': '../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}
{'source': '../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 4}


## Addressing Specificity: working with metadata using `SelfQueryRetriever`


`SelfQueryRetriever` uses an LLM to extract:

1. The `query` string to use for vector search
2. A metadata filter to pass in 

In [24]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.llms import OpenAI
from langchain.chains.query_constructor.base import AttributeInfo

In [25]:
llm = OpenAI(temperature=0)

In [26]:
# Schema of the metadata fields handed to SelfQueryRetriever.from_llm.
# The `source` description enumerates the exact path strings (Lecture01-03)
# as they appear in the stored metadata.
metadata_field_info = [
    AttributeInfo(
        name='source', 
        description="""The lecture the document belongs to, should be one of `../data/docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `../data/docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf`""",
        type="string"
    ),
    
    AttributeInfo(
        name="page",
        # NOTE(review): this triple-quoted string keeps its leading/trailing
        # newlines and indentation as part of the description text.
        description="""
        The page number the document chunk belongs to
        """,
        type="integer"
    )
]

In [27]:
doc_content_desc = "Machine Learning Lecture Notes"


In [28]:
# Build a self-querying retriever over `vectordb`. `llm` is given the document
# description and the metadata schema; per the notes above, it extracts both
# the query string for vector search and a metadata filter.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=doc_content_desc,
    metadata_field_info=metadata_field_info,
)

In [29]:
question = "What did they say about regression in the third lecture?"

In [30]:
docs = retriever.get_relevant_documents(question)



In [31]:
docs

[Document(page_content='Student: It’s the lowest it –  \nInstructor (Andrew Ng) :No, exactly. Right. So zero to the same, this is not the same, \nright? And the reason is, in logi stic regression this is diffe rent from before, right? The \ndefinition of this H subscript theta of XI is not the same as the definition I was using in \nthe previous lecture. And in pa rticular this is no longer thet a transpose XI. This is not a \nlinear function anymore. This is  a logistic function of theta transpose XI. Okay? So even \nthough this looks cosmetically similar, even though this is similar on the surface, to the \nBastrian descent rule I derive d last time for least squares regression this is actually a \ntotally different learning algorithm. Okay? And it turns out that there’s actually no \ncoincidence that you ended up with the same l earning rule. We’ll actually talk a bit more \nabout this later when we talk about generalized linear models. But this is one of the most \nelegant generali

In [32]:
for doc in docs:
    print(doc.metadata)

{'source': '../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}
{'source': '../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}
{'source': '../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}
{'source': '../data/docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}


Now all the retrieved documents are from Lecture 3! The LLM was able to parse the metadata reference out of the question and decide which filter to pass.

## Contextual Compression

- Another approach for improving the quality of retrieved docs is compression.

- Information most relevant to a query may be buried in a document with a lot of irrelevant text. 

- Passing that full document through your application can lead to more expensive LLM calls and poorer responses.

- Contextual compression is meant to fix this. 
- `LLMChainExtractor` extracts the relevant parts of the documents



In [33]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [34]:
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Each entry is rendered as ``Document <n>:`` (1-based), a blank line, then
    the document's ``page_content``. Consecutive entries are separated by a
    line of 100 dashes.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for number, document in enumerate(docs, start=1):
        sections.append("Document " + str(number) + ":\n\n" + document.page_content)
    print(separator.join(sections))


In [35]:
# Re-create the LLM with the same setting used earlier (temperature=0) and
# wrap it in an LLMChainExtractor, which extracts the relevant parts of the
# documents.
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [36]:
# Pair the extractor with an MMR-based retriever over `vectordb`
# (search_type="mmr"); `compressor` is supplied as the base compressor.
# NOTE(review): presumably retrieved documents are passed through the
# compressor before being returned — confirm against the LangChain docs.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type="mmr")
)

In [37]:
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms."
----------------------------------------------------------------------------------------------------
Document 2:

"And the student said, "Oh, it was the MATLAB." So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it."


## Other types of retrieval

In [38]:
from langchain.retrievers import SVMRetriever, TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [42]:
# Load the Lecture 1 PDF through PyPDFLoader.
fpath = "../data/docs/cs229_lectures/MachineLearning-Lecture01.pdf"
loader = PyPDFLoader(fpath)
docs = loader.load()

# Gather the text of every loaded document, then merge everything into one
# space-separated string.
all_page_text = [page.page_content for page in docs]
all_text = " ".join(all_page_text)

In [43]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = text_splitter.split_text(all_text)

In [4]:
svm_retriever = SVMRetriever.from_texts(splits, embeddings=OpenAIEmbeddings())

In [None]:
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [None]:
question = "What are the major topics for this class?"
docs_svm = svm_retriever.get_relevant_documents(question)
docs_svm[0]

In [None]:
# Query the TF-IDF retriever (not the SVM one) so this cell shows the top hit
# from the TF-IDF method for the same question.
docs_tfidf = tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]