In [1]:
# TODO
import os 
import openai 
from dotenv import load_dotenv 

# Read variables from a local .env file into the process environment.
load_dotenv()

# Expose the key on the openai module; the value is None when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
# Requires the `lark` package (needed by the self-query retriever used below): pip install lark

### Maximum Marginal Relevance (MMR) 
- You may not always want only the most similar responses: the top hits are often near-duplicates of each other, so MMR trades a little similarity for diversity


### MMR Algorithm 
- Query the vectorstore
- Choose the `fetch_k` most similar responses
- Within those responses, choose the `k` most diverse

### LLM Aided Retrieval 
- There are several situations where the query applied to the DB is more than just the question asked
- One is self-querying (SelfQuery), where we use an LLM to convert the user's question into a structured query: a search string plus a metadata filter

### Compression
- Increase the number of results you can put in the context by shrinking the responses to only the relevant information

In [3]:
from langchain_pinecone import PineconeVectorStore 
from langchain_openai import OpenAIEmbeddings 

# Three short passages about Amanita phalloides. The first two both describe
# the large fruiting body; the third is about toxicity.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [4]:
# Give every passage a stable id. Without explicit ids, each run of this cell
# upserts fresh copies of the same texts under new ids, the index accumulates
# duplicates, and a plain similarity search can return the same passage twice.
# NOTE: copies written by earlier runs stay in the index until they are deleted.
text_ids = [f"mushroom-{position}" for position in range(len(texts))]

# Embed the passages and upsert them into the existing Pinecone index.
vectorstore = PineconeVectorStore.from_texts(
    texts,
    embedding=OpenAIEmbeddings(),
    ids=text_ids,
    index_name="deeplearningai-langchain",
)

question = "Tell me about all-white mushrooms with large fruiting bodies"

# Plain similarity search: the two nearest passages, with no diversity constraint.
vectorstore.similarity_search(question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

In [5]:
# Using MMR
# Fetch the `fetch_k` most similar passages, then keep the `k` most diverse of them.
vectorstore.max_marginal_relevance_search(
    question,
    fetch_k=3,
    k=2,
)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

In [6]:
# Working with MetaData
question = "What did they say about regression in the third lecture?"

# Only consider chunks whose `source` metadata is the third lecture's PDF.
lecture_filter = {"source": "data/MachineLearning-Lecture03.pdf"}
docs = vectorstore.similarity_search(question, k=3, filter=lecture_filter)

# Show where each hit came from rather than its text.
for d in docs:
    print(d.metadata)

{'page': 0.0, 'source': 'data/MachineLearning-Lecture03.pdf'}
{'page': 14.0, 'source': 'data/MachineLearning-Lecture03.pdf'}
{'page': 10.0, 'source': 'data/MachineLearning-Lecture03.pdf'}


In [7]:
# Working with metadata using self-query retriever 

from langchain_openai import OpenAI 
from langchain.retrievers.self_query.base import SelfQueryRetriever 
from langchain.chains.query_constructor.base import AttributeInfo 

# One AttributeInfo per metadata field that a query is allowed to filter on.
source_field = AttributeInfo(
    name="source",
    type="string",
    description=(
        "The lecture the chunk is from, should be one of "
        "`data/MachineLearning-Lecture01.pdf`, `data/MachineLearning-Lecture02.pdf`, "
        "or `data/MachineLearning-Lecture03.pdf`"
    ),
)
page_field = AttributeInfo(
    name="page",
    type="integer",
    description="The page from the lecture",
)

metadata_field_info = [source_field, page_field]

In [8]:
# Short description of what the stored documents contain; passed to the
# retriever together with the metadata schema.
document_content_description = "Lecture notes"

# Completion model that turns the user's question into a query for the vectorstore.
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)

# Self-query retriever over the vectorstore; verbose=True is passed through so
# the run can be inspected.
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    verbose=True,
)

In [9]:
question = "What did they say about regression in the third lecture?"

# `get_relevant_documents` is deprecated and emits a deprecation warning;
# `invoke` is the supported entry point and returns the same list of documents.
docs = retriever.invoke(question)

# Show where each hit came from rather than its text.
for d in docs:
    print(d.metadata)

  warn_deprecated(


{'page': 0.0, 'source': 'data/MachineLearning-Lecture03.pdf'}
{'page': 14.0, 'source': 'data/MachineLearning-Lecture03.pdf'}
{'page': 10.0, 'source': 'data/MachineLearning-Lecture03.pdf'}
{'page': 6.0, 'source': 'data/MachineLearning-Lecture03.pdf'}


In [10]:
# Compression 
from langchain.retrievers import ContextualCompressionRetriever 
from langchain.retrievers.document_compressors import LLMChainExtractor 

def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Documents are numbered from 1 and separated by a line of 100 dashes.
    Each item in ``docs`` must expose a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))


In [11]:
# Wrap our vectorstore
# The compressor is built from an LLM; it is what shrinks each retrieved
# document down to the parts relevant to the question.
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)
compressor = LLMChainExtractor.from_llm(llm)


In [12]:
# Pair the default vectorstore retriever with the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectorstore.as_retriever(),
    base_compressor=compressor,
)

In [13]:
question = "What did they say about matlab?"

# `get_relevant_documents` is deprecated in favour of `invoke`, which returns
# the same list of documents.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

Document 1:

- MATLAB is part of the programming language that makes it easy to write codes using matrices, to write code for numerical routines, to move data around, and to plot data.
- MATLAB is an extremely easy to learn tool to use for implementing a lot of learning algorithms.
- There is a software package called Octave that can be downloaded for free off the Internet and can be used for the purposes of this class.
- Octave has somewhat fewer features than MATLAB, but it's free and will work for just about everything in this class.
- A colleague of the professor teaches another machine learning course and one of his students from 10 years ago said that he learned MATLAB in the course and it has helped him make lots of money.
- The professor will have a short MATLAB tutorial in one of the discussion sections for those who don't know it.
- Attendance at discussion sections is optional, but they will also be recorded and televised.
- Discussion sections will be used to go over prereq

In [14]:
# Combining Various Techniques
# Compression on top of a base retriever that searches with MMR.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectorstore.as_retriever(search_type="mmr"),
    base_compressor=compressor,
)

In [15]:
question = "what did they say about matlab?"

# `get_relevant_documents` is deprecated in favour of `invoke`, which returns
# the same list of documents.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

Document 1:

- MATLAB is part of the programming language that makes it easy to write codes using matrices, to write code for numerical routines, to move data around, and to plot data.
- MATLAB is an extremely easy to learn tool to use for implementing a lot of learning algorithms.
- There is a software package called Octave that can be downloaded for free off the Internet and has somewhat fewer features than MATLAB, but will work for just about everything in this class.
- A colleague of the professor teaches another machine learning course and a student from 10 years ago came back to thank him for teaching MATLAB, which has helped him make lots of money.
- The professor will have a short MATLAB tutorial in one of the discussion sections for those who don't know it.
- Attendance at discussion sections is optional, but they will also be recorded and televised.
- Discussion sections will be used to go over prerequisites for the class and to go over extensions for the material taught in t

In [16]:
# Other types of retrieval 
# TFIDF, SVM 
from langchain.retrievers import SVMRetriever 
from langchain.retrievers import TFIDFRetriever 
from langchain.document_loaders import PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter 

# Load: read the first lecture PDF and flatten all pages into one string.
loader = PyPDFLoader("data/MachineLearning-Lecture01.pdf")
pages = loader.load()
all_page_text = [page.page_content for page in pages]
joined_page_text = " ".join(all_page_text)

# Split: cut the text into chunks of at most 1500 characters, with 150
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=150,
    chunk_size=1500,
)
splits = text_splitter.split_text(joined_page_text)

In [18]:
# Retrieve
# Build two retrievers over the same chunks. The TF-IDF retriever is built from
# the raw text alone, while the SVM retriever is also given an embedding model.
tfidf_retriever = TFIDFRetriever.from_texts(splits)
svm_retriever = SVMRetriever.from_texts(splits, OpenAIEmbeddings())

In [19]:
question = "What are major topics for this class?"

# `get_relevant_documents` is deprecated in favour of `invoke`, which returns
# the same list of documents.
docs_svm = svm_retriever.invoke(question)

# Display the top-ranked chunk.
docs_svm[0]

Document(page_content="let me just check what questions you have righ t now. So if there are no questions, I'll just \nclose with two reminders, which are after class today or as you start to talk with other \npeople in this class, I just encourage you again to start to form project partners, to try to \nfind project partners to do your project with. And also, this is a good time to start forming \nstudy groups, so either talk to your friends  or post in the newsgroup, but we just \nencourage you to try to star t to do both of those today, okay? Form study groups, and try \nto find two other project partners.  \nSo thank you. I'm looking forward to teaching this class, and I'll see you in a couple of \ndays.   [End of Audio]  \nDuration: 69 minutes")

In [20]:
question = "what did they say about matlab?"

# `get_relevant_documents` is deprecated in favour of `invoke`, which returns
# the same list of documents.
docs_tfidf = tfidf_retriever.invoke(question)

# Display the top-ranked chunk.
docs_tfidf[0]

Document(page_content="Saxena and Min Sun here did, wh ich is given an image like this, right? This is actually a \npicture taken of the Stanford campus. You can apply that sort of cl ustering algorithm and \ngroup the picture into regions. Let me actually blow that up so that you can see it more \nclearly. Okay. So in the middle, you see the lines sort of groupi ng the image together, \ngrouping the image into [inaudible] regions.  \nAnd what Ashutosh and Min did was they then  applied the learning algorithm to say can \nwe take this clustering and us e it to build a 3D model of the world? And so using the \nclustering, they then had a lear ning algorithm try to learn what the 3D structure of the \nworld looks like so that they could come up with a 3D model that you can sort of fly \nthrough, okay? Although many people used to th ink it's not possible to take a single \nimage and build a 3D model, but using a lear ning algorithm and that sort of clustering \nalgorithm is the first ste