In [1]:
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
import utils_zh

_ = load_dotenv(find_dotenv())

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
    )

# langchain是一种快速开发应用程序框架，组件可以链式组合

In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

persist_directory = 'chroma'

In [3]:
embedding = OpenAIEmbeddings(model='text-embedding-3-small')

  embedding = OpenAIEmbeddings(model='text-embedding-3-small')


In [4]:
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

In [5]:
print(vectordb._collection.count())

209


In [6]:
texts = [
    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]

In [7]:
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [8]:
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [9]:
smalldb.similarity_search(question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]

In [10]:
smalldb.max_marginal_relevance_search(question, k=2, fetch_k=3)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.')]

#### 最大边际相关性(MMR) Maximum marginal relevance

权衡：查询的相关性和结果的多样性

In [11]:
question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(question, k=3)

In [12]:
docs_ss[0].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [13]:
docs_mmr = vectordb.max_marginal_relevance_search(question, k=3)
docs_mmr[0].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [14]:
docs_ss[1].page_content[:100]

'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people '

In [15]:
docs_mmr[1].page_content[:100]

"mathematical work, he feels like he's disc overing truth and beauty in the universe. And \nhe says it"

#### 使用元数据

In [16]:
question = "what did they say about regression in the third lecture?"

docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source":"data/cs229_lectures/MachineLearning-Lecture03.pdf"}
)

In [17]:
for d in docs:
    print(d.metadata)

{'page': 0, 'source': 'data/cs229_lectures/MachineLearning-Lecture03.pdf'}
{'page': 14, 'source': 'data/cs229_lectures/MachineLearning-Lecture03.pdf'}
{'page': 6, 'source': 'data/cs229_lectures/MachineLearning-Lecture03.pdf'}


In [18]:
docs[0].page_content[:100]

'MachineLearning-Lecture03  \nInstructor (Andrew Ng) :Okay. Good morning and welcome b ack to the thir'

In [19]:
docs[1].page_content[:100]

'Student: It’s the lowest it –  \nInstructor (Andrew Ng) :No, exactly. Right. So zero to the same, thi'

In [20]:
docs[2].page_content[:100]

'data sets as well. So don’t want to talk about  that. If you’re interested, look up the work \nof And'

#### 元数据自查询检索器

In [21]:
from langchain_community.llms.openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [22]:
llm = OpenAI(temperature=0)

  llm = OpenAI(temperature=0)


In [23]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    )
]

In [24]:
# !pip install lark

In [25]:
document_content_description="Lecture notes"
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [26]:
question = "what did they say about regression in the third lecture?"
docs = retriever.get_relevant_documents(question)

for d in docs:
    print(d.metadata)

  docs = retriever.get_relevant_documents(question)


In [27]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [28]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [29]:
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [30]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [31]:
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

- "those homeworks will be done in either MATLA B or in Octave"
- "I know some people call it a free ve rsion of MATLAB"
- "MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data."
- "there's also a software package called Octave that you can download for free off the Internet."
- "it has somewhat fewer features than MATLAB, but it's free, and for the purposes of this class, it will work for just about everything."
- "once a colleague of mine at a different university, not at Stanford, actually teaches another machine learning course."
----------------------------------------------------------------------------------------------------
Document 2:

- "those homeworks will be done in either MATLA B or in Octave"
- "I know some people call it a free ve rsion of MATLAB"
- "MATLAB is I guess part of the programming language that makes it very easy to write 

In [32]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type= "mmr")
)

In [33]:
question = "what did they say about matlab?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Document 1:

- "those homeworks will be done in either MATLA B or in Octave"
- "I know some people call it a free ve rsion of MATLAB"
- "MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data."
- "there's also a software package called Octave that you can download for free off the Internet."
- "it has somewhat fewer features than MATLAB, but it's free, and for the purposes of this class, it will work for just about everything."
- "once a colleague of mine at a different university, not at Stanford, actually teaches another machine learning course."
----------------------------------------------------------------------------------------------------
Document 2:

"Oh, it was the MATLAB."


In [34]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [35]:
loader = PyPDFLoader("data/cs229_lectures/MachineLearning-Lecture01.pdf")
pages = loader.load()
all_page_text = [p.page_content for p in pages]
joined_page_text = " ".join(all_page_text)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
splits = text_splitter.split_text(joined_page_text)

In [36]:
# !pip install scikit-learn

In [37]:
svm_retriever = SVMRetriever.from_texts(splits, embedding)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [38]:
question = "What are major topics for this class?"  # 这门课的主要主题是什么？
docs_svm = svm_retriever.get_relevant_documents(question)
docs_svm[0]

Document(page_content="or find project partners or discuss homework problems and so on, and it's not monitored \nby the TAs and me. So feel free to ta lk trash about this class there.  \nIf you want to contact the teaching staff, pl ease use the email address written down here, \ncs229-qa@cs.stanford.edu. This goes to an acc ount that's read by all the TAs and me. So \nrather than sending us email individually, if you send email to this account, it will \nactually let us get back to you maximally quickly with answers to your questions.  \nIf you're asking questions about homework probl ems, please say in the subject line which \nassignment and which question the email refers to, since that will also help us to route \nyour question to the appropriate TA or to me  appropriately and get the response back to \nyou quickly.  \nLet's see. Skipping ahead — let's see — for homework, one midterm, one open and term \nproject. Notice on the honor code. So one thi ng that I think will help you to

In [39]:
question = "what did they say about matlab?"  # 他们关于Matlab说了些什么？
docs_tfidf = tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(page_content="Saxena and Min Sun here did, wh ich is given an image like this, right? This is actually a \npicture taken of the Stanford campus. You can apply that sort of cl ustering algorithm and \ngroup the picture into regions. Let me actually blow that up so that you can see it more \nclearly. Okay. So in the middle, you see the lines sort of groupi ng the image together, \ngrouping the image into [inaudible] regions.  \nAnd what Ashutosh and Min did was they then  applied the learning algorithm to say can \nwe take this clustering and us e it to build a 3D model of the world? And so using the \nclustering, they then had a lear ning algorithm try to learn what the 3D structure of the \nworld looks like so that they could come up with a 3D model that you can sort of fly \nthrough, okay? Although many people used to th ink it's not possible to take a single \nimage and build a 3D model, but using a lear ning algorithm and that sort of clustering \nalgorithm is the first ste