In [3]:
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
import utils_zh

# Load the variables from the .env file located by find_dotenv() into the
# process environment before the client is built.
_ = load_dotenv(find_dotenv())

# Passing api_key explicitly mirrors the client's default behaviour, so the
# argument could be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# LangChain 是一个用于快速开发应用程序的框架，其组件可以通过链式方式组合

#### 加载向量数据库

In [4]:
# Import from langchain_community directly, matching the ChatOpenAI import
# used later in this notebook, instead of going through the legacy
# `langchain.vectorstores` / `langchain.embeddings.openai` re-export modules.
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OpenAIEmbeddings

# Directory the Chroma store is loaded from.
persist_directory = 'chroma'
# NOTE(review): the embedding model presumably has to match the one used when
# the collection was built — confirm against the indexing notebook.
embedding = OpenAIEmbeddings(model='text-embedding-3-small')
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

  embedding = OpenAIEmbeddings(model='text-embedding-3-small')


In [5]:
# Sanity check: print how many entries the loaded collection holds. This goes
# through Chroma's private `_collection` attribute.
print(vectordb._collection.count())

209


In [6]:
# Fetch the three stored chunks most similar to the question, then show how
# many documents came back.
question = "What are major topics for this class?"
docs = vectordb.similarity_search(query=question, k=3)
len(docs)

3

#### 构造检索式问答链

In [7]:
from langchain_community.chat_models import ChatOpenAI
# Chat model shared by every QA chain below; temperature is set to 0.
# NOTE(review): this is the langchain_community ChatOpenAI wrapper; it looks
# like the later map_reduce cell fails inside it — confirm whether
# langchain_openai.ChatOpenAI (not imported in this notebook) should replace it.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

  llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


In [8]:
from langchain.chains.retrieval_qa.base import RetrievalQA

In [9]:
# Build the retrieval QA chain with default settings: the vector store is
# exposed as a retriever and `llm` produces the answer.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [10]:
# Ask a question through the retrieval QA chain.
# `invoke` is used instead of calling the chain object directly, because
# `Chain.__call__` is deprecated in LangChain in favour of `invoke`.
question = "What are major topics for this class?"
result = qa_chain.invoke({"query": question})

  result = qa_chain({"query": question})


In [11]:
# Display the value stored under the "result" key of the chain output.
result["result"]

'The major topics for this class seem to include machine learning, statistics, and algebra. The course also covers extensions and additional material related to machine learning.'

#### 基于模板的检索式问答链

In [12]:
from langchain.prompts import PromptTemplate

# Custom prompt for the QA chain. {context} and {question} are the template's
# two placeholders. The Chinese instructions tell the model to answer only from
# the context, to say it does not know rather than invent an answer, to use at
# most three concise sentences, and to end with "感谢您的提问！"
# ("Thank you for your question!").
template = """使用以下上下文片段来回答最后的问题。如果你不知道答案，只需说不知道，不要试图编造答案。答案最多使用三个句子。尽量简明扼要地回答。在回答的最后一定要说"感谢您的提问！"
{context}
问题：{question}
有用的回答："""
# Wrap the raw string in a PromptTemplate so it can be handed to the chain.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [13]:
# Rebuild the retrieval chain, this time driven by the custom prompt and
# configured to return the retrieved documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
)

In [14]:
# Chinese-language query against the prompted chain.
# `invoke` replaces the deprecated direct call on the chain object.
question = "机器学习是其中一节的话题吗"
result = qa_chain.invoke({"query": question})

In [15]:
# Display the answer text; with the custom prompt it ends with the required
# closing phrase.
result["result"]

'是的，机器学习是其中一节的话题。在这门课程中，我们将学习如何应用机器学习算法解决问题。感谢您的提问！'

In [16]:
# Inspect the first retrieved document; the "source_documents" key is present
# because the chain was built with return_source_documents=True.
result["source_documents"][0]

Document(metadata={'page': 14, 'source': 'data/cs229_lectures/MachineLearning-Lecture01.pdf'}, page_content="a machine learning class, right? If you go to a carpentry school, they can give you the \ntools of carpentry. They'll give you a hamme r, a bunch of nails, a screwdriver or \nwhatever. But a master carpenter will be able to  use those tools far better than most of us \nin this room. I know a carpen ter can do things with a hammer and nail that I couldn't \npossibly. And it's actually a littl e bit like that in machine learning, too. One thing that's \nsadly not taught in many courses on machine l earning is how to take the tools of machine \nlearning and really, really apply them well.  \nSo in the same way, so the tools of machin e learning are I wanna say quite a bit more \nadvanced than the tools of carpentry. Maybe a carpenter will disagree . But a large part of \nthis class will be just givi ng you the raw tools of machine learning, just the algorithms \nand so on. But what

#### 基于MapReduce的检索式问答链

In [17]:
# Retrieval QA chain that combines the retrieved documents with the
# "map_reduce" chain type instead of the default one.
qa_chain_mr = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=vectordb.as_retriever(),
)

In [20]:
# Query the map_reduce chain. `invoke` replaces the deprecated direct call on
# the chain object.
# NOTE(review): the saved output of this cell is
# "TypeError: unsupported operand type(s) for +=: 'dict' and 'dict'",
# presumably raised while the langchain_community ChatOpenAI wrapper merges
# token-usage dicts during the map step — confirm, and consider switching `llm`
# to langchain_openai.ChatOpenAI (not imported in this notebook).
question = "Is probability a class topic?"
result = qa_chain_mr.invoke({"query": question})
result["result"]

TypeError: unsupported operand type(s) for +=: 'dict' and 'dict'

#### 基于 Refine 的检索式问答链

In [21]:
# Retrieval QA chain using the "refine" chain type. It gets its own name so it
# is not mistaken for the map_reduce chain.
qa_chain_refine = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="refine",
)
# Backward-compatible alias: this cell used to bind the refine chain to
# `qa_chain_mr`, so any later reference to that name keeps resolving to it.
qa_chain_mr = qa_chain_refine

# `invoke` replaces the deprecated direct call on the chain object.
question = "Is probability a class topic?"
result = qa_chain_refine.invoke({"query": question})
result["result"]

'Based on the additional context provided, it is clear that probability is indeed a class topic in the course being described. The instructor, Andrew Ng, mentions using a probabilistic interpretation to derive the next learning algorithm, which will be the first classification algorithm discussed in the course. This indicates that probability concepts will be utilized in the context of machine learning algorithms and classification problems. Additionally, the discussion sections will cover statistics and algebra as refreshers for interested students, and will also delve into extensions of the main lecture material, further emphasizing the importance of probability in the course. Therefore, probability is a relevant and important topic in the course.'

#### Chain 是无状态的，不保留对话记忆

In [22]:
# Reset `qa_chain` to a plain retrieval chain (no custom prompt, no source
# documents) for the statelessness demonstration that follows.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [23]:
# First turn of the statelessness demo: ask whether probability is a topic.
# `invoke` replaces the deprecated direct call on the chain object.
question = "概率论是这节课的一个内容吗"
result = qa_chain.invoke({"query": question})
result["result"]

'是的，概率论是这门课程的一个内容。教授在课程介绍中提到了对基本概率和统计学的熟悉是假设的一部分，因此会涉及概率论的内容。'

In [24]:
# Follow-up question that only makes sense given the previous turn. Only this
# question is passed to the chain, so nothing from the earlier exchange is
# available to it.
# `invoke` replaces the deprecated direct call on the chain object.
question = "为什么需要具备这些知识"
result = qa_chain.invoke({"query": question})
result["result"]

'需要具备机器学习知识的原因是因为机器学习在科学和工业的许多领域都有很大的影响。它是一个高度跨学科的主题，可以应用于计算机视觉、生物学、机器人和语言等问题。此外，机器学习也在自然语言处理等领域发挥作用。学习算法是一种能够解决各种问题的工具，因此具备这些知识可以让你在感兴趣的领域应用最先进的机器学习算法。'