In [1]:
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
import utils_zh

# Load the entries of the .env file located by find_dotenv() into the process
# environment; the return value is intentionally discarded.
_ = load_dotenv(find_dotenv())

# Create the OpenAI client. Reading OPENAI_API_KEY explicitly matches the
# client's default behaviour, so the argument could be omitted.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# LangChain 是一个用于快速开发应用程序的框架，其组件可以链式组合

#### 读文档

In [2]:
from langchain.document_loaders import PyPDFLoader

# One loader per PDF. Lecture01 is listed twice on purpose: the duplicate
# deliberately makes the data messy.
loaders = list(map(PyPDFLoader, (
    "data/cs229_lectures/MachineLearning-Lecture01.pdf",
    "data/cs229_lectures/MachineLearning-Lecture01.pdf",
    "data/cs229_lectures/MachineLearning-Lecture02.pdf",
    "data/cs229_lectures/MachineLearning-Lecture03.pdf",
)))

# Accumulate everything each loader returns into a single list, in loader order.
docs = []
for loader in loaders:
    docs += loader.load()

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured with chunk_size=1500 and chunk_overlap=150.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=1500)

In [4]:
# Split the loaded documents into chunks with the splitter configured earlier.
# NOTE: `docs` must still hold the loader output here. Later cells rebind `docs`
# to similarity-search results, so re-running this cell out of order would
# split those results instead of the PDFs.
splits = text_splitter.split_documents(docs)

In [5]:
# Number of chunks produced by the split.
len(splits)

209

#### Embeddings

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding model used both for the similarity demo and for the vector store.
# NOTE(review): the saved cell output echoes this line, which looks like the
# tail of a warning (possibly a deprecation notice for this import path) —
# confirm against the installed LangChain version.
embedding = OpenAIEmbeddings(model='text-embedding-3-small')

  embedding = OpenAIEmbeddings(model='text-embedding-3-small')


In [7]:
# Two sentences with nearly the same meaning plus one unrelated sentence,
# used in the following cells to compare embedding similarity.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [8]:
# Embed the three sample sentences, one embed_query call per sentence, in order.
embedding1, embedding2, embedding3 = map(
    embedding.embed_query,
    (sentence1, sentence2, sentence3),
)

In [9]:
import numpy as np
# Dot product of two embeddings: the higher the score, the more similar the sentences.
# NOTE(review): a raw dot product only equals cosine similarity when both vectors
# are unit-length — presumably true for this embedding model; confirm.
np.dot(embedding1, embedding2)

0.833843782523142

In [10]:
# Similarity between the "dogs" sentence and the unrelated weather sentence.
np.dot(embedding1, embedding3)

0.21894574322018356

In [11]:
# Similarity between the "canines" sentence and the unrelated weather sentence.
np.dot(embedding2, embedding3)

0.18502072045263457

#### VectorStores

In [12]:
# Chroma is used because it is lightweight and keeps its data in memory.
# NOTE(review): this notebook passes persist_directory when building the store,
# so the data is presumably also written to disk — confirm.
from langchain.vectorstores import Chroma

In [13]:
# Directory passed to Chroma as persist_directory when the vector store is built.
persist_directory='chroma/'
# Cleanup of that directory, left disabled.
# NOTE(review): with the cleanup disabled, rebuilding the store on a later run
# may append to existing data and duplicate entries — confirm, and clear the
# directory before a fresh run if so.
# !rm -rf 'chroma/'

In [15]:
# Environment repair commands kept for reference; all of them are disabled.
# NOTE(review): presumably used to reinstall the hnswlib binding that Chroma
# relies on — confirm. If needed, run them as `%pip ...` so they target the
# kernel's environment.
# NOTE(review): this cell's execution count (15) is higher than that of the
# next cell (14), so the notebook was last run out of order; re-run top to
# bottom before sharing.
# pip uninstall hnswlib
# pip uninstall chroma-hnswlib
# pip install chroma-hnswlib

In [14]:
# Build the Chroma vector store from the split chunks.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,  # storage (cache) path for the store
    embedding=embedding,                  # embedding model defined earlier
    documents=splits,                     # chunks split from the PDFs
)

In [16]:
# Number of entries in the store; expected to equal len(splits).
# NOTE(review): `_collection` is a private attribute of the wrapper and may
# change between library versions.
print(vectordb._collection.count())

209


In [17]:
# Retrieve the 3 chunks most similar to the question.
# NOTE(review): this rebinds `docs`, which previously held the loaded PDF
# documents; the split cell cannot be re-run correctly after this point
# without reloading the PDFs.
question = "is there an email i can ask for help"  # asks whether there is an email address to contact for help
docs = vectordb.similarity_search(question, k = 3)

In [18]:
# Number of chunks returned by the search (should match k).
len(docs)

3

In [19]:
# Text of the highest-ranked chunk.
docs[0].page_content

"So all right, online resources. The class has a home page, so it's in on the handouts. I \nwon't write on the chalkboard — http:// cs229.stanford.edu. And so when there are \nhomework assignments or things like that, we  usually won't sort of — in the mission of \nsaving trees, we will usually not give out many handouts in class. So homework \nassignments, homework solutions will be posted online at the course home page.  \nAs far as this class, I've also written, a nd I guess I've also revised every year a set of \nfairly detailed lecture notes that cover the te chnical content of this  class. And so if you \nvisit the course homepage, you'll also find the detailed lecture notes that go over in detail \nall the math and equations and so on  that I'll be doing in class.  \nThere's also a newsgroup, su.class.cs229, also written on the handout. This is a \nnewsgroup that's sort of a forum for people in  the class to get to  know each other and \nhave whatever discussions you want to ha 

#### 失败的情况

In [20]:
# Failure case 1: query about MATLAB, returning the 5 most similar chunks.
# NOTE(review): Lecture01 was loaded twice, so the top hits presumably include
# identical chunks; only docs[0] is displayed here, so the duplication is not
# visible — consider showing docs[1] as well.
question = "what did they say about matlab?"
docs = vectordb.similarity_search(question, k = 5)
docs[0]

Document(metadata={'page': 8, 'source': 'data/cs229_lectures/MachineLearning-Lecture01.pdf'}, page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to  learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your  own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of  this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has some

In [21]:
# Failure case 2: the question restricts the answer to the third lecture, but
# the search is run on the question text alone, with no filter on the source.
# Print each hit's metadata to see which lecture and page it came from; the
# saved output includes a hit from Lecture02.
question = "what did they say about regression in the third lecture?"  # asks how regression was discussed in the third lecture
docs = vectordb.similarity_search(question, k=5)
for doc in docs:
    print(doc.metadata)

{'page': 0, 'source': 'data/cs229_lectures/MachineLearning-Lecture03.pdf'}
{'page': 14, 'source': 'data/cs229_lectures/MachineLearning-Lecture03.pdf'}
{'page': 6, 'source': 'data/cs229_lectures/MachineLearning-Lecture03.pdf'}
{'page': 0, 'source': 'data/cs229_lectures/MachineLearning-Lecture02.pdf'}
{'page': 2, 'source': 'data/cs229_lectures/MachineLearning-Lecture03.pdf'}


In [22]:
# Show the text of the fifth (last) hit from the search above.
print(docs[4].page_content)

regression problem like this. What I want to do today is talk about a class of algorithms 
called non-parametric learning algorithms that will help to alleviate the need somewhat 
for you to choose features very carefully. Okay ? And this leads us in to our discussion of 
locally weighted regression. And just to de fine the term, linear regression, as we’ve 
defined it so far, is an example of a parame tric learning algorithm. Parametric learning 
algorithm is one that’s defined as an algorithm that has a fixed number of parameters that 
fit to the data. Okay? So in linear regression we  have a fix set of parameters theta, right? 
That must fit to the data. In contrast, what  I’m gonna talk about now is our first non-
parametric learning algorithm. The formal defi nition, which is not very  intuitive, so I’ve 
replaced it with a second, say, more intuitive. The, sort of, formal definition of the non-
parametric learning algorithm is that it’s an  algorithm where the number of parameter