In [31]:
import os
import openai
 

from dotenv import load_dotenv, find_dotenv
# Find a .env file and load the variables it defines into the process environment.
_ = load_dotenv(find_dotenv()) # read local .env file

# Take the key from the environment instead of hardcoding it in the notebook;
# if OPENAI_API_KEY is not set, this line raises KeyError.
openai.api_key  = os.environ['OPENAI_API_KEY']
 


![alt text](<6.png>)
![alt text](<7.png>)
![alt text](<8.png>)

In [32]:
from langchain.document_loaders import PyPDFLoader

# Load the lecture transcripts, one loader per URL.
# Lecture01 is listed twice on purpose to simulate messy, duplicated data.
lecture_urls = (
    "https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf",
    "https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf",
    "https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture02.pdf",
    "https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture03.pdf",
)
loaders = [PyPDFLoader(url) for url in lecture_urls]

# Collect the Documents from every loader into one flat list, in loader order.
docs = []
for loader in loaders:
    docs += loader.load()

In [33]:
# One Document per PDF page: 22 + 22 + 18 + 16 pages (Lecture01 is loaded twice) gives 78.
len(docs)

78

In [34]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured for chunks of size 1500 with an overlap of 150 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [35]:
# Break the loaded page Documents into chunks using the splitter configured earlier.
splits = text_splitter.split_documents(docs)

In [36]:
# Number of chunks the splitter produced from the loaded pages.
len(splits)

208

In [37]:
#Embedding
from langchain.embeddings.openai import OpenAIEmbeddings
# Embeddings client built with no arguments, so the library defaults apply.
# NOTE(review): presumably it picks up the OpenAI key configured at the top of the notebook — confirm.
embedding = OpenAIEmbeddings()

In [38]:
# Three probe sentences: the first two are paraphrases of each other,
# the third is about an unrelated topic.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [39]:
# Embed each sentence with its own embed_query call, in sentence order.
embedding1, embedding2, embedding3 = [
    embedding.embed_query(text) for text in (sentence1, sentence2, sentence3)
]

In [40]:
import numpy as np

In [41]:
# Dot product used as a similarity score: sentences 1 and 2 mean nearly the
# same thing, so a high value is expected.
np.dot(embedding1, embedding2)

np.float64(0.9631510802407719)

In [42]:
# Sentence 3 is about the weather, unrelated to sentence 1, so the score should be noticeably lower.
np.dot(embedding1, embedding3)

np.float64(0.7702031204123156)

In [43]:
# Same comparison for sentence 2 against the unrelated sentence 3; again a lower score is expected.
np.dot(embedding2, embedding3)

np.float64(0.7590539714454778)

### Vectorstore — LangChain integrates with 30 different vector store databases
### https://python.langchain.com/docs/integrations/vectorstores/


In [44]:
from langchain.vectorstores import Chroma

In [47]:
import shutil

# Directory where the Chroma index for this notebook is written.
persist_directory = 'files/chroma/'

# Start from an empty store on every run. If an index from a previous run is
# left in place, building the vector store again adds the same chunks a second
# time and the collection count no longer equals len(splits).
# Portable replacement for `!rm -rf ./files/chroma/`; a missing directory is ignored.
shutil.rmtree(persist_directory, ignore_errors=True)

In [48]:
# Build the vector store from the chunks: each entry in `splits` is embedded
# with `embedding`, and the store is configured to write under persist_directory.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [49]:
# The collection should hold one entry per chunk, i.e. the same number as len(splits).
print(vectordb._collection.count())

208


In [50]:
# First test query: the answer (the course Q&A email address) appears in the Lecture01 transcript.
question = "is there an email i can ask for help"

In [51]:
# k is the number of documents to return.
# NOTE: this rebinds `docs`, which until now held the loaded PDF pages; re-running
# the split cell after this point would split these search hits instead.
docs = vectordb.similarity_search(question,k=3)

In [52]:
# Should equal the k requested in the search (3).
len(docs)

3

In [54]:
# Dump the returned Document objects in full (metadata and page_content) for inspection.
print(docs)

[Document(metadata={'author': '', 'creationdate': '2008-07-11T11:25:23-07:00', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': '2008-07-11T11:25:23-07:00', 'page': 5, 'page_label': '6', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': 'https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf', 'title': '', 'total_pages': 22}, page_content="cs229-qa@cs.stanford.edu. This goes to an account that's read by all the TAs and me. So \nrather than sending us email individually, if you send email to this account, it will \nactually let us get back to you maximally quickly with answers to your questions.  \nIf you're asking questions about homework problems, please say in the subject line which \nassignment and which question the email refers to, since that will also help us to route \nyour question to the appropriate TA or to me appropriately and get the response back to \nyou quickly.  \nLet's see. Skipping ahead — let's see — for homework, one midte

In [53]:
# Text of the first hit: it contains the cs229-qa@cs.stanford.edu address the question asked about.
docs[0].page_content

"cs229-qa@cs.stanford.edu. This goes to an account that's read by all the TAs and me. So \nrather than sending us email individually, if you send email to this account, it will \nactually let us get back to you maximally quickly with answers to your questions.  \nIf you're asking questions about homework problems, please say in the subject line which \nassignment and which question the email refers to, since that will also help us to route \nyour question to the appropriate TA or to me appropriately and get the response back to \nyou quickly.  \nLet's see. Skipping ahead — let's see — for homework, one midterm, one open and term \nproject. Notice on the honor code. So one thing that I think will help you to succeed and \ndo well in this class and even help you to enjoy this class more is if you form a study \ngroup.  \nSo start looking around where you're sitting now or at the end of class today, mingle a \nlittle bit and get to know your classmates. I strongly encourage you to form st

In [55]:
# Save the vector store so it can be used again later.
# NOTE(review): the stray `vectordb.persist()` echo in this cell's output looks like
# the tail of a deprecation warning. Recent Chroma releases reportedly persist
# automatically; confirm against the installed version and drop this call if so.
vectordb.persist()

  vectordb.persist()


In [56]:
# Second test query; its results show the effect of having indexed Lecture01 twice.
question = "what did they say about matlab?"

In [57]:
# Ask for the four closest chunks; this rebinds `docs` once more.
docs = vectordb.similarity_search(question,k=4)

In [58]:
# First hit for the MATLAB question (compare with docs[1] in the next cell).
docs[0]

Document(metadata={'author': '', 'creationdate': '2008-07-11T11:25:23-07:00', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': '2008-07-11T11:25:23-07:00', 'page': 8, 'page_label': '9', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': 'https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf', 'title': '', 'total_pages': 22}, page_content='those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your own home co

In [59]:
docs[1]

# docs[0] and docs[1] are identical chunks: MachineLearning-Lecture01.pdf was
# loaded twice, so each of its chunks is in the index twice.
# Plain similarity search returns the closest matches and does nothing to
# enforce diversity among them, so duplicates can take up several result slots.

Document(metadata={'author': '', 'creationdate': '2008-07-11T11:25:23-07:00', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': '2008-07-11T11:25:23-07:00', 'page': 8, 'page_label': '9', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': 'https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf', 'title': '', 'total_pages': 22}, page_content='those homeworks will be done in either MATLAB or in Octave, which is sort of — I \nknow some people call it a free version of MATLAB, which it sort of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t seen MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to \nwrite codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your own home co

In [60]:
# This query names a specific lecture (the third), yet the results retrieved
# for it also include chunks from other lectures.
question = "what did they say about regression in the third lecture?"

In [61]:
# Ask for the five closest chunks for the third-lecture question.
docs = vectordb.similarity_search(question,k=5)

In [62]:
# Each chunk's metadata records where it came from; the 'source' entry is the
# URL of the lecture PDF the chunk was taken from.
for doc in docs:
    print(doc.metadata)

# Problem: the question asked about the third lecture only, but the sources
# printed by this cell include other lectures (e.g. Lecture02) as well.

{'author': '', 'creationdate': '2008-07-11T11:25:03-07:00', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': '2008-07-11T11:25:03-07:00', 'page': 0, 'page_label': '1', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': 'https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture03.pdf', 'title': '', 'total_pages': 16}
{'author': '', 'creationdate': '2008-07-11T11:25:05-07:00', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': '2008-07-11T11:25:05-07:00', 'page': 2, 'page_label': '3', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': 'https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture02.pdf', 'title': '', 'total_pages': 18}
{'author': '', 'creationdate': '2008-07-11T11:25:05-07:00', 'creator': 'PScript5.dll Version 5.2.2', 'moddate': '2008-07-11T11:25:05-07:00', 'page': 17, 'page_label': '18', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'source': 'https://see.stanford.edu/materials/aimlcs229/transcripts/Machin

In [64]:

# Full text of the first hit, which according to its metadata is page 0 of the Lecture03 transcript.
print(docs[0].page_content)


MachineLearning-Lecture03  
Instructor (Andrew Ng):Okay. Good morning and welcome back to the third lecture of 
this class. So here’s what I want to do today, and some of the topics I do today may seem 
a little bit like I’m jumping, sort of, from topic to topic, but here’s, sort of, the outline for 
today and the illogical flow of ideas. In the last lecture, we talked about linear regression 
and today I want to talk about sort of an adaptation of that called locally weighted 
regression. It’s very a popular algorithm that’s actually one of my former mentors 
probably favorite machine learning algorithm.  
We’ll then talk about a probable second interpretation of linear regression and use that to 
move onto our first classification algorithm, which is logistic regression; take a brief 
digression to tell you about something called the perceptron algorithm, which is 
something we’ll come back to, again, later this quarter; and time allowing I hope to get to 
Newton’s method, which is a