In [18]:
import os 
import openai 
from dotenv import load_dotenv 

# Pull variables from a .env file into the process environment, then hand the
# OpenAI key to the openai module (it stays None when the variable is unset).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [19]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
# Lecture01 is listed twice on purpose: the duplicate simulates messy data.
pdf_paths = (
    "data/MachineLearning-Lecture01.pdf",
    "data/MachineLearning-Lecture01.pdf",
    "data/MachineLearning-Lecture02.pdf",
    "data/MachineLearning-Lecture03.pdf",
)
# Build every loader up front, then read them in listing order and flatten
# the per-file results into a single list of documents.
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]
docs = [document for pdf_loader in loaders for document in pdf_loader.load()]

In [22]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Target chunk size of 1500 with 150 of overlap carried between neighbouring
# chunks (units follow the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [23]:
# Break every loaded document into overlapping chunks using the splitter
# configured in the previous cell.
splits = text_splitter.split_documents(docs)

In [24]:
# Sanity check: how many chunks the splitter produced.
len(splits)

209

In [25]:
# Create the embedding client that is used both for the demo sentences and
# for building the vector store further down.
from langchain_openai import OpenAIEmbeddings
# Default-constructed: model choice and credentials come from the library's
# defaults (presumably the OPENAI_API_KEY environment variable -- confirm).
embedding = OpenAIEmbeddings()

In [26]:
# Three probe sentences: the first two are near-synonyms, the third is unrelated.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [27]:
# Embed the probe sentences one at a time, in order, with the shared client.
embedding1, embedding2, embedding3 = [
    embedding.embed_query(text) for text in (sentence1, sentence2, sentence3)
]

In [28]:
# Dot product to check similarity: a larger value means the two embedding
# vectors point in more similar directions.
# NOTE(review): a dot product equals cosine similarity only for unit-length
# vectors -- presumably true for these embeddings; confirm for the model in use.
import numpy as np

# Near-synonym pair: "i like dogs" vs. "i like canines".
np.dot(embedding1, embedding2)

0.9630351468341318

In [29]:
# Unrelated pair: "i like dogs" vs. the weather sentence.
np.dot(embedding1, embedding3)

0.7701147942700534

In [30]:
# Unrelated pair: "i like canines" vs. the weather sentence.
np.dot(embedding2, embedding3)

0.7591130371091992

In [31]:
# %pip install langchain-pinecone  # the vector store used below is Pinecone, not Chroma

In [32]:
from langchain_pinecone import PineconeVectorStore

In [33]:
# Build the Pinecone-backed vector store from the chunked documents.
# `splits` (the overlapping chunks produced by the text splitter) is indexed
# rather than the raw per-page `docs`, so retrieval returns focused passages
# and the chunking step above actually takes effect.
# NOTE(review): this call writes to the remote index named below; it presumably
# needs Pinecone credentials in the environment and an existing index, and
# re-running the cell may insert the same chunks again -- confirm before re-running.
vectorstore = PineconeVectorStore.from_documents(
    splits, embedding=embedding, index_name="deeplearningai-langchain"
)

In [34]:
# Plain semantic query: embed the question and return the closest stored
# documents from the vector store.
question = "What did they say about matlab?"
vectorstore.similarity_search(question)

[Document(metadata={'page': 8.0, 'source': 'data/MachineLearning-Lecture01.pdf'}, page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \nknow some people call it a free ve rsion of MATLAB, which it sort  of is, sort of isn\'t.  \nSo I guess for those of you that haven\'t s een MATLAB before, and I know most of you \nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \nplot data. And it\'s sort of an extremely easy to  learn tool to use for implementing a lot of \nlearning algorithms.  \nAnd in case some of you want to work on your  own home computer or something if you \ndon\'t have a MATLAB license, for the purposes of  this class, there\'s also — [inaudible] \nwrite that down [inaudible] MATLAB — there\' s also a software package called Octave \nthat you can download for free off the Internet. And it has somewhat fewer f

In [35]:
# This question names a specific lecture, but no metadata filter is passed to
# the search, so nothing here restricts the results to Lecture03 by source.
question = "What did they say about regression in the third lecture?"
vectorstore.similarity_search(question)

[Document(metadata={'page': 0.0, 'source': 'data/MachineLearning-Lecture03.pdf'}, page_content='MachineLearning-Lecture03  \nInstructor (Andrew Ng) :Okay. Good morning and welcome b ack to the third lecture of \nthis class. So here’s what I want to do t oday, and some of the topics I do today may seem \na little bit like I’m jumping, sort  of, from topic to topic, but here’s, sort of, the outline for \ntoday and the illogical flow of ideas. In the last lecture, we  talked about linear regression \nand today I want to talk about sort of an  adaptation of that called locally weighted \nregression. It’s very a popular  algorithm that’s actually one of my former mentors \nprobably favorite machine learning algorithm.  \nWe’ll then talk about a probabl e second interpretation of linear regression and use that to \nmove onto our first classification algorithm, which is logistic regr ession; take a brief \ndigression to tell you about something cal led the perceptron algorithm, which is \nsom