In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [12]:
from langchain_chroma import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

persist_directory = 'docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [14]:
from langchain.document_loaders import PyPDFLoader
loaders = [
    PyPDFLoader("LectureNotes/cs229-notes1.pdf"),
    PyPDFLoader("LectureNotes/cs229-notes1.pdf"),
    PyPDFLoader("LectureNotes/cs229-notes2.pdf"),
    PyPDFLoader("LectureNotes/cs229-notes3.pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

In [16]:
splits = text_splitter.split_documents(docs)

In [17]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [18]:
print(vectordb._collection.count())

163


In [22]:
question = 'What are major topics for  this class?'
docs = vectordb.similarity_search(question, k=3)
len(docs)

3

In [21]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)

  llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)


In [23]:
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [24]:
result = qa_chain.invoke({'query':question})
result['result']

'The major topics covered in this class include supervised learning, generative learning algorithms, support vector machines, margins, decision boundaries, logistic regression, perceptron algorithm, Lagrange duality, and kernels.'

In [33]:
from langchain.prompts import PromptTemplate

template = """
Use the following pieces of context to answer
{context}
Question: {question}
Helpful Answer:
"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [34]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [35]:
question = "Is probability a class topic?"

In [36]:
result = qa_chain.invoke({'query':question})

In [37]:
result['result']

'No, probability is not explicitly mentioned as a class topic in the provided context. The context primarily discusses event models for text classification, logistic regression, and Laplace smoothing in the context of machine learning algorithms. Probability is a fundamental concept in machine learning and is likely covered as part of the foundational knowledge required for understanding these topics, but it is not specifically highlighted as a class topic in this context.'

In [38]:
result["source_documents"][0].

Document(metadata={'page': 12, 'source': 'LectureNotes/cs229-notes2.pdf'}, page_content='13\n2.2 Event models for text classiﬁcation\nTo close oﬀ our discussion of generative learning algorithms, let’s talkabout\none more model that is speciﬁcally for text classiﬁcation. While Naive B ayes\nas we’ve presented it will work well for many classiﬁcation problems, f or text\nclassiﬁcation, there is a related model that does even better.\nIn the speciﬁc context of text classiﬁcation, Naive Bayes as prese nted uses\nthe what’s called the multi-variate Bernoulli event model . In this model,\nwe assumed that the way an email is generated is that ﬁrst it is rando mly\ndetermined (according to the class priors p(y)) whether a spammer or non-\nspammer will send you your next message. Then, the person sendin g the\nemail runs through the dictionary, deciding whether to include each word i\nin that email independently and according to the probabilities p(xi = 1 |y) =\nφi|y. Thus, the probability of 

In [39]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    chain_type="map_reduce"
)

In [40]:
result = qa_chain_mr.invoke({"query": question})

In [41]:
result['result']

'Yes, probability is a class topic discussed in the context of text classification, logistic regression, and the Naive Bayes algorithm.'