In [2]:
#Data Ingestion
from langchain_community.document_loaders import TextLoader, WebBaseLoader, PyPDFLoader

In [3]:
loader = TextLoader('speech.txt')
text_doc = loader.load()

In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")

In [6]:
loader = PyPDFLoader('FD.pdf')

In [7]:
pdf_doc = loader.load()

In [8]:
pdf_doc

[Document(page_content='Normalization  \n91.2914  1 Normalization  \nWe discuss four normal forms: first, second, third, and \nBoyce -Codd normal forms  \n1NF, 2NF, 3NF, and BCNF  \n \nNormalization  is a process that “improves” a database \ndesign by generating relations that are of higher normal \nforms.  \n \nThe objective  of normalization:  \n“to create relations where every dependency is on the key, \nthe whole key, and nothing but the key ”. \n ', metadata={'source': 'FD.pdf', 'page': 0}),
 Document(page_content='Normalization  \n91.2914  2 There is a sequence to normal forms:  \n1NF is considered the weakest,  \n2NF is stronger than 1NF,  \n3NF is stronger than 2NF, and  \nBCNF is considered the strongest  \n \nAlso,  \nany relation that is in BCNF, is in 3NF;  \nany relation in 3NF is in 2NF; and  \nany relation in 2NF is in 1NF.  ', metadata={'source': 'FD.pdf', 'page': 1}),
 Document(page_content='Normalization  \n91.2914  3 BCNF  3NF 2NF 1NF a relation in BCNF , is also \ni

In [9]:
len(pdf_doc)

27

In [10]:
#Data Transformation
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)
documents = text_splitter.split_documents(pdf_doc)

In [11]:
documents

[Document(page_content='Normalization  \n91.2914  1 Normalization  \nWe discuss four normal forms: first, second, third, and', metadata={'source': 'FD.pdf', 'page': 0}),
 Document(page_content='Boyce -Codd normal forms  \n1NF, 2NF, 3NF, and BCNF', metadata={'source': 'FD.pdf', 'page': 0}),
 Document(page_content='Normalization  is a process that “improves” a database', metadata={'source': 'FD.pdf', 'page': 0}),
 Document(page_content='design by generating relations that are of higher normal \nforms.', metadata={'source': 'FD.pdf', 'page': 0}),
 Document(page_content='The objective  of normalization:  \n“to create relations where every dependency is on the key,', metadata={'source': 'FD.pdf', 'page': 0}),
 Document(page_content='the whole key, and nothing but the key ”.', metadata={'source': 'FD.pdf', 'page': 0}),
 Document(page_content='Normalization  \n91.2914  2 There is a sequence to normal forms:  \n1NF is considered the weakest,', metadata={'source': 'FD.pdf', 'page': 1}),
 Docume

In [12]:
len(documents)

122

In [13]:
#Vector Embedding and Vector Storage
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

In [14]:
db = Chroma.from_documents(documents, OllamaEmbeddings(model="llama2"))

In [15]:
query = "Normalization is a process that"
db.similarity_search(query)

[Document(page_content='May 2012  91.2814  5 NORMALIZATION  is a database design technique that', metadata={'page': 4, 'source': 'FD.pdf'}),
 Document(page_content='organizes tables in a manner that reduces redundancy and', metadata={'page': 4, 'source': 'FD.pdf'}),
 Document(page_content='Transitive dependency  \n91.2914  10 Transitive dependency', metadata={'page': 9, 'source': 'FD.pdf'}),
 Document(page_content='Second Normal Form  \n91.2914  18 Second Normal Form', metadata={'page': 17, 'source': 'FD.pdf'})]

In [16]:
from langchain_community.llms import Ollama

In [17]:
llm_model = Ollama(model='llama2')

In [18]:
#Design ChatPrompt Template
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("Answer the following question based only on the provided context. Think step by step before providing a detailed answer. I will tip you $1000 if the user finds the answer useful. <context>{context}</context> Question: {input}")


In [19]:
#chain stuff
from langchain.chains.combine_documents import create_stuff_documents_chain
doc_chain = create_stuff_documents_chain(llm_model, prompt)

In [20]:
retriever = db.as_retriever()
retriever

VectorStoreRetriever(tags=['Chroma', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001E4CDC476E0>)

In [21]:
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever, doc_chain)

In [22]:
retrieval_chain.invoke({'input' : "BCNF is considered the strongest"})

{'input': 'BCNF is considered the strongest',
 'context': [Document(page_content='2NF is stronger than 1NF,  \n3NF is stronger than 2NF, and  \nBCNF is considered the strongest', metadata={'page': 1, 'source': 'FD.pdf'}),
  Document(page_content='•This definition of 3NF differs from BCNF only in the', metadata={'page': 22, 'source': 'FD.pdf'}),
  Document(page_content='specification of non -key attributes - 3NF is weaker than', metadata={'page': 22, 'source': 'FD.pdf'}),
  Document(page_content='Since there is a determinant that is not a \ncandidate key, InvLine is not BCNF', metadata={'page': 18, 'source': 'FD.pdf'})],
 'answer': "Great, thank you for providing the context! To answer your question, let's break it down step by step:\n\n1. 2NF is stronger than 1NF because:\n\t* 1NF allows duplication of non-key attributes, while 2NF does not allow it.\n\t* In 2NF, each non-key attribute must have a unique value for each row in the table.\n2. 3NF is stronger than 2NF because:\n\t* 3NF al