In [1]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [2]:
# Mistral model reached through the Ollama endpoint at base_url; the stdout
# callback handler prints generated tokens as they arrive.
llm = Ollama(
    model="mistral",
    callbacks=CallbackManager([StreamingStdOutCallbackHandler()]),
    base_url="http://127.0.0.1:11434",
)

In [3]:
# List the logical devices TensorFlow can see (the recorded run shows CPU only).
# NOTE(review): no other visible cell uses `tf` — confirm this check is still needed.
import tensorflow as tf
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU')]

In [4]:
# Embedding model settings: run on CPU and ask the encoder to normalize the
# vectors it returns.
modelPath = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)

# Shared embedding function used when building the vector store below.
embedding = HuggingFaceEmbeddings(
    model_name=modelPath,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Read every file matching *.pdf in ./data, parsing each with PyPDFLoader.
loader = DirectoryLoader(
    "./data",
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

# Number of Document objects produced by the loader.
len(documents)

20

In [6]:
# Peek at the first loaded document to check that the PDF text was extracted.
print(documents[0])

page_content='If we observe some sugar and some soil ( mitti) placed on two different sheets of paper with a magnifying\nglass, we will find that the colour, shape and size of all the particles of sugar are the same, but the soil\ncontains particles of different colours, shapes and sizes. For example, the soil contains clay particles,\nsome grass particles and even some dead insects, etc. Now, sugar which contains particles of only one kind\nis called a pure substance whereas soil which contains particles of different kinds is called an impure\nsubstance (or mixture). From this we conclude that all the matter around us is not pure. The matter\naround us is of two types : pure substances and mixtures . The mixtures are impure substances. We will\nnow discuss pure substances and mixtures in a little more detail.\nA pure substance is one which is made up of only one kind of particles. These particles may be atoms\nor molecules. So, we can also say that a pure substance is one which is mad

In [7]:
# Split the loaded documents into chunks of at most 500 characters, with no
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

# Number of chunks produced.
len(texts)

138

In [8]:
import os

from langchain.vectorstores import Chroma

persist_directory = './db'

# Chroma.from_documents adds to whatever collection already lives in
# persist_directory, so re-running this cell would embed and store every
# chunk again and the retriever would start returning duplicates. Reuse the
# store when it is already populated; build it only on the first run.
# NOTE: delete ./db to force a rebuild after changing the PDFs, the splitter
# settings, or the embedding model.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    # Flush the freshly built collection to disk.
    vectordb.persist()

In [9]:
# Retriever view over the Chroma store, configured with k = 7.
retriever = vectordb.as_retriever(
    search_kwargs=dict(k=7),
)

In [10]:
# BM25 retriever built from the same chunks that were indexed in Chroma.
bm25_retriever = BM25Retriever.from_documents(texts)
# presumably the number of documents BM25 returns per query — confirm against the BM25Retriever docs
bm25_retriever.k =  5

In [11]:
# Combine both retrievers; weights are listed in the same order as the
# retrievers (0.3 for BM25, 0.7 for the Chroma retriever).
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever],
    weights=[0.3, 0.7],
)

In [12]:
# Smoke-test the combined retriever with a short query and display the hits.
ensemble_retriever.get_relevant_documents("sugar and soil")

[Document(page_content='If we observe some sugar and some soil ( mitti) placed on two different sheets of paper with a magnifying\nglass, we will find that the colour, shape and size of all the particles of sugar are the same, but the soil\ncontains particles of different colours, shapes and sizes. For example, the soil contains clay particles,\nsome grass particles and even some dead insects, etc. Now, sugar which contains particles of only one kind', metadata={'page': 0, 'source': 'data\\Matter Around us.pdf'}),
 Document(page_content='or molecules. So, we can also say that a pure substance is one which is made up of only one kind of atoms or\n(a) Sugar is a pure substance ( b) Soil is an impure substance (or mixture)\nFigure 1.  The matter around us is of two types : pure substances and mixtures.', metadata={'page': 0, 'source': 'data\\Matter Around us.pdf'}),
 Document(page_content='SCIENCE FOR NINTH CLASS : CHEMISTRY 58\n46.State one property in which a solution of sugar in water 

In [13]:
from langchain.chains import RetrievalQA

# Question-answering chain: the ensemble retriever supplies context to the
# LLM, and the retrieved documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=ensemble_retriever,
    return_source_documents=True,
)

In [14]:
def process_llm_response(query: str) -> dict:
    """Run ``query`` through the retrieval QA chain and return its raw response.

    Args:
        query: Question text passed to ``qa_chain``.

    Returns:
        The mapping produced by ``qa_chain``. Later cells read its
        ``'result'`` and ``'source_documents'`` entries.
    """
    # Calling the chain object directly (Chain.__call__) is deprecated and
    # emits a deprecation warning; invoke() is the supported entry point and
    # accepts the same bare query string.
    return qa_chain.invoke(query)

In [15]:
# Ask a sample question; the full chain response is kept in `response` for the cells below.
query = "Describe sugar and soil to me in less than 100 words."
response = process_llm_response(query)

  warn_deprecated(


 Sugar is a pure substance, made up of one kind of molecules with uniform color, shape, and size. Soil, on the other hand, is an impure substance or mixture consisting of different types of particles with various colors, shapes, and sizes, such as clay particles, grass particles, and even dead insects. Metals like sugar are pure substances that can be malleable and ductile, whereas soil contains a variety of materials, including minerals and organic matter.

In [16]:
# Show only the generated answer text from the chain response.
print(response['result'])

 Sugar is a pure substance, made up of one kind of molecules with uniform color, shape, and size. Soil, on the other hand, is an impure substance or mixture consisting of different types of particles with various colors, shapes, and sizes, such as clay particles, grass particles, and even dead insects. Metals like sugar are pure substances that can be malleable and ductile, whereas soil contains a variety of materials, including minerals and organic matter.


In [17]:
# Dump each document the chain returned as supporting context, one per line.
print('Sources:\n')
source_documents = response['source_documents']
for source in source_documents:
    print(source)

Sources:

page_content='If we observe some sugar and some soil ( mitti) placed on two different sheets of paper with a magnifying\nglass, we will find that the colour, shape and size of all the particles of sugar are the same, but the soil\ncontains particles of different colours, shapes and sizes. For example, the soil contains clay particles,\nsome grass particles and even some dead insects, etc. Now, sugar which contains particles of only one kind' metadata={'page': 0, 'source': 'data\\Matter Around us.pdf'}
page_content='or molecules. So, we can also say that a pure substance is one which is made up of only one kind of atoms or\n(a) Sugar is a pure substance ( b) Soil is an impure substance (or mixture)\nFigure 1.  The matter around us is of two types : pure substances and mixtures.' metadata={'page': 0, 'source': 'data\\Matter Around us.pdf'}
page_content='exceptions. For example, sodium and potassium metals have low\nmelting points  (of less than 100°C). Another metal gallium has

In [19]:
# Print the 'page' metadata value of every returned source document on a
# single line, separated by spaces.
print("Pages of textbook referenced:")
page_values = (doc.metadata['page'] for doc in response['source_documents'])
for page in page_values:
    print(page, end=" ")

Pages of textbook referenced:
0 0 5 18 1 4 