In [9]:
import faiss
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import os

In [10]:
DATA_PATH="data/"
def load_pdf_files(data):
    loader = DirectoryLoader(data,
                             glob='*.pdf',
                             loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents

documents=load_pdf_files(data=DATA_PATH)
print(len(documents))

1112


In [11]:
documents[11]

Document(metadata={'source': 'data/Guyton-and-Hall-Textbook-of-Medical-Physiology-12th-Ed.pdf', 'page': 11}, page_content=' Contents\nxi\nCHAPTER 15\nVascular Distensibility and Functions of the \nArterial and Venous Systems 167\nVascular Distensibility 167\nArterial Pressure Pulsations 168\nVeins and Their Functions 171\nCHAPTER 16\nThe Microcirculation and Lymphatic  \nSystem: Capillary Fluid Exchange,  \nInterstitial Fluid, and Lymph Flow 177\nStructure of the Microcirculation  \nand Capillary System 177\nFlow of Blood in the Capillaries—  \nVasomotion 178\nExchange of Water, Nutrients, and Other  \nSubstances Between the Blood and  \nInterstitial Fluid 179\nInterstitium and Interstitial Fluid 180\nFluid Filtration Across Capillaries Is  \nDetermined by Hydrostatic and Colloid  \nOsmotic Pressures, as Well as Capillary  \nFiltration Coefficient 181\nLymphatic System 186\nCHAPTER 17\nLocal and Humoral Control of Tissue  \nBlood Flow 191\nLocal Control of Blood Flow in Response to  \n

In [12]:

def create_chunks(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,
                                                 chunk_overlap=50)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks=create_chunks(extracted_data=documents)
print("Length of Text Chunks: ", len(text_chunks))

Length of Text Chunks:  11546


In [13]:
text_chunks[1000]

Document(metadata={'source': 'data/Guyton-and-Hall-Textbook-of-Medical-Physiology-12th-Ed.pdf', 'page': 114}, page_content='When the myosin kinase and myosin phosphatase \nenzymes are both strongly activated, the cycling frequency \nof the myosin heads and the velocity of contraction are \ngreat. Then, as the activation of the enzymes decreases, \nthe cycling frequency decreases, but at the same time, the \ndeactivation of these enzymes allows the myosin heads to \nremain attached to the actin filament for a longer and lon-\nger proportion of the cycling period. Therefore, the num-')

In [14]:
def get_embedding_model():
    embedding_model=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embedding_model

In [15]:
embedding_model=get_embedding_model()

  return torch._C._cuda_getDeviceCount() > 0


In [16]:
DB_FAISS_PATH="vectorstore/db_faiss"
if os.path.exists(DB_FAISS_PATH):
    pass
else:
    db=FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(DB_FAISS_PATH)

# connect database with langchain

In [17]:
import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_huggingface import HuggingFaceEmbeddings

In [18]:
HF_TOKEN=os.environ.get("HF_TOKEN")

In [19]:
huggingface_repo_id="mistralai/Mistral-Nemo-Instruct-2407"

In [20]:
def load_llm(huggingface_repo_id):
    llm=HuggingFaceEndpoint(
        repo_id=huggingface_repo_id,
        temperature=0.3,
        model_kwargs={"token":HF_TOKEN,
                      "max_length":512}
    )
    return llm

In [21]:
DB_FAISS_PATH="vectorstore/db_faiss"
custom_prompt_template="""
Use the pieces of information provided in the context to answer user's question. If you don't know the answer, just dont say that you don't know, Dont try to make up an answer. Dont provide anything out of the given context.

Context:{context}
Question:{question}

Start the answer directly, No small talk!
"""
def set_custom_prompt(custom_prompt_template):
    prompt=PromptTemplate(template=custom_prompt_template,
                          input_variables=["context","question"])
    return prompt

In [22]:
embedding_model=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db=FAISS.load_local(DB_FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)

## Creating qa chain

In [23]:
qa_chain=create_retrieval_chain(
    llm= load_llm(huggingface_repo_id),
    chain_type="stuff",
    retriver=db.as_retriever(search_kwargs={'k':3}),
    return_source_documents=True,
    chain_type_kwargs={'prompt':set_custom_prompt(custom_prompt_template)}
)

user_query=input("Write query here:")
response=qa_chain.invoke({'query':user_query})
print("Result:", response['result'])
print("Source Documents:", response['source_documents'])

TypeError: create_retrieval_chain() got an unexpected keyword argument 'llm'

In [52]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.vectorstores import VectorStoreRetriever

retriever= VectorStoreRetriever(vectorstore=db)
query="what is the meaning of life?"
question_answer_chain=create_stuff_documents_chain(load_llm(huggingface_repo_id),set_custom_prompt(custom_prompt_template))
chain=create_retrieval_chain(retriever, question_answer_chain)
chain.invoke({"input":query})

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


KeyError: "Input to PromptTemplate is missing variables {'question'}.  Expected: ['context', 'question'] Received: ['input', 'context']\nNote: if you intended {question} to be part of the string and not a variable, please escape it with double curly braces like: '{{question}}'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT "