In [11]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [12]:
# extract data from pdf file
def load_pdf_file(file_path):
    """Load the PDF files found in a directory.

    Args:
        file_path: Directory handed to ``DirectoryLoader``; entries matching
            the glob ``*.pdf`` are read with ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(
        file_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [13]:
# Load the PDFs under ../Data/ (a relative path, so it depends on the
# notebook's working directory).
extracted_data = load_pdf_file("../Data/")

In [14]:
# Number of documents the loader returned.
len(extracted_data)

4505

In [None]:
# split data into text chunks

def text_split(extracted_data, chunk_size=800, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split, e.g. the result of
            ``load_pdf_file``.
        chunk_size: ``chunk_size`` passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 800, the value
            this notebook has always used.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults
            to 20.

    Returns:
        The chunks produced by ``splitter.split_documents(extracted_data)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [16]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("length of extracted chunks: ",len(text_chunks))

length of extracted chunks:  30614


In [18]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_hf_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Build the Hugging Face embedding model used for indexing and retrieval.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``. The
            Pinecone index in this notebook is created with
            ``dimension=384``, so a replacement model must produce vectors
            of the same size as the index it is used with.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [22]:
# Instantiate the embedding model once; the same object is passed to the
# Pinecone vector-store cells further down.
embeddings = download_hf_embeddings()

In [6]:
# Load variables from a .env file into the process environment; presumably
# this is where the API keys read in the next cell come from.
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
import os

# Pull both service credentials out of the process environment. A variable
# that is not set comes back as None rather than raising.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

In [17]:
# Name of the Pinecone index used by the vector-store cells below.
index_name = "medical-encyclopedia"

In [None]:
# Create the serverless Pinecone index that stores the chunk embeddings.
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec  # required by create_index(spec=...) below

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medical-encyclopedia"

# Only create the index when it is missing, so the cell can be re-run
# without failing on an index that already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding size (all-MiniLM-L6-v2 -> 384)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [8]:
import os

# Export the keys for libraries that read their credentials from the
# environment. os.environ only accepts strings, so a key that was never
# loaded (None) is reported by name here instead of surfacing as an opaque
# "TypeError: str expected, not NoneType".
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GROQ_API_KEY", GROQ_API_KEY),
    )
    if value is None
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [25]:
# Build the vector store from the text chunks, using the embedding model and
# the Pinecone index named above.
# NOTE(review): presumably a one-time ingestion step; re-running it looks
# like it would embed and upload every chunk again. Confirm before
# re-executing.
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [23]:
# Load Existing Pinecone Index
# Rebinds `docsearch` to the index by name, without passing any documents;
# use this cell instead of the ingestion cell once the index is populated.
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [24]:
# Similarity-search retriever configured with k=3 (three results per query).
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [14]:
# Smoke-test retrieval with a sample query and display the returned documents.
retrieved_docs = retriever.invoke("heart attack")
retrieved_docs

[Document(id='30437c73-93a9-4429-9399-2b92591d0391', metadata={'page': 1730.0, 'source': '..\\Data\\encyclopedia.pdf'}, page_content='Heart attack\nDefinition\nA heart attack is the death of, or damage to, part\nof the heart muscle because the supply of blood to the\nheart muscle is severely reduced or stopped.\nDescription\nHeart attack is the leading cause of death in\nthe United States. More than 1.5 million Americans\nsuffer a heart attack every year, and almost half a\nmillion die, according to the American Heart\nAssociation. Most heart attacks are the end result of\nyears of silent but progressivecoronary artery disease,\nwhich can be prevented in many people. A heart attack\noften is the first symptom of coronary artery disease.\nAccording to the American Heart Association, 63%\nof women and 48% of men who died suddenly of\ncoronary artery disease had no previous symptoms.\nHeart attacks also are called myocardial infarctions'),
 Document(id='640c1425-0687-4bb2-a379-aff91ff8a7b

In [1]:
from langchain_groq import ChatGroq

In [10]:
# Groq-hosted chat model that generates the final answers.
# NOTE(review): hosted model ids get retired over time; verify that
# "llama3-8b-8192" is still served by Groq before relying on this cell.
chat_model = ChatGroq(
    temperature=0.4,
    groq_api_key=GROQ_API_KEY,
    model_name="llama3-8b-8192",
)

In [43]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the answering model. "{context}" is the template
# variable that receives the retrieved document text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you don't know. "
    "Use three to four sentences maximum and keep the answer concise. "
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions held in `system_prompt`,
# followed by the user's question substituted for "{input}".
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Combine the chat model and prompt into a document-answering chain, then
# pair it with the retriever to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(chat_model, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# In-domain example query, kept for reference:
# response = rag_chain.invoke({"input": "What to do in case of cardiac arrest?"})
# Off-topic question; presumably a check that the model follows the
# "say that you don't know" instruction when the context has no answer.
response = rag_chain.invoke({"input": "who is president of the USA?"})
print(response["answer"])