In [1]:
from langchain.document_loaders import PyPDFLoader , DirectoryLoader

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
#Extract text from pdf file
def load_pdf(data):
    loader = DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    print(f'You have {len(documents)} document(s) in your data.')
    return documents

In [10]:
extract_data = load_pdf("C:\\projects\\MEDICAL_CHAT_BOT\\data\\")

You have 637 document(s) in your data.


In [14]:
def text_splitter(documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
    print(f'Now you have {len(texts)} chunks of documents.')
    print("now you can use these chunks for further processing.")
    return texts

In [15]:
text_chunk = text_splitter(extract_data)

Now you have 8646 chunks of documents.
now you can use these chunks for further processing.


In [19]:
from langchain.embeddings import HuggingFaceEmbeddings

In [21]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") #384 dimension
    return embeddings


In [22]:
embedding = download_hugging_face_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") #384 dimension
  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [34]:
import os
from dotenv import load_dotenv
load_dotenv()


True

In [39]:
from pinecone import Pinecone, ServerlessSpec
api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=api_key)



In [44]:
index_name = "medical-chatbot"

pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        region="us-east-1",
        cloud="aws"

    )
)

{
    "name": "medical-chatbot",
    "metric": "cosine",
    "host": "medical-chatbot-x7p01xr.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [46]:
from langchain_pinecone import PineconeVectorStore

In [50]:
docsearch = PineconeVectorStore.from_documents(
    index_name=index_name,
    embedding=embedding,
    documents=text_chunk
)


In [51]:
retrieval = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [68]:
answer = retrieval.get_relevant_documents("\hlo baby ?")

  answer = retrieval.get_relevant_documents("\hlo baby ?")


In [69]:
answer

[Document(id='d72bf105-c363-48c4-9111-a8d5409f177f', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 429.0, 'page_label': '430', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'C:\\projects\\MEDICAL_CHAT_BOT\\data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='infant in utero are already functioning at four and half\nmonths of age. Just as the umbilical cord provides nour-\nishment to the unborn infant’s body, Tomatis postulated\nthat the sound of the mother’s voice is also a nutrient\nheard by the fetus. This sound literally charges and stimu-\nlates the growth of the brain.\nTomatis took this further, into the realm of language.\nTomatis concluded that the need to communicate and to\nbe understood are among our most basic needs. He was a'),
 Document(id='9c8a2f73-f221-4dc6-ad76-ac33a8c88f37', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31

In [56]:
from langchain_groq import ChatGroq

In [57]:
groq_api_key = os.getenv("GROQ_API_KEY")
model = ChatGroq(model="Gemma2-9b-It", groq_api_key=groq_api_key)


In [71]:
#make chain
from langchain.chains import ConversationalRetrievalChain

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=model,
    retriever=retrieval
)

# Example user input
user_input = "What is diabetes?"
chat_history = []
response = qa_chain({
    "question": user_input,
    "chat_history": chat_history
})
print(response['answer'])

'This document does not contain information about the symptoms of diabetes.  \n\n'