In [5]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# load the pdf function
def load_pdf(data):
    """Load the PDF files of a directory into documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

    Returns:
        The value returned by the loader's ``load()`` call.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [7]:
import os

# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
data_path = r"C:\Users\Blue\Desktop\medical-chatbot\test\Data"

# Stop with a clear error when the folder is missing. Only printing a message
# would leave `extract_data` undefined, and the chunking step further down
# would then fail with an unrelated NameError.
if not os.path.isdir(data_path):
    raise FileNotFoundError(f"Directory not found: {data_path}")

# Load every PDF of the data folder into documents.
extract_data = load_pdf(data=data_path)

In [9]:
# Split the data into text chunks
def text_splitter(documents):
    """Cut documents into overlapping character-based chunks.

    Args:
        documents: Documents passed to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by a splitter configured with
        ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    # A distinct local name avoids shadowing this function's own name.
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return chunker.split_documents(documents)

In [10]:
text_chunk = text_splitter(documents=extract_data)

# Preview the first chunk only. Slicing (instead of indexing) keeps this cell
# from raising IndexError when the splitter produced no chunks at all.
for chunk in text_chunk[:1]:
    print(chunk.page_content)
    print("\n")

print(f"Total number of chunks: {len(text_chunk)}")

The GALE
ENCYCLOPEDIA of
MEDICINE
THIRD EDITION


Total number of chunks: 40000


In [1]:
from sentence_transformers import SentenceTransformer

# Fetch the pretrained sentence-embedding model by its identifier
hub_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(hub_model_id)

# Write a copy to disk; later cells load the model from this folder
local_model_dir = '../local_model'
model.save(local_model_dir)

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
## Let's wrap the model in a LangChain-compatible format
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

# Point LangChain's HuggingFace wrapper at the locally saved model folder
embedding_model_path = "../local_model"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_path)

  embedding_model = HuggingFaceEmbeddings(model_name="../local_model")


In [23]:
# Sanity check: embed one sample sentence; the vector is the cell output
sample_sentence = "What is the capital of France?"
embedding_model.embed_query(sample_sentence)


[0.08204811066389084,
 0.03605553135275841,
 -0.0038928852882236242,
 -0.0048810457810759544,
 0.02565113641321659,
 -0.05714348703622818,
 0.012191606685519218,
 0.004678904078900814,
 0.03494987264275551,
 -0.0224219411611557,
 -0.008005237206816673,
 -0.10935354232788086,
 0.022724784910678864,
 -0.02932087890803814,
 -0.04352205619215965,
 -0.12024123221635818,
 -0.000848641328047961,
 -0.018150122836232185,
 0.056129537522792816,
 0.003085229778662324,
 0.0023363472428172827,
 -0.01683923974633217,
 0.06362469494342804,
 -0.023660214617848396,
 0.03149356320500374,
 -0.034797921776771545,
 -0.0205488633364439,
 -0.002790951170027256,
 -0.011037975549697876,
 -0.03612672537565231,
 0.0541410930454731,
 -0.036617133766412735,
 -0.02500864863395691,
 -0.03817041590809822,
 -0.04960364103317261,
 -0.015148096717894077,
 0.02131503075361252,
 -0.012740420177578926,
 0.07670091837644577,
 0.04435573145747185,
 -0.010834861546754837,
 -0.029760034754872322,
 -0.016970466822385788,
 -0.02

In [8]:
# Reload the saved copy directly with sentence-transformers, encode one
# question and print the resulting vector.
saved_model_dir = '../local_model'
model = SentenceTransformer(saved_model_dir)
probe_question = "What the symptoms of diabetes?"
embedding = model.encode(probe_question)
print(embedding)

[ 2.01463215e-02  1.51488064e-02 -5.61105125e-02  1.00039624e-01
 -2.78950413e-03 -2.58262549e-02  1.19866602e-01 -6.38048630e-03
 -1.10536478e-02 -1.89225713e-03 -9.13449675e-02 -2.58039813e-02
 -5.29153720e-02  5.73070254e-03 -8.24658573e-02 -6.82189539e-02
 -8.64502601e-03 -1.17091406e-02  4.41129431e-02  1.17899310e-02
  9.06714350e-02  8.44753161e-03  1.74835213e-02  3.97165753e-02
  3.16774286e-02  2.22795177e-02  6.20494857e-02 -5.74464491e-03
 -6.71992078e-02 -6.79524317e-02 -1.01427995e-01  3.06472313e-02
 -1.32998656e-02  1.01939596e-01  1.65904108e-02  7.64981881e-02
  6.82670847e-02 -9.93887312e-04 -7.91770518e-02 -6.22382350e-02
  5.45788072e-02 -4.85753790e-02  2.59162262e-02  5.62502630e-02
  4.90830876e-02 -4.04825248e-02 -8.89330432e-02  4.95038964e-02
  1.70245674e-02  8.88625309e-02  3.37850326e-03 -6.55328557e-02
  7.01331496e-02 -3.64242829e-02  8.80213380e-02 -9.35046375e-03
 -7.89368600e-02 -8.09914097e-02  2.82360055e-02 -8.72377977e-02
 -6.24256618e-02  3.92167

In [14]:
## Load the Pinecone API key
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()
pinecone_key = os.getenv("PINECONE_API_KEY")

# Fail right away with an actionable message instead of passing `None` on to
# the Pinecone client.
if not pinecone_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the environment or to the .env file."
    )

from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=pinecone_key)

In [15]:
index_name = "medical-bot"

# Create the index only when it does not exist yet, so re-running this cell
# (or the whole notebook) does not fail on an already-existing index.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's vector size
        metric="cosine",  # similarity metric used by the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Show the index description as the cell output, whether or not the index
# was created by this run.
pc.describe_index(index_name)

{
    "name": "medical-bot",
    "metric": "cosine",
    "host": "medical-bot-irzfc3q.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [24]:
from langchain_pinecone import PineconeVectorStore

# Initialize the vector store.
# `from_documents` embeds and uploads every chunk each time it is called, so
# re-running this cell would store the same chunks again. Upload only when the
# index is still empty; otherwise attach to what is already stored.
# NOTE(review): an interrupted earlier upload leaves a non-empty, incomplete
# index that this check will not refill -- clear the index manually in that case.
stored_vectors = pc.Index(index_name).describe_index_stats().total_vector_count
if stored_vectors == 0:
    vector_store = PineconeVectorStore.from_documents(
        documents=text_chunk,
        index_name=index_name,
        embedding=embedding_model,
    )
else:
    vector_store = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding_model,
    )

In [25]:
# Load the existing index
from langchain_pinecone import PineconeVectorStore

# Attach to the index by name, using the same embedding model for queries.
existing_index_args = {
    "index_name": index_name,
    "embedding": embedding_model,
}
docseach = PineconeVectorStore.from_existing_index(**existing_index_args)

In [26]:
# Display the vector store object; its repr is the cell output.
docseach

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1a24c7f2e50>

In [None]:
# Retrieve the documents: similarity search returning the top 3 matches
retriever_settings = {"search_type": "similarity", "search_kwargs": {"k": 3}}
retriever = docseach.as_retriever(**retriever_settings)

In [29]:
# Run one sample question through the retriever and display the hits
sample_question = "What are the symptoms of diabetes?"
result = retriever.invoke(sample_question)
result

[Document(id='26098ca3-4488-47bf-85cb-c6adaa0ad4cf', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 1185.0, 'page_label': '1156', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'C:\\Users\\Blue\\Desktop\\medical-chatbot\\test\\Data\\medical_book.pdf', 'total_pages': 4505.0}, page_content='that a person may not know that he or she has it. Early\nsigns are lethargy, extreme thirst, and frequent urina-\ntion. Other symptoms may include sudden weight loss,\nslow wound healing, urinary tract infections, gum dis-\nease, or blurred vision. It is not unusual for Type II\ndiabetes to be detected while a patient is seeing a doctor\nabout another health concern that is actually being\ncaused by the yet undiagnosed diabetes.\nIndividuals who are at high risk of developing'),
 Document(id='0aed5453-dc8a-43bc-bf88-bba449c71515', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6

In [102]:
import os
# Load variables from .env into the environment before the client is built.
load_dotenv()

from langchain_openai import OpenAI

# Generation settings kept together so they are easy to tune.
llm_settings = {"temperature": 0.5, "max_tokens": 200}
llm = OpenAI(**llm_settings)

In [104]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions followed by a blank line and the `{context}` slot.
system_prompt = " ".join(
    [
        "You are an assistant for question-answering tasks.",
        "Use the following pieces of retrieved context to answer the question.",
        "If you don't know the answer, say that you don't know.",
        "Use three sentences maximum and keep the answer concise.",
    ]
) + "\n\n{context}"

# Two-message chat template: the system instructions, then the user's
# question supplied through the `{input}` variable.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [None]:
# Chain that feeds the retrieved documents and the question to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: the retriever runs first, then the chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Test the chain
response = rag_chain.invoke({"input": "What are the symptoms of diabetes?"})

# Print the generated answer; without this the cell computes a response but
# shows nothing. The full result stays available in `response`.
print(response["answer"])

In [None]:
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_pinecone import PineconeVectorStore
import os
from langchain_openai import OpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# load the pdf function
def load_pdf(data):
    """Load the PDF files of a directory into documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

    Returns:
        The value returned by the loader's ``load()`` call.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


# NOTE(review): machine-specific absolute path; adjust it when running elsewhere.
data_path = r"C:\Users\Blue\Desktop\medical-chatbot\test\Data"

# Stop with a clear error when the folder is missing. Only printing a message
# would leave `extract_data` undefined, and the chunking step below would then
# fail with an unrelated NameError.
if not os.path.isdir(data_path):
    raise FileNotFoundError(f"Directory not found: {data_path}")

# Load every PDF of the data folder into documents.
extract_data = load_pdf(data=data_path)

# Split the data into text chunks
def text_splitter(documents):
    """Cut documents into overlapping character-based chunks.

    Args:
        documents: Documents passed to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by a splitter configured with
        ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    # A distinct local name avoids shadowing this function's own name.
    chunker = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return chunker.split_documents(documents)

text_chunk = text_splitter(documents=extract_data)

# Preview the first chunk only. Slicing (instead of indexing) keeps this step
# from raising IndexError when the splitter produced no chunks at all.
for chunk in text_chunk[:1]:
    print(chunk.page_content)
    print("\n")

print(f"Total number of chunks: {len(text_chunk)}")


# Fetch the pretrained sentence-embedding model by its identifier
hub_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(hub_model_id)

# Write a copy to disk; the steps below load the model from this folder
local_model_dir = '../local_model'
model.save(local_model_dir)

## Wrap the model in a LangChain-compatible format

# Load the local model via LangChain's wrapper
embedding_model = HuggingFaceEmbeddings(model_name=local_model_dir)

# Embed a sample query (the returned vector is not stored)
embedding_model.embed_query("What is the capital of France?")

# Reload the saved copy with sentence-transformers and print one embedding
model = SentenceTransformer(local_model_dir)
probe_question = "What the symptoms of diabetes?"
embedding = model.encode(probe_question)
print(embedding)

## Load the Pinecone API key
# Load variables from a .env file into the process environment.
load_dotenv()
pinecone_key = os.getenv("PINECONE_API_KEY")

# Fail right away with an actionable message instead of passing `None` on to
# the Pinecone client.
if not pinecone_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to the environment or to the .env file."
    )

pc = Pinecone(api_key=pinecone_key)

index_name = "medical-bot"

# Create the index only when it does not exist yet. An unconditional
# `create_index` fails as soon as an index with this name already exists,
# which is the case on every run after the first.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding model's vector size
        metric="cosine",  # similarity metric used by the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )


# Initialize the vector store.
# `from_documents` embeds and uploads every chunk each time it is called, so
# re-running this step would store the same chunks again. Upload only when the
# index is still empty; otherwise attach to what is already stored.
# NOTE(review): an interrupted earlier upload leaves a non-empty, incomplete
# index that this check will not refill -- clear the index manually in that case.
stored_vectors = pc.Index(index_name).describe_index_stats().total_vector_count
if stored_vectors == 0:
    vector_store = PineconeVectorStore.from_documents(
        documents=text_chunk,
        index_name=index_name,
        embedding=embedding_model,
    )
else:
    vector_store = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding_model,
    )

# Load the existing index, using the same embedding model for queries
existing_index_args = {
    "index_name": index_name,
    "embedding": embedding_model,
}
docseach = PineconeVectorStore.from_existing_index(**existing_index_args)

docseach  # bare expression: evaluates the name without storing anything

# Retrieve the documents: similarity search returning the top 3 matches
retriever_settings = {"search_type": "similarity", "search_kwargs": {"k": 3}}
retriever = docseach.as_retriever(**retriever_settings)

# Run one sample question through the retriever
sample_question = "What are the symptoms of diabetes?"
result = retriever.invoke(sample_question)
result  # bare expression as well; the hits stay available in `result`

# Load variables from .env into the environment before the client is built.
load_dotenv()

# Generation settings kept together so they are easy to tune.
llm_settings = {"temperature": 0.5, "max_tokens": 200}
llm = OpenAI(**llm_settings)


# System instructions followed by a blank line and the `{context}` slot.
system_prompt = " ".join(
    [
        "You are an assistant for question-answering tasks.",
        "Use the following pieces of retrieved context to answer the question.",
        "If you don't know the answer, say that you don't know.",
        "Use three sentences maximum and keep the answer concise.",
    ]
) + "\n\n{context}"

# Two-message chat template: the system instructions, then the user's
# question supplied through the `{input}` variable.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

# Chain that feeds the retrieved documents and the question to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: the retriever runs first, then the chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Test the chain
response = rag_chain.invoke({"input": "What are the symptoms of diabetes?"})

# Print the generated answer; without this the response is computed but
# nothing is shown. The full result stays available in `response`.
print(response["answer"])