In [1]:
%pwd

'/Users/adityakumar/Medical-ChatBot_GenAI/research'

In [2]:
import os

# Step up from research/ to the project root so that relative paths such as
# 'dataset/' resolve. Guarded on the directory name so that re-running this
# cell does not keep climbing one level higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'/Users/adityakumar/Medical-ChatBot_GenAI'

### Loading Dataset and Text Splitting into chunks

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Load Data
def load_pdf(data):
    """Load the PDF files of a directory into documents.

    Args:
        data: Path of the directory to scan; files matching the ``*.pdf``
            glob are read, each one through ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Read the PDFs from dataset/ (a relative path, so it depends on the current
# working directory being the project root).
data_pdf = load_pdf(data = 'dataset/')

In [6]:
# Text Splitting into chunks
def text_split(data_pdf, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        data_pdf: Documents to split, e.g. the result of ``load_pdf``.
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value this notebook uses.
        chunk_overlap: Value forwarded as ``chunk_overlap``. Defaults to 50,
            the value this notebook uses.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data_pdf)

# Chunk the loaded PDFs and report how many chunks were produced; these are
# the items that get embedded and uploaded later in the notebook.
text_chunks = text_split(data_pdf)
print("Length of text chunks:", len(text_chunks))

Length of text chunks: 13040


### Converting text chunks into vector embedding
1) all-MiniLM-L6-v2 model:
This is a sentence-transformers model. It maps sentences and paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings

In [8]:
#Downloading the embeddings model (all-MiniLM-L6-v2) from HuggingFace
def HF_Embeddings():
    """Build the HuggingFace embedding wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

# Instantiate once; the same object is reused for the dimension check and
# for populating the vector store further down.
embeddings = HF_Embeddings()

  embeddings = HuggingFaceEmbeddings(model_name= 'sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Sanity check: embed a short string and print the vector length. The
# Pinecone index below is created with dimension=384, so the two must agree.
query_result = embeddings.embed_query("Hello World")
print("Length", len(query_result))

Length 384


### Store Vector Embeddings into Pinecone Vector DB

In [45]:
import os
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
load_dotenv()

True

In [46]:
# Pull the API keys from the process environment (load_dotenv() is called in
# the previous cell). os.environ.get returns None for a missing variable, so
# either name may be None after this cell.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [36]:
#Creating an index on pinecone.io

# Use the key read from the environment above; the previous name
# (PINECONE_API) is not defined anywhere in this notebook.
pc = Pinecone(api_key= PINECONE_API_KEY)

index_name = "medicalchatbot"

# Only create the index when it does not exist yet, so that re-running the
# notebook does not fail on an already-existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding length checked earlier
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [47]:
import os

# Re-export the keys into the process environment for the client libraries
# used below. os.environ only accepts strings, so a missing key (None) is
# reported with an explicit message instead of an opaque TypeError.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if not _key_value:
        raise RuntimeError(
            f"{_key_name} is not set; add it to your .env file or shell environment."
        )
    os.environ[_key_name] = _key_value


In [38]:
#Embed each chunk and store in VectorDB
# Every entry of text_chunks is embedded with `embeddings` and written to the
# Pinecone index named by index_name.
# NOTE(review): re-running this cell presumably uploads the same chunks again
# as new vectors — confirm before executing it more than once.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents = text_chunks,
    index_name = index_name,
    embedding = embeddings,
)

In [49]:
# Similarity-search retriever over the vector store; k=3 requests three
# matching chunks per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [50]:
# Smoke-test retrieval on its own, before any LLM is involved.
retrieved_docs = retriever.invoke("What is Acne?")

In [51]:
# Display the retrieved documents (page content plus source/page metadata).
retrieved_docs

[Document(id='a0066a86-9bf9-49d2-9a7c-8d20901ec963', metadata={'page': 37.0, 'source': 'dataset/merged_output.pdf'}, page_content='Nancy J. Nordenson\nAcid reflux see Heartburn\nAcidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million\npeople in the United States. While acne can arise at any'),
 Document(id='efd3c443-cb0e-4e2f-97f7-f6fc5eb3c033', metadata={'page': 239.0, 'source': 'dataset/merged_output.pdf'}, page_content='used to clear up mild to moderately severe acne.\nIsotretinoin (Accutane) is prescribed only for very\nsevere, disfiguring acne.\nAcne is a skin condition that occurs when pores or\nhair follicles become blocked. This allows

### Integrating LLM with OpenAI Model

In [48]:
from langchain_openai import OpenAI
# LLM used to generate the final answers; temperature=0.4 and max_tokens=500
# are passed straight to the OpenAI wrapper.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment, since
# no key is passed here — confirm.
LLM = OpenAI(temperature=0.4, max_tokens=500)

In [52]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [53]:
# System instructions for the QA chain, one sentence per source line. The
# trailing {context} placeholder is where the retrieved chunks are inserted.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)


# Chat prompt: the system message carries the instructions plus the retrieved
# context, and the human message carries the user's question via {input}.
_prompt_messages = [("system", system_prompt), ("human", "{input}")]
prompt = ChatPromptTemplate.from_messages(_prompt_messages)

In [54]:
# First build the chain that feeds documents plus the prompt to the LLM, then
# wrap it with the retriever so a single call performs retrieval and answering.
question_answer_chain = create_stuff_documents_chain(LLM, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [56]:
# Run the full RAG pipeline; the question is supplied under the "input" key
# (matching the {input} placeholder of the prompt) and the generated text is
# read back from the "answer" key of the result.
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])



Acne is a common skin disease that causes pimples on the face, chest, and back. It occurs when pores become clogged with oil, dead skin cells, and bacteria. It is the most common skin disease in the United States, affecting approximately 17 million people. Treatment options include topical medications and oral medications such as isotretinoin, which is reserved for severe cases.
