In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
#Extract text from PDF files
def load_pdf_files(file_paths):
    loader = DirectoryLoader(
        file_paths,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

In [10]:
extracted_data = load_pdf_files("../data/")
len(extracted_data)   

637

In [12]:
# Filter to minimal docs
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs : List[Document]) -> List[Document]:
    minimal_docs : List[Document] = []
    for doc in docs:
        minimal_docs.append(
            Document(page_content=doc.page_content, 
                     metadata={"source": doc.metadata["source"]} )
        )
    return minimal_docs
minimal_docs = filter_to_minimal_docs(extracted_data)
len(minimal_docs)

637

In [13]:
#Split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 50
    )
    text_chunks = text_splitter.split_documents(minimal_docs)
    return text_chunks

text_chunks = text_split(minimal_docs)
len(text_chunks)

5961

In [18]:
# Embedding model
from langchain.embeddings import HuggingFaceEmbeddings

def download_embedding_model():
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

embeddings = download_embedding_model()


In [21]:
vector = embeddings.embed_query("My name is Ashish")
len(vector)

384

In [22]:
from dotenv import load_dotenv
import os
load_dotenv()

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Set env variables
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

from pinecone import Pinecone
pincone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pincone_api_key)

In [27]:
# Create Pinecone index
from pinecone import ServerlessSpec
index_name = "medical-chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension = 384,
        metric = "cosine",
        spec = ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)


In [29]:
#Create Pinecone vector store from documents
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents = text_chunks,
    embedding = embeddings,
    index_name = index_name
)

In [30]:
# Load to existing Pinecone index
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore(
    embedding=embeddings,
    index_name=index_name
)

doc_to_load = Document(
    page_content="This is Ashish a Data Scientist.",
    metadata={"source": "MyData"}
)

# Add more data into existing Pinecone index
docsearch.add_documents(documents=[doc_to_load])

['208a2829-f499-4333-bbf5-10c582d49f32']

In [32]:
# Docsearch as a retriever
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k":4}
)

retrived_docs = retriever.invoke("Who is Ashish?")
retrived_docs

[Document(id='208a2829-f499-4333-bbf5-10c582d49f32', metadata={'source': 'MyData'}, page_content='This is Ashish a Data Scientist.'),
 Document(id='57fc651c-9c91-40da-8e3d-db477ef26c80', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='Francis Fishback. Philadelphia: Lippincott, 1996.\nGALE ENCYCLOPEDIA OF MEDICINE 2 557\nBone x rays\nGEM -0433 to 0624 - B  10/22/03 6:08 PM  Page 557'),
 Document(id='7af3731b-2808-4a75-976d-8f6dca6eac28', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='was injected.\nWheezing —A whistling or musical sound caused\nby tightening of the air passages inside the patient’s\nchest.\nNational Institute of Allergy and Infectious Disease. Building\n31, Room 7A-50, 31 Center Drive MSC 2520, Bethesda,\nMD 20892-2520. (301) 496-5717. <http://www.niaid.nih.\ngov/default.htm>.\nRebecca J. Frey\nAllergic purpura\nDefinition\nAllergic purpura (AP) is an allergic reaction of\nunknown origin causing red patches on the skin and other'),
 D

In [34]:
# Load LLM model
from langchain_openai import ChatOpenAI
chatModel = ChatOpenAI(model="gpt-3.5-turbo")

In [35]:
#Implement LLM chain for question answering
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are a helpful Medical assistant for question answering tasks"
    "Use the following piece of retrieval context to answer the question. If you don't know the answer, say that you don't know."
    "Use three sentences maximum to answer the question and keep the answer concise."
    "\n\n"
    "{context}"
)


In [38]:
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{input}")
])

qa_chain = create_stuff_documents_chain(
    chatModel,
    prompt
)

rag_chain = create_retrieval_chain(
    retriever,
    qa_chain
)



In [39]:
response = rag_chain.invoke({"input" : "Is high triglycerides causes diabeties?"})
print(response["answer"])

High levels of triglycerides are often associated with diabetes, but they do not directly cause diabetes. Elevated triglyceride levels are a risk factor for developing atherosclerosis, which is a common complication of diabetes. However, the relationship between high triglycerides and diabetes is complex and involves multiple factors.
