In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

def pdf_loader(pdf_path):
    """Load a PDF file and split it into chunks."""
    loader = DirectoryLoader(pdf_path, glob="*.pdf", loader_cls=PyPDFLoader)

    return loader.load()


In [2]:
docs = pdf_loader("pdfs")

In [3]:
len(docs)

637

In [4]:
docs[0]

Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'pdfs\\The Gale Encyclopedia of Medicine.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content='')

In [5]:
from langchain_core.documents.base import Document

def meta_filter(docs: list[Document]) -> list[Document]:
    """Filter documents based on metadata."""
    final_docs = []
    for doc in docs:
        source = doc.metadata['source']
        page = doc.metadata['page']
        final_docs.append(Document(
            page_content=doc.page_content,
            metadata={
                'source': source,
                'page': page
            }
        ))

    return final_docs

In [6]:
docs = meta_filter(docs)
docs[0]

Document(metadata={'source': 'pdfs\\The Gale Encyclopedia of Medicine.pdf', 'page': 0}, page_content='')

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_splitter(docs: list[Document]) -> list[Document]:
    """Split documents into smaller chunks."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80)
    
    return text_splitter.split_documents(docs)

In [8]:
splits = text_splitter(docs)

len(splits)

3900

In [9]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

embedding_model = HuggingFaceEmbeddings(
    model_name = 'sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs = {
        "device": "cuda" if torch.cuda.is_available() else "cpu"
    }
)

  embedding_model = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [10]:
embedding_model

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
vec = embedding_model.embed_query("What is the treatment for diabetes?")
len(vec)

  return forward_call(*args, **kwargs)


384

In [12]:
vec[:10]  

[-0.0043837158009409904,
 0.07504492998123169,
 -0.036999497562646866,
 0.08182161301374435,
 -0.06613903492689133,
 -0.022863920778036118,
 0.06654287874698639,
 0.054875463247299194,
 0.016562569886446,
 -0.016643783077597618]

In [13]:
from dotenv import load_dotenv
load_dotenv()

True

In [14]:
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone()

In [15]:
index_name = "medical-rag"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
        dimension=384,
        metric="cosine"
    )

    vector_store = PineconeVectorStore.from_documents(
        documents = splits,
        embedding = embedding_model,
        index_name = index_name
    )

# index = pc.Index(index_name)

In [16]:
vector_store = PineconeVectorStore.from_existing_index(
    embedding=embedding_model,
    index_name=index_name
)

In [17]:
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [18]:
retriever.invoke("What is the treatment for diabetes?")

  return forward_call(*args, **kwargs)


[Document(id='e7bcec19-da37-46e1-a1ae-bf64a3ad4a9a', metadata={'page': 274.0, 'source': 'pdfs\\The Gale Encyclopedia of Medicine.pdf'}, page_content='interact with tricyclic antidepressants. Be sure to check\nwith a physician or pharmacist before combining tri-\ncyclic antidepressants with any other prescription or non-\nprescription (over-the-counter) medicine.\nNancy Ross-Flanigan\nAntidiabetic drugs\nDefinition\nAntidiabetic drugs are medicines that help control\nblood sugar levels in people with diabetes mellitus\n(sugar diabetes).\nPurpose\nDiabetes may be divided into type I and type II, for-\nmerly termed juvenile onset or insulin-dependent, and\nGALE ENCYCLOPEDIA OF MEDICINE 2 261\nAntidiabetic drugs\nGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 261'),
 Document(id='37e05116-65bc-4d4f-8748-5fb5c13e9985', metadata={'page': 275.0, 'source': 'pdfs\\The Gale Encyclopedia of Medicine.pdf'}, page_content='biguanides, meglitinides, and thiazolidinediones.\nInsulin (Humulin, Novolin)

In [19]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x00000196FA33A900>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x00000196FB726180>, model_name='llama3-8b-8192', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [20]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

In [21]:
system_prompt = (
    "You are an medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [23]:
qa_chain = create_stuff_documents_chain(llm, prompt)

retrieval_chain = create_retrieval_chain(retriever, qa_chain)

retrieval_chain.invoke({"input": "What is the treatment for diabetes?"})

  return forward_call(*args, **kwargs)


{'input': 'What is the treatment for diabetes?',
 'context': [Document(id='e7bcec19-da37-46e1-a1ae-bf64a3ad4a9a', metadata={'page': 274.0, 'source': 'pdfs\\The Gale Encyclopedia of Medicine.pdf'}, page_content='interact with tricyclic antidepressants. Be sure to check\nwith a physician or pharmacist before combining tri-\ncyclic antidepressants with any other prescription or non-\nprescription (over-the-counter) medicine.\nNancy Ross-Flanigan\nAntidiabetic drugs\nDefinition\nAntidiabetic drugs are medicines that help control\nblood sugar levels in people with diabetes mellitus\n(sugar diabetes).\nPurpose\nDiabetes may be divided into type I and type II, for-\nmerly termed juvenile onset or insulin-dependent, and\nGALE ENCYCLOPEDIA OF MEDICINE 2 261\nAntidiabetic drugs\nGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 261'),
  Document(id='37e05116-65bc-4d4f-8748-5fb5c13e9985', metadata={'page': 275.0, 'source': 'pdfs\\The Gale Encyclopedia of Medicine.pdf'}, page_content='biguanides, meg