In [1]:
! pip install -U faiss-cpu langchain-huggingface pymupdf tiktoken langchain-ollama python-dotenv

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-win_amd64.whl.metadata (4.5 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-win_amd64.whl.metadata (6.8 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-win_amd64.whl (13.8 MB)
   ---------------------------------------- 0.0/13.8 MB ? eta -:--:--
    --------------------------------------- 0.3/13.8 MB ? eta -:--:--
   - -------------------------------------- 0.5/13.8 MB 1.9 MB/s eta 0:00:08
   --- ------------------------------------ 1.3/13.8 MB 2.9 MB/s eta 0:00:05
   ------ --------------------------------- 2.1/13.8 MB 3.1 MB/s eta 0:00:04
   ------- -------------------------------- 2.6/13.8 MB 3.0 MB/s eta 0:00:04
   --------- ------------------------------ 3.4/13.8 MB 3.0 MB/s eta 0:00:04
   ---------- ----------------------------- 3.7/13.8 MB 2.9 MB/s eta 0:00:04
   ------------ --------------------------- 

In [1]:
from langchain_ollama import OllamaEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [5]:
import os
import warnings
from dotenv import load_dotenv

# KMP_DUPLICATE_LIB_OK asks the OpenMP runtime to tolerate being loaded more than once
# (presumably a workaround for a duplicate-runtime error on this machine -- confirm it is still needed).
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Pull variables from a .env file into the process environment; the cell displays the returned bool.
load_dotenv()

False

In [6]:
#loading document
from langchain_community.document_loaders import PyMuPDFLoader

# The path is relative to the notebook's working directory.
pdf_path = "../../datasets/book/medEncyclopedia.pdf"
loader = PyMuPDFLoader(pdf_path)

# Read the PDF into LangChain documents.
docs = loader.load()

In [9]:
#document chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Target size of each chunk and how much neighbouring chunks overlap.
splitter_settings = {"chunk_size": 2000, "chunk_overlap": 300}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split every loaded document into overlapping chunks.
chunks = text_splitter.split_documents(docs)

In [14]:
import tiktoken

# Tokenizer that tiktoken maps to the "gpt-4o-mini" model name.
# NOTE(review): the rest of this notebook uses Ollama / HuggingFace models, so these
# counts are presumably only a rough size estimate -- confirm that is the intent.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

# Token counts of the first loaded document and of the first chunk, displayed as a tuple.
tuple(len(encoding.encode(item.page_content)) for item in (docs[0], chunks[0]))

(0, 19)

In [16]:
# Embedding client backed by the local Ollama server.
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="nomic-embed-text",
)

# Embed one throwaway sentence; its length is used below to size the FAISS index.
probe_text = "this is some text data"
single_vector = embeddings.embed_query(probe_text)

In [18]:
# Flat L2 index whose dimensionality equals the length of the probe embedding.
embedding_dim = len(single_vector)
index = faiss.IndexFlatL2(embedding_dim)

In [19]:
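# Creating the vector store -- kept for reference only; the notebook loads a saved index with FAISS.load_local further down.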
# vector_store = FAISS(
#     embedding_function=embeddings,
#     index=index,
#     docstore=InMemoryDocstore(),
#     index_to_docstore_id={}
# )

In [3]:
# Disabled: ids = vector_store.add_documents(documents=chunks)

# NOTE: this rebinds `embeddings`, which an earlier cell bound to the Ollama embedder;
# from here on the name refers to the sentence-transformers model.
hf_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=hf_model_name)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
# SECURITY: allow_dangerous_deserialization=True permits loading pickled data from the
# index directory, which can execute arbitrary code. Only point this at an index you
# created yourself or otherwise trust.
index_dir = "../../med_db"
vector_store = FAISS.load_local(
    index_dir,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

# Number of entries in the index-to-docstore id mapping.
len(vector_store.index_to_docstore_id)

12318

In [None]:
question = "what is effusion?"

# Keep the hits under their own name: assigning them to `docs` would overwrite the
# loaded PDF pages and change what earlier cells see when they are re-run.
similar_docs = vector_store.search(query=question, search_type="similarity")

# Print each hit's text followed by blank lines as a visual separator.
for doc in similar_docs:
    print(doc.page_content)
    print("\n\n")

In [9]:
# MMR retriever settings: return k results chosen from fetch_k candidates.
# NOTE(review): per the LangChain docs lambda_mult=1 is the minimum-diversity end of the
# MMR range, so this presumably behaves like plain similarity ranking -- confirm intent.
mmr_search_kwargs = {"k": 2, "fetch_k": 100, "lambda_mult": 1}
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

In [None]:
# Run the retriever on the current `question`. The results get their own name so the
# PDF pages held in `docs` are not overwritten.
mmr_docs = retriever.invoke(question)

# Print each retrieved text followed by blank lines as a visual separator.
for doc in mmr_docs:
    print(doc.page_content)
    print("\n\n")

In [11]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

from langchain_ollama import ChatOllama

In [32]:
# Chat model served by the local Ollama instance.
# num_predict=100 presumably caps the length of each generated reply -- confirm against
# the Ollama parameter docs if answers look truncated.
model = ChatOllama(
    model="llama3.2",
    base_url="http://localhost:11434",
    num_predict=100,
)

In [33]:
# Raw template text; {context} and {question} are filled in when the chain runs.
prompt_template = """You are a Medical Chatbot answering questions strictly related to chest diseases. 
Do not provide any consultation. Give answers only for medical queries. Answer the question based only on the following context:
{context}
Question: {question}
"""

# Chat prompt object built from the template string above.
prompt = ChatPromptTemplate.from_template(prompt_template)

In [34]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The contents joined with a blank line between consecutive documents;
        an empty string when ``docs`` is empty.
    """
    page_texts = (doc.page_content for doc in docs)
    return "\n\n".join(page_texts)

In [35]:
# First stage of the chain: the incoming question is sent down two branches --
# "context" retrieves documents and formats them into one string, while
# "question" passes the input through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Fill the prompt, call the model, and reduce the reply to a plain string.
rag_chain = chain_inputs | prompt | model | StrOutputParser()

In [37]:
# Off-topic probe to see how the chain answers a non-medical question.
question = "who are you?"

# Run the full retrieval + generation pipeline and show the answer text.
output = rag_chain.invoke(input=question)
print(output)

I am a Medical Chatbot specializing in answering questions related to chest diseases. I provide information and responses based on my knowledge cutoff, but please note that I'm not a substitute for professional medical advice or consultation. My purpose is to assist with general inquiries about chest diseases, symptoms, causes, diagnosis, treatment options, and more.
