In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load the PDF files found in a directory
def load_pdf_file(data):
    """Load the files matching the ``*.pdf`` glob in ``data``.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [3]:
import os

# Directory holding the source PDFs. The original machine-specific location is
# kept as the default so existing runs are unchanged; set MEDICAL_DATA_DIR in
# the process environment to run the notebook elsewhere without editing this cell.
DATA_DIR = os.environ.get("MEDICAL_DATA_DIR", "C:/Projects/Medical-Generative-AI/Data")
extracted_data = load_pdf_file(data=DATA_DIR)

In [4]:
#extracted_data

In [5]:
# Split the loaded documents into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_data: Documents to split, passed to ``split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [6]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks= text_split(extracted_data)
print("Length of text Chunks", len(text_chunks))

Length of text Chunks 40000


In [7]:
#text_chunks

In [8]:
from langchain.embeddings import HuggingFaceEmbeddings

In [9]:
def download_hugging_face_embeddings():
    """Build the HuggingFace embedding wrapper for the MiniLM model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [10]:
embeddings = download_hugging_face_embeddings()

  embeddings= HuggingFaceEmbeddings(model_name= 'sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Sanity check: embed a short string and print the length of the returned
# vector (the recorded output below shows 384).
query_result = embeddings.embed_query("Hello World")
print("Length", len(query_result))

Length 384


In [12]:
#query_result

In [13]:
# Load settings via python-dotenv. As the last expression in the cell, the
# call's return value is displayed (True in the recorded run).
from dotenv import load_dotenv
load_dotenv()

True

In [14]:
from langchain.vectorstores import FAISS

# Build the FAISS vector store locally from the chunks and the embedding model
docsearch = FAISS.from_documents(documents=text_chunks, embedding=embeddings)


In [15]:
import os

In [16]:
# Display the vector store's repr as a quick check that the object was created.
docsearch

<langchain_community.vectorstores.faiss.FAISS at 0x1dc80219640>

In [17]:
# Retriever view over the FAISS store: similarity search, with k=3 passed as
# the search setting.
retriever = docsearch.as_retriever(search_type = "similarity", search_kwargs = {"k":3})

In [18]:
# Smoke-test the retriever on a sample question; the result is stored but not
# displayed by this cell.
retrieved_docs = retriever.invoke("What is Acne?")

In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline

# Checkpoint used for answer generation.
model_id = "google/flan-t5-base"

# Load the tokenizer and the seq2seq model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Wrap both in a text2text-generation pipeline with max_length=256.
# The recorded run reports the pipeline device as CPU.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=256)
# Adapt the pipeline to a LangChain LLM object so it can be used in chains.
llm = HuggingFacePipeline(pipeline=pipe)


Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


In [20]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering step. Adjacent string literals are
# concatenated with no separator, so every fragment that is followed by more
# sentence text must end with an explicit space; otherwise words fuse together
# (e.g. "tasks.Use", "answerthe", "youdon't") in the prompt sent to the model.
# The trailing {context} placeholder is where retrieved text is inserted.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system message carries the instructions and
# the {context} placeholder, the human message carries the user's {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [21]:
# Answering step: combines the LLM with the prompt defined above.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Reuse the retriever configured earlier (similarity search, k=3) instead of
# building a fresh default-configured docsearch.as_retriever(), so the chain
# retrieves with the same settings that were set up and tried out above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [22]:
# Run the retrieval + answering chain on a sample question and print only the
# "answer" entry of the returned mapping.
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
