In [29]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from dotenv  import load_dotenv
load_dotenv()
import os


In [18]:
def load_pdf_file(data):
    """Load the PDF files found in a directory as LangChain documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; only
            entries matching the glob ``*.pdf`` are read, each one through
            ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [19]:
# The data directory can be overridden through the MEDICAL_CHATBOT_DATA_DIR
# environment variable so the notebook is not tied to one machine's layout;
# the original location is kept as the default.
DATA_DIR = os.environ.get("MEDICAL_CHATBOT_DATA_DIR", r'C:\Final Projects\Medical-Chatbot\Data')
extracted_data = load_pdf_file(data=DATA_DIR)

In [20]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; forwarded unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value this notebook has always used.

    Returns:
        The chunks returned by the splitter's ``split_documents`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
    

In [21]:
# Chunk the loaded documents and report how many pieces were produced.
text_chunks = text_split(extracted_data)
chunk_total = len(text_chunks)
print("Length of text chunks", chunk_total)

Length of text chunks 5859


In [22]:
def get_embeddings():
    """Build the OpenAI embedding model used by this notebook.

    ``OpenAIEmbeddings`` is imported from ``langchain_openai`` inside the
    function because the class re-exported by ``langchain.embeddings`` is
    deprecated and emits a deprecation warning each time it is instantiated.

    Returns:
        An ``OpenAIEmbeddings`` instance configured with the
        ``text-embedding-3-small`` model.
    """
    from langchain_openai import OpenAIEmbeddings

    return OpenAIEmbeddings(model="text-embedding-3-small")

In [23]:
# Create the embedding model once; the query check and both Pinecone vector
# store cells below reuse this object.
embeddings = get_embeddings()



  embeddings= OpenAIEmbeddings(model="text-embedding-3-small")


In [24]:
# Sanity check: embed a sample string and report the length of the vector.
query_result = embeddings.embed_query("hello world")
embedding_length = len(query_result)
print("Length", embedding_length)

Length 1536


In [30]:
# Read the Pinecone credential from the environment; None when it is unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [38]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from tqdm import tqdm

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Target index; the dimension matches the embedding length printed above.
index_name = "chatbot"
dimension = 1536

# Create the serverless index only when no index with that name exists yet.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=serverless_spec,
    )

In [39]:
# Vector store bound to the index above; used by the upload loop below.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

In [40]:
import hashlib

# Upload the chunks in fixed-size batches, with tqdm showing progress.
batch_size = 100
for i in tqdm(range(0, len(text_chunks), batch_size)):
    batch = text_chunks[i:i + batch_size]
    # Derive a deterministic id for every chunk from its source, page,
    # position and text. Without explicit ids each run of this cell stores
    # a fresh set of randomly-identified vectors, so re-running the notebook
    # fills the index with duplicates; with stable ids a re-run overwrites
    # the same records instead.
    batch_ids = []
    for offset, doc in enumerate(batch):
        fingerprint = "|".join([
            str(doc.metadata.get("source")),
            str(doc.metadata.get("page")),
            str(i + offset),  # keeps identical text at different positions distinct
            doc.page_content,
        ])
        batch_ids.append(hashlib.sha256(fingerprint.encode("utf-8")).hexdigest())
    vectorstore.add_documents(batch, ids=batch_ids)

100%|██████████| 59/59 [04:05<00:00,  4.16s/it]


In [41]:
from langchain_pinecone import PineconeVectorStore

# Open the existing index as a vector store for querying.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [42]:
# Similarity retriever configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [43]:
# Try the retriever on its own with a sample question.
sample_question = "What is Acne ?"
retrievd_docs = retriever.invoke(sample_question)

In [46]:
# Display the retrieved documents for inspection.
retrievd_docs

[Document(id='bd0c3dd6-0d3d-4086-ad56-866043952d67', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 239.0, 'page_label': '240', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'C:\\Final Projects\\Medical-Chatbot\\Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='Isotretinoin (Accutane) is prescribed only for very\nsevere, disfiguring acne.\nAcne is a skin condition that occurs when pores or\nhair follicles become blocked. This allows a waxy\nmaterial, sebum, to collect inside the pores or follicles.\nNormally, sebum flows out onto the skin and hair to\nform a protective coating, but when it cannot get out,\nsmall swellings develop on the skin surface. Bacteria\nand dead skin cells can also collect that can cause\ninflammation. Swellings that are small and not'),
 Document(id='b8164533-440a-4733-99df-3129772b0c76', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate

In [47]:
from langchain_openai import OpenAI
# Language model used to generate answers: temperature 0.4, at most 500
# tokens per completion.
llm = OpenAI(
    temperature=0.4,
    max_tokens=500,
)

In [48]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [49]:
# System instructions for the answering model. Adjacent string literals are
# concatenated with no separator, so each fragment that continues a sentence
# carries its own trailing space; without them the prompt read
# "tasks.Use", "answerthe question" and "youdon't know".
# "{context}" is the placeholder that receives the retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

In [50]:
# Chat prompt: the system message carries the instructions plus retrieved
# context, the human message carries the user's question.
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)


In [51]:
# Combine the retrieved documents and the prompt into a single LLM call ...
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# ... then put the retriever in front of it to form the full RAG chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [53]:
# Ask the full RAG chain a question and print only the generated answer.
response = rag_chain.invoke({"input": "What is Acne?"})
answer_text = response["answer"]
print(answer_text)



Acne is a common skin disease that occurs when pores become blocked with oil, dead skin cells, and bacteria. It is characterized by pimples on the face, chest, and back and is most commonly seen in teenagers and young adults. It can be a severe and disfiguring condition, and is often treated with medication such as isotretinoin (Accutane).
