In [None]:
import os

# Remember the directory the kernel started in so this cell is idempotent:
# a bare os.chdir("../") climbs one more level every time the cell is re-run.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Move to the parent of the notebook's directory.
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))


In [None]:
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
# Extract text from PDF files
def load_pdf_files(data):
    """Load the PDF files matched by ``*.pdf`` under a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
extracted_data = load_pdf_files("data")

In [None]:
extracted_data

In [None]:
len(extracted_data)

In [None]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return new documents that keep only the text and the source.

    Each result is a fresh ``Document`` carrying the same ``page_content``
    as its input and a metadata dict with a single ``"source"`` key
    (``None`` when the input metadata has no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]


In [None]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [None]:
minimal_docs

In [None]:
# Split the documents into smaller Chunks
def text_split(minimal_docs):
    """Split documents with a ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=500`` and
    ``chunk_overlap=20``.

    Args:
        minimal_docs: Documents passed to ``split_documents``.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter_options = {"chunk_size": 500, "chunk_overlap": 20}
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_documents(minimal_docs)

In [None]:
text_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(text_chunk)}")


In [None]:
text_chunk

In [None]:
# HuggingFaceBgeEmbeddings stays imported in case other cells still reference it.
from langchain.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings

def download_embeddings(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Build the sentence-transformers embedding model used for indexing and search.

    The plain ``HuggingFaceEmbeddings`` wrapper is used on purpose:
    ``HuggingFaceBgeEmbeddings`` is meant for BGE models and by default
    prepends a BGE retrieval instruction to every query, which would make
    query vectors inconsistent with the stored document vectors for a
    non-BGE model such as all-MiniLM-L6-v2.

    Args:
        model_name: Hugging Face model identifier. Defaults to
            ``sentence-transformers/all-MiniLM-L6-v2``.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
embedding = download_embeddings()

In [None]:
embedding

In [None]:
vector = embedding.embed_query("Hello Manoj")
vector

In [None]:
print("Vector Length:", len(vector))

In [None]:
from dotenv import load_dotenv
import os
load_dotenv()

In [None]:
# Fail fast with a readable message: os.getenv returns None when the variable
# is missing, and assigning None into os.environ raises an opaque TypeError.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

# Re-export so libraries that read the key from the environment can find it.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY


In [None]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)


In [None]:
pc

In [None]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# Create the serverless index only when it is not already there.
index_missing = not pc.has_index(index_name)
if index_missing:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # presumably the embedding vector length printed earlier; confirm
        metric="cosine",
        spec=serverless_spec,
    )

index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Build a Pinecone vector store from the chunked documents, using the
# embedding model and the index named above.
# NOTE(review): the result is bound to `docserch` (misspelled); later cells use
# `docsearch`, which is loaded separately from the existing index.
# NOTE(review): presumably every run of this cell embeds and uploads all chunks
# again, so re-running may insert duplicates. Confirm against the
# langchain-pinecone documentation before re-executing.
docserch = PineconeVectorStore.from_documents(
    documents=text_chunk,
    embedding=embedding,
    index_name=index_name
)

In [None]:
# Load the existing index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
    
)

# Add more data to the existing Pinecone Index

In [None]:
"""dswith = Document(
    page_content="I am Manoj Kumar, From goa",
    metadata={"sourec": "LinkedIN"}
)"""

In [None]:
"""docsearch.add_documents(documents=[dswith])"""

In [None]:
retriver = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
retrived_docs = retriver.invoke("What is Acne?")
retrived_docs

In [None]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# CPU-friendly free model
generation_settings = {
    "model": "google/flan-t5-small",  # small, free
    "max_length": 512,
    "do_sample": True,
    "temperature": 0.7,
}
generator = pipeline("text2text-generation", **generation_settings)

# Wrap as LangChain LLM
chatModel = HuggingFacePipeline(pipeline=generator)


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System instructions for the question-answering prompt. `{context}` is the
# placeholder where the retrieved text is inserted when the prompt is formatted.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# System instructions first, then the user's question as the human turn.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [None]:
# Chain that passes the retrieved documents and the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)

# Reuse the retriever configured earlier (similarity search, k=3) under the
# correctly spelled name. Previously a second retriever with default settings
# was created here but never used, while the chain was wired to `retriver`.
retriever = retriver
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print(response["answer"])


In [None]:
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])


In [None]:
response = rag_chain.invoke({"input": "What is Treatment of Acne?"})
print(response["answer"])
