In [30]:
# Smoke test: print a fixed marker string.
print("OK")

OK


In [31]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [32]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load the PDF files in a directory that match ``*.pdf``.

    Args:
        data: Path of the directory to scan; handed to ``DirectoryLoader``
            together with the ``*.pdf`` glob.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each of which is read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [33]:
# Load the PDF documents from the data directory
extracted_data = load_pdf_file("../data")

In [34]:
# Split the text into chunks
def split_text(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 20, the value that used to be hard-coded here.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [35]:
# Split the loaded documents and print how many chunks were produced
chunks = split_text(extracted_data)
print(len(chunks))


5860


In [36]:
# import the HuggingFace embeddings wrapper
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Function to download the embeddings model
def download_embeddings_model():
    """Create the sentence-transformers embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"  # has 384 dimensions
    return HuggingFaceEmbeddings(model_name=model_name)


In [38]:
# Instantiate the embeddings model once; later cells reuse `embeddings`.
embeddings = download_embeddings_model()

In [40]:
# Check that the embeddings model produces 384-dimensional vectors
query = embeddings.embed_query("Hello world")
print(len(query))
# The Pinecone index in this notebook is created with dimension=384, so fail
# here, with a clear message, if the model ever produces a different size.
assert len(query) == 384, f"expected 384-dimensional embeddings, got {len(query)}"

384


In [54]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the call's
# return value is echoed as the cell output.
load_dotenv()

True

In [55]:
import os

# Read the API keys from the environment (each is None when the variable is unset)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [43]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# NOTE: `Pinecone` here is the Pinecone gRPC client. A later cell rebinds the
# same name to LangChain's vector-store class, so re-run this cell before
# using `Pinecone(...)` as a client again.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "telemedai"

# Create the index only when it does not exist yet. Calling create_index for
# an existing name fails, which made this cell impossible to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding size of all-MiniLM-L6-v2
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [56]:
import os

# Write the keys back into os.environ (presumably so client libraries that
# read these variables can find them). os.environ only accepts strings, so a
# key that was never set (None) is reported with an explicit message instead
# of the opaque "str expected, not NoneType" TypeError.
if PINECONE_API_KEY is None:
    raise RuntimeError("PINECONE_API_KEY is not set; define it in the environment or in the .env file.")
if OPENAI_API_KEY is None:
    raise RuntimeError("OPENAI_API_KEY is not set; define it in the environment or in the .env file.")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [48]:
# Embed each chunk and store the embeddings in Pinecone
import hashlib

from langchain_community.vectorstores import Pinecone

# Derive a deterministic vector ID for every chunk from its source, page and
# text. Without explicit IDs each run of this cell inserts a fresh copy of all
# chunks, so re-running it would duplicate the whole corpus in the index; with
# stable IDs a re-run overwrites the same vectors instead.
chunk_ids = [
    hashlib.sha256(
        "|".join(
            (
                str(doc.metadata.get("source")),
                str(doc.metadata.get("page")),
                doc.page_content,
            )
        ).encode("utf-8")
    ).hexdigest()
    for doc in chunks
]

docsearch = Pinecone.from_documents(
    documents=chunks,
    embedding=embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

In [None]:
# Connect to the index that already exists in Pinecone (no chunks are uploaded here)
from langchain_community.vectorstores import Pinecone

docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)

In [50]:
# Echo the vector store object; the notebook displays its repr.
docsearch

<langchain_community.vectorstores.pinecone.Pinecone at 0x799299312f00>

In [51]:
# Build a similarity-search retriever over the vector store, with k=5 results per query
retrival = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

In [52]:
# Run a sample query through the retriever
searched_docs = retrival.invoke("What is acne?")

In [53]:
# Echo the retrieved documents; the notebook displays the list.
searched_docs

[Document(metadata={'page': 39.0, 'source': '../data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(metadata={'page': 38.0, 'source': '../data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(metadata={'page': 37.0, 'source': '../data/Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical t

In [57]:
# The retrieved context is passed through an LLM to generate a refined response
from langchain_openai import OpenAI

llm = OpenAI(
    temperature=0.4,
    max_tokens=500,
)

In [58]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering model; the retrieved documents are
# substituted for the {context} placeholder at the end.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Two-message chat prompt: the system instructions, then the user's question
# supplied through the {input} placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [59]:
# Chain that combines the LLM with the prompt (which carries the {context} placeholder)
question_answering_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG chain: built from the `retrival` retriever and the question-answering chain above
rag_chain = create_retrieval_chain(retrival, question_answering_chain)


In [62]:
# Ask a sample question and print only the "answer" field of the result
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])



Acne is a common skin disease that causes pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. Isotretinoin (Accutane) is a medication prescribed for severe cases of acne. It works by reducing the amount of oil produced by the skin.
