In [70]:
from langchain import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os

In [8]:
def load_data(data):
 loader = DirectoryLoader(data,
                          glob="*.pdf",
                          loader_cls=PyPDFLoader

 )
 documents = loader.load()
 return documents

In [15]:
extracted_data = load_data("../Data")

In [20]:
extracted_data[0]

Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': '..\\Data\\Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content='')

In [21]:
def text_split(extracted_data):
 text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500,chunk_overlap=20)
 text_chunks= text_splitter.split_documents(extracted_data)
 return text_chunks

In [24]:
text_chunks = text_split(extracted_data)

In [25]:
len(text_chunks)

5860

In [37]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [46]:
def download_HuggingFaceEmbeddings():
  embiddings =HuggingFaceBgeEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  return embiddings

In [47]:
embiddings = download_HuggingFaceEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [51]:
embiddings

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [48]:
embidding = embiddings.embed_query("Hello world")

In [50]:
len(embidding)

384

In [56]:
load_dotenv()

True

In [59]:
my_key = os.getenv("PINECONE_API_KEY")
index_name = "testing"

In [61]:
vector_store = PineconeVectorStore.from_documents(text_chunks, embiddings, index_name=index_name)

In [65]:
top_3 =vector_store.similarity_search("What are Alergies",k=3)

In [68]:
for i in range(len(top_3)):print(top_3[i].page_content,"\n","--"*10)

and the eustachian tube, hypertrophied adenoids can also
obstruct it and cause middle ear infections.
KEY TERMS
Eustacian tube—A tube connecting the middle ear
with the back of the nose, allowing air pressure to
equalize within the ear whenever it opens, such as
with yawning.
Hyperplastic—Overgrown.
Hypertrophy—Overgrowth.
Strep throat—An infection of the throat caused by
bacteria of the Streptococcus family, which causes
tonsillitis.
Ulcerated—Damaged so that the surface tissue is 
 --------------------
off the invaders. Often it does not completely return to its
former size. Each subsequent infection leaves behind a
larger set of tonsils and adenoids. To make matters
worse, the sponge-like structure of these hypertrophied
glands can produce safe havens for germs where the
body cannot reach and eliminate them. Before antibi-
otics and the reduction in infectious childhood diseases
over the past few generations, tonsils and adenoids
caused greater health problems.
Causes and symptoms 


In [73]:
template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""


In [76]:
prompt_template=PromptTemplate(template=template,
                               input_variables=["context","question"],
)
chain_type_kwargs={"prompt":prompt_template}

In [71]:
groq_key = os.getenv("GROQ_API_KEY")

In [72]:
llm = ChatGroq(model = "deepseek-r1-distill-llama-70b"
               ,api_key= groq_key)

In [85]:
qa = RetrievalQA.from_chain_type(llm=llm,chain_type_kwargs=chain_type_kwargs,
           retriever = vector_store.as_retriever(search_kwargs={"k":3}),
            chain_type="stuff" )


In [89]:
query ="what is acne" 
result = qa.invoke(query)
print(result["result"])

<think>
Okay, I need to answer the question "What is acne?" using the provided context. Let me go through each piece of information step by step.

First, the context mentions that acne is the general name given to a skin disorder where the sebaceous glands become inflamed. That's a good starting point. I should explain that acne is a skin condition involving inflammation of the sebaceous glands.

Looking further, there's a mention of "Acne vulgaris" affecting a woman’s face. I remember that Acne vulgaris is the most common form of acne, so it's important to note that it's the most common type.

I also see that the context includes references to various articles and journals, but the specific details about what causes acne or its symptoms aren't provided here. I should stick to the information given without adding anything extra.

So, putting it all together, the answer should be concise, stating that acne is a skin disorder characterized by inflammation of the sebaceous glands, with th