In [5]:
import os

# Step up one directory so the relative "data" folder used by later cells
# resolves (presumably the project root -- confirm against the repo layout).
# The guard makes the cell idempotent: the previous unconditional chdir moved
# one level higher every time the cell was re-run in the same kernel.
if not os.path.isdir("data"):
    os.chdir("../")

In [6]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files are matched with the
            glob pattern ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns when each matched file
        is read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [8]:
# Read every PDF from the relative "data" directory.
extracted_data = load_pdf_files(data="data")

In [9]:
# Number of loaded documents; shown as the cell output.
len(extracted_data)


104

In [10]:
# Preview only the first two documents. Echoing the whole list writes the
# full text of every loaded page into the notebook output.
extracted_data[:2]

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2025-02-21T15:04:22+05:30', 'moddate': '2025-03-07T15:15:16+05:30', 'title': '', 'trapped': '/False', 'source': 'data/Bio10th.pdf', 'total_pages': 104, 'page': 0, 'page_label': '1'}, page_content='Standard   X\nState Council of Educational Research  \nand Training (SCERT), Kerala \n2025\nGovernment of Kerala\nDepartment of General Education\nPrepared by\nPart    1\nBiology'),
 Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2025-02-21T15:04:22+05:30', 'moddate': '2025-03-07T15:15:16+05:30', 'title': '', 'trapped': '/False', 'source': 'data/Bio10th.pdf', 'total_pages': 104, 'page': 1, 'page_label': '2'}, page_content='State Council of Educational  Research and Training (SCERT)\nPoojappura, Thiruvananthapuram 695012, Kerala \nWebsite  :  www.scert.kerala.gov.in \ne-mail  :  scertkerala@gmail.com\nPhone

In [11]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return new documents whose metadata holds only the ``source`` key.

    Each output document keeps the ``page_content`` of its input; all other
    metadata fields are dropped. ``source`` is ``None`` when the input
    metadata has no such key.

    Args:
        docs: Documents to reduce.

    Returns:
        A new list with one reduced ``Document`` per input, in the same order.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]



In [12]:
minimal_docs = filter_to_minimal_docs(extracted_data)
# Show a two-document sample rather than echoing the text of every page.
minimal_docs[:2]

[Document(metadata={'source': 'data/Bio10th.pdf'}, page_content='Standard   X\nState Council of Educational Research  \nand Training (SCERT), Kerala \n2025\nGovernment of Kerala\nDepartment of General Education\nPrepared by\nPart    1\nBiology'),
 Document(metadata={'source': 'data/Bio10th.pdf'}, page_content='State Council of Educational  Research and Training (SCERT)\nPoojappura, Thiruvananthapuram 695012, Kerala \nWebsite  :  www.scert.kerala.gov.in \ne-mail  :  scertkerala@gmail.com\nPhone  :  0471 - 2341883, Fax : 0471 - 2341869\nTypesetting  and Layout :  SCERT\nFirst Edition : 2025\nPrinted at : KBPS, Kakkanad, Kochi-30\n© Department of General Education, Government of Kerala\nTHE NATIONAL ANTHEM\nJana-gana-mana adhinayaka, jaya he\nBharatha-bhagya-vidhata\nPunjab-Sindh-Gujarat-Maratha\nDravida-Utkala-Banga\nVindhya-Himachala-Yamuna-Ganga\nUchchala-Jaladhi-taranga\nTava subha name jage,\nTava subha asisa mage,\nGahe tava jaya gatha\nJana-gana-mangala-dayaka jaya he\nBharatha-bha

In [13]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks for embedding.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Maximum chunk length as measured by ``len``. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Length shared between neighbouring chunks, measured
            the same way. Defaults to 20, the previous hard-coded value.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        the given documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(minimal_docs)

In [14]:
# Chunk the reduced documents with the splitter's default settings.
text_chunks = text_split(minimal_docs=minimal_docs)


In [15]:
from langchain_community.embeddings import HuggingFaceEmbeddings

def download_embedding():
    """Create the embedding wrapper for the ``BAAI/bge-small-en-v1.5`` model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with that model name.
    """
    return HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Build the embedding model once; it is passed to the vector store further down.
embedding = download_embedding()

  embeddings= HuggingFaceEmbeddings(
Loading weights: 100%|██████████| 199/199 [00:00<00:00, 1581.11it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [16]:
import os

from dotenv import load_dotenv

# Load variables from a .env file; the returned flag is shown as the cell output.
load_dotenv()

True

In [17]:
import os
from dotenv import load_dotenv

load_dotenv()

# Fail fast with a readable message when a key is absent. Previously a missing
# key surfaced as an opaque TypeError from assigning None into os.environ.
_missing_keys = [
    key for key in ("PINECONE_API_KEY", "GEMINI_API_KEY") if not os.getenv(key)
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file."
    )

# Both values are already present in os.environ at this point, so writing
# them back (as the old code did) is unnecessary.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]


In [18]:
from pinecone import Pinecone

# Keep a lowercase alias of the loaded key and build the Pinecone client from it.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [19]:
import os

from pinecone import Pinecone, ServerlessSpec

# SECURITY: the API key used to be hard-coded in this cell. It is now read
# from the environment. The key that was committed must be treated as leaked
# and rotated in the Pinecone console.
_pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not _pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in a .env file."
    )

pc = Pinecone(api_key=_pinecone_api_key)

index_name = "adhd-bot"

# Names of the indexes that already exist for this account/project.
existing_indexes = [idx["name"] for idx in pc.list_indexes()]

# Create the index only on the first run so the cell can be re-executed safely.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector size (384 for BAAI/bge-small-en-v1.5;
        # re-check this value if the embedding model changes).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

index = pc.Index(index_name)
print("Index ready:", index_name)


Index ready: adhd-bot


In [20]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks and store them in the Pinecone index named by `index_name`.
# NOTE(review): re-running this cell presumably uploads the same chunks again;
# confirm whether duplicates matter before executing it more than once.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embedding,
    index_name=index_name,
)


In [21]:
# The keyword was misspelled `search_kwargsss`, which is not a recognised
# option, so the intended k=3 limit was not applied. With the correct name the
# retriever is configured to return the top 3 matches per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [22]:
# Try the retriever on a sample query before it is wired into the chain.
retrieved_docs=retriever.invoke("What is biology") 

In [23]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model; the API key is read from the GEMINI_API_KEY environment variable.
chatModel = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0.7,
    google_api_key=os.getenv("GEMINI_API_KEY"),
)

# Connectivity check: send one message and print the text of the reply.
response = chatModel.invoke("Hello! Can you hear me?")
print(response.content)

As a large language model, I don't have ears or the ability to "hear" in the way humans do. I process text that you type to me.

So, while I can't literally hear you, I can **read and understand your typed words**.

**How can I help you today?**


In [24]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [25]:
# System instructions for the assistant. `{context}` is a template placeholder
# that is filled in when the chain runs.
system_prompt = (
    "You are an ADHD-specialized Study Assistant for 10th-standard students. "
    "Your goal is to reduce cognitive load and prevent overwhelm. "
    "\n\n"
    "Rules for your response:\n"
    "1. Use 'The 3-Bullet Rule': Summarize information into exactly 3 clear, actionable bullets.\n"
    "2. Use 'Micro-Tasks': If the student asks about a big topic, break it down into 5-minute study tasks.\n"
    "3. Highlight Key Terms: Use **bold** for essential vocabulary.\n"
    "4. Tone: Be encouraging and non-judgmental. If you don't find the answer in the context, "
    "suggest a related basic concept from the 10th-grade syllabus.\n\n"
    "Context:\n{context}"
)

# Two-message chat template: the system instructions, then the user's `{input}`.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [26]:
# Chain that combines documents with the prompt and sends them to the chat model.
question_answer_chain = create_stuff_documents_chain(
    chatModel,
    prompt,
)

# Full pipeline: retrieve documents for the input, then run the chain above.
rag_chain = create_retrieval_chain(
    retriever,
    question_answer_chain,
)

In [29]:
# Ask the retrieval chain a question and print the text stored under "answer".
response = rag_chain.invoke(
    {"input": "What is mitochondria?"}
)
print(response["answer"])

That's a great question! The text you provided doesn't specifically define mitochondria.

However, based on what you're learning about **molecular biology** and **evolutionary relationships**, understanding structures like mitochondria is super important!

Here's a quick breakdown of what **mitochondria** are, which might be helpful for your studies:

*   **Powerhouses of the Cell:** Mitochondria are like tiny energy factories inside most of your cells. They take in nutrients and oxygen and convert them into a form of energy your cells can use.
*   **Cellular Respiration:** This energy-making process is called **cellular respiration**. It's a key part of how living things get the energy they need to function.
*   **Important for Life:** Without mitochondria working properly, cells wouldn't have enough energy to do all their jobs, which is essential for the survival of organisms.
