# **Data Preparation Pipeline**

In [1]:
from google.colab import userdata
import os

os.environ['GOOGLE_API_KEY'] = userdata.get("GOOGLE_API_KEY")
os.environ['HUGGINGFACEHUB_ACCESS_TOKEN'] = userdata.get("HUGGINGFACEHUB_ACCESS_TOKEN")
os.environ['OPENAI_API_KEY'] = userdata.get("OPENAI_API_KEY")
os.environ["PINECONE_API_KEY"] = userdata.get("PINECONE_API_KEY")

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
!pip -q install langchain langchain-pinecone langchain-google-genai openai-agents langchain-community tiktoken python-dotenv pypdf langchain-huggingface sentence-transformers pinecone unstructured

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.9/193.9 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m28.4 MB/s[0m eta [36m0:

# **Imports & API setup**

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings,ChatGoogleGenerativeAI,GoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
import os


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [5]:
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
import time

index_name = "semester-books"

# if index_name not in pc.list_indexes():
#     pc.create_index(
#         index_name,
#         dimension=384,
#         metric='cosine',
#         spec=ServerlessSpec(cloud='aws', region='us-east-1')
#     )
#     # wait for index to be initialized
#     while not pc.describe_index(index_name).status.ready:
#         time.sleep(1)

PINECONE = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings,
    namespace="linear_algebra"
)

# **OpenAI Agents SDK Testing**

In [7]:
from agents import Agent,Runner,OpenAIChatCompletionsModel
from openai import AsyncOpenAI

external_client = AsyncOpenAI(
    base_url = "https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key = os.getenv("GOOGLE_API_KEY"),
)

model = OpenAIChatCompletionsModel(
    model="gemini-1.5-flash",
    openai_client = external_client,
)

In [8]:
agent:Agent = Agent(
    name="Assistant",
    instructions="You are a helpful assistant",
    model=model,
)

response = Runner.run_sync(agent,"Hi")
print(response.final_output)

Hi there! How can I help you today?



In [9]:
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
chat_model.invoke("HI")

AIMessage(content='Hi there! How can I help you today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-1.5-flash', 'safety_ratings': []}, id='run--5acde8be-c8d5-407a-8cc9-6348d19c8d5b-0', usage_metadata={'input_tokens': 1, 'output_tokens': 11, 'total_tokens': 12, 'input_token_details': {'cache_read': 0}})

# **Step 1a → Multi-Document Ingestion (per subject)**

## **Linear Algebra**

In [10]:
loader = PyPDFLoader("/content/Linear_Algebra.pdf")
docs = loader.load()

In [11]:
len(docs)

579

In [12]:
docs[340]

Document(metadata={'producer': 'xdvipdfmx (0.7.9)', 'creator': 'HELIOS pdfcat', 'creationdate': '2015-01-21T14:42:45+05:30', 'codemantra, llc': 'http://www.codemantra.com', 'moddate': "D:20240310035050Z00'00", 'author': '', 'universal pdf': 'The process that creates this PDF constitutes a trade secret of codeMantra, LLC and is protected by the copyright laws of the United States', 'title': '0321982630.pdf', 'source': '/content/Linear_Algebra.pdf', 'total_pages': 579, 'page': 340, 'page_label': '324'}, page_content='324 CHAPTER 5 Eigenvalues and Eigenvectors\nScale Ax2 by 1=\x162 to get x3, and so on. The results of MATLAB calculations for the\nﬁrst ﬁve iterations are arranged in Table 2.\nTABLE 2 The Power Method for Example 2\nk 0 1 2 3 4 5\nxk\n\x140\n1\n\x15 \x14 1\n:4\n\x15 \x14 1\n:225\n\x15 \x14 1\n:2035\n\x15 \x14 1\n:2005\n\x15 \x14 1\n:20007\n\x15\nAxk\n\x145\n2\n\x15 \x14 8\n1:8\n\x15 \x14 7:125\n1:450\n\x15 \x14 7:0175\n1:4070\n\x15 \x14 7:0025\n1:4010\n\x15 \x14 7:00036\n1:

## **Discrete Structures**

In [13]:
loader1 = PyPDFLoader("/content/Discrete mathematics and its applications-BY Kenneth H. Rosen -McGraw-Hill (2013).pdf")
docs1 = loader.load()

In [14]:
len(docs1)

579

In [15]:
docs1[340]

Document(metadata={'producer': 'xdvipdfmx (0.7.9)', 'creator': 'HELIOS pdfcat', 'creationdate': '2015-01-21T14:42:45+05:30', 'codemantra, llc': 'http://www.codemantra.com', 'moddate': "D:20240310035050Z00'00", 'author': '', 'universal pdf': 'The process that creates this PDF constitutes a trade secret of codeMantra, LLC and is protected by the copyright laws of the United States', 'title': '0321982630.pdf', 'source': '/content/Linear_Algebra.pdf', 'total_pages': 579, 'page': 340, 'page_label': '324'}, page_content='324 CHAPTER 5 Eigenvalues and Eigenvectors\nScale Ax2 by 1=\x162 to get x3, and so on. The results of MATLAB calculations for the\nﬁrst ﬁve iterations are arranged in Table 2.\nTABLE 2 The Power Method for Example 2\nk 0 1 2 3 4 5\nxk\n\x140\n1\n\x15 \x14 1\n:4\n\x15 \x14 1\n:225\n\x15 \x14 1\n:2035\n\x15 \x14 1\n:2005\n\x15 \x14 1\n:20007\n\x15\nAxk\n\x145\n2\n\x15 \x14 8\n1:8\n\x15 \x14 7:125\n1:450\n\x15 \x14 7:0175\n1:4070\n\x15 \x14 7:0025\n1:4010\n\x15 \x14 7:00036\n1:

# **Step 1b → Subject-Aware Text Splitting**


## **Linear Algebra**

In [None]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_documents(docs)

In [None]:
len(chunks)

1968

In [None]:
chunks[200]

Document(metadata={'producer': 'xdvipdfmx (0.7.9)', 'creator': 'HELIOS pdfcat', 'creationdate': '2015-01-21T14:42:45+05:30', 'codemantra, llc': 'http://www.codemantra.com', 'moddate': "D:20240310035050Z00'00", 'author': '', 'universal pdf': 'The process that creates this PDF constitutes a trade secret of codeMantra, LLC and is protected by the copyright laws of the United States', 'title': '0321982630.pdf', 'source': '/content/Linear_Algebra.pdf', 'total_pages': 579, 'page': 64, 'page_label': '48'}, page_content='48 CHAPTER 1 Linear Equations in Linear Algebra\n1.5 EXERCISES\nIn Exercises 1–4, determine if the system has a nontrivial solution.\nTry to use as few row operations as possible.\n1. 2x1 \x00 5x2 C 8x3 D 0\n\x002x1 \x00 7x2 C x3 D 0\n4x1 C 2x2 C 7x3 D 0\n2. x1 \x00 3x2 C 7x3 D 0\n\x002x1 C x2 \x00 4x3 D 0\nx1 C 2x2 C 9x3 D 0\n3. \x003x1 C 5x2 \x00 7x3 D 0\n\x006x1 C 7x2 C x3 D 0\n4. \x005x1 C 7x2 C 9x3 D 0\nx1 \x00 2x2 C 6x3 D 0\nIn Exercises 5 and 6, follow the method of Exa

In [None]:
vector_store = PINECONE.add_documents(chunks,namespace="linear_algebra")

## **Discrete Structures**

In [16]:
PINECONE_DIS = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings,
    namespace="discrete_structures"
)

In [None]:
splitter1 = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks1 = splitter1.split_documents(docs1)

In [None]:
len(chunks1)

1968

In [None]:
chunks1[189]

Document(metadata={'producer': 'xdvipdfmx (0.7.9)', 'creator': 'HELIOS pdfcat', 'creationdate': '2015-01-21T14:42:45+05:30', 'codemantra, llc': 'http://www.codemantra.com', 'moddate': "D:20240310035050Z00'00", 'author': '', 'universal pdf': 'The process that creates this PDF constitutes a trade secret of codeMantra, LLC and is protected by the copyright laws of the United States', 'title': '0321982630.pdf', 'source': '/content/Linear_Algebra.pdf', 'total_pages': 579, 'page': 60, 'page_label': '44'}, page_content='2\n4\n3 5 \x004 0\n\x003 \x002 4 0\n6 1 \x008 0\n3\n5 \x18\n2\n4\n3 5 \x004 0\n0 3 0 0\n0 \x009 0 0\n3\n5 \x18\n2\n4\n3 5 \x004 0\n0 3 0 0\n0 0 0 0\n3\n5\nSince x3 is a free variable, Ax D 0 has nontrivial solutions (one for each choice of x3).\nTo describe the solution set, continue the row reduction of\x8c A 0 \x8d to reduced echelon\nform:\n2\n4\n1 0 \x004\n3 0\n0 1 0 0\n0 0 0 0\n3\n5\nx1 \x00 4\n3 x3 D 0\nx2 D 0\n0 D 0\nSolve for the basic variables x1 and x2 and obtain x1

In [None]:
vector_store1 = PINECONE_DIS.add_documents(chunks1,namespace="discrete_structures")

# **Step 2 → Retrieval**

## **Linear_Algebra**

In [17]:
retriever_lin = PINECONE.as_retriever(search_type="mmr",search_kwargs={"k":4,"lambda_mul":0.8})

In [18]:
retriever_lin

VectorStoreRetriever(tags=['PineconeVectorStore', 'HuggingFaceEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x78bbf6dabf50>, search_type='mmr', search_kwargs={'k': 4, 'lambda_mul': 0.8})

In [None]:
retriever_lin.invoke("What is linear algebra")

[Document(metadata={'author': '', 'codemantra, llc': 'http://www.codemantra.com', 'creationdate': '2015-01-21T14:42:45+05:30', 'creator': 'HELIOS pdfcat', 'moddate': "D:20240310035050Z00'00", 'page': 17.0, 'page_label': '1', 'producer': 'xdvipdfmx (0.7.9)', 'source': '/content/Linear_Algebra.pdf', 'title': '0321982630.pdf', 'total_pages': 579.0, 'universal pdf': 'The process that creates this PDF constitutes a trade secret of codeMantra, LLC and is protected by the copyright laws of the United States'}, page_content='in many other ﬁelds have employed computers to analyze\nmathematical models. Because of the massive amounts of\ndata involved, the models are usually linear; that is, they\nare described by systems of linear equations.\nThe importance of linear algebra for applications has\nrisen in direct proportion to the increase in computing\npower, with each new generation of hardware and\nsoftware triggering a demand for even greater capabilities.\nComputer science is thus intricatel

## **Discrete_Structures**

In [19]:
retriever_dis = PINECONE_DIS.as_retriever(search_type="mmr",search_kwargs={"k":4,"lambda_mul":0.8})

In [20]:
retriever_dis

VectorStoreRetriever(tags=['PineconeVectorStore', 'HuggingFaceEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x78bbcb3ef740>, search_type='mmr', search_kwargs={'k': 4, 'lambda_mul': 0.8})

In [None]:
retriever_dis.invoke("What is discrete structures")

[Document(metadata={'author': '', 'codemantra, llc': 'http://www.codemantra.com', 'creationdate': '2015-01-21T14:42:45+05:30', 'creator': 'HELIOS pdfcat', 'moddate': "D:20240310035050Z00'00", 'page': 209.0, 'page_label': '193', 'producer': 'xdvipdfmx (0.7.9)', 'source': '/content/Linear_Algebra.pdf', 'title': '0321982630.pdf', 'total_pages': 579.0, 'universal pdf': 'The process that creates this PDF constitutes a trade secret of codeMantra, LLC and is protected by the copyright laws of the United States'}, page_content='EXAMPLE 3 Let S be the space of all doubly inﬁnite sequences of numbers (usually\nwritten in a row rather than a column):\nfykg D .: : : ; y\x002; y\x001; y0; y1; y2; : : :/\nIf f´kg is another element of S, then the sum fykg C f´kg is the sequence fyk C ´kg\nformed by adding corresponding terms of fykg and f´kg. The scalar multiplec fykg is\nthe sequence fcykg. The vector space axioms are veriﬁed in the same way as forRn.\nElements of S arise in engineering, for exampl

# **Step 3 → Tool Definitions**

In [33]:
from agents import function_tool
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [42]:
def format_docs(retrieved_docs):
    context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
    return context_text
llm = GoogleGenerativeAI(model="models/gemini-1.5-flash")

## **Linear Algebra**

In [46]:
@function_tool
def answer_from_linear_algebra(query:str)->str:
  """
  Answer questions about linear algebra using structured RAG chain
  """
  print(f"[Debug] answer_from_linear_algebra function call with query {query}")
  parallel_chain = RunnableParallel({
    'context': retriever_lin | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
  }
  )

  prompt = PromptTemplate.from_template(
        """You are a linear algebra expert. Answer the question using only the provided context.

        Context: {context}
        Question: {question}

        Answer:"""
    )

  parser = StrOutputParser()

  main_chain = parallel_chain | prompt | llm | parser
  result = main_chain.invoke(query)
  print(f"[Debug] RAG function call with response ***{result}***")
  return result

## **Discrete Structures**

In [49]:
@function_tool
def answer_from_discrete_structures(query:str)->str:
  """
  Answer questions about discrete structures using structured RAG chain
  """
  print(f"[Debug] answer_from_discrete_structures function call with query {query}")
  parallel_chain = RunnableParallel({
    'context': retriever_dis | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
  }
  )

  prompt = PromptTemplate.from_template(
        """You are a discrete structures expert. Answer the question using only the provided context.

        Context: {context}
        Question: {question}

        Answer:"""
    )

  parser = StrOutputParser()

  main_chain = parallel_chain | prompt | llm | parser
  result = main_chain.invoke(query)
  print(f"[Debug] RAG function call with response ***{result}***")
  return result

In [50]:
import asyncio
qa_agent = Agent(
    name="QA Agent",
    instructions="""
    You are a specialized academic tutor with access to course textbooks and materials.

CRITICAL INSTRUCTIONS:
1. ALWAYS use your tools to search the knowledge base - never answer from your general knowledge
2. Analyze each question to identify the subject area:
   - Discrete structures: cardinality, sets, graphs, algorithms, combinatorics, logic
   - Linear algebra: matrices, vectors, eigenvalues, transformations, vector spaces

3. When students say they are "confused" or need "deep explanation":
   - Use the appropriate tool to get comprehensive context
   - Ask the tool to provide step-by-step explanations
   - Focus on foundational concepts first

4. Always start your response by using the relevant tool, then provide a clear, educational answer based on the retrieved information.

Your role is to be a patient, knowledgeable tutor who helps students understand complex academic concepts using their course materials.
    """,
    tools=[answer_from_linear_algebra,answer_from_discrete_structures],
    # Use OpenAIChatCompletionsModel with the pre-configured external_client
    model=model,
)

async def main():
    agent_question = "From discrete structures, tell me about cardinality of steps and explain very deeply i am confused at that."

    # Run the agent
    result = await Runner.run(qa_agent, agent_question)

    # Extract and print the final answer
    # print("Agent result:", result)
    print("Agent's answer:", result.final_output)

if __name__ == "__main__":
    asyncio.run(main())

[Debug] answer_from_discrete_structures function call with query Explain cardinality of sets deeply
[Debug] RAG function call with response ***The provided text does not offer a definition or explanation of cardinality of sets.  Therefore, I cannot answer your question using only the provided context.***
Agent's answer: I apologize, but the provided materials do not contain information on the cardinality of steps in the context of discrete structures.  To help you understand this concept, we need a more precise definition of "steps" and the context in which their cardinality is being considered.

The term "cardinality" generally refers to the number of elements in a set.  If "steps" refers to elements within a set, then the cardinality would simply be the count of those steps.  For example:

* **Set of Steps in an Algorithm:** If you have an algorithm with a defined sequence of steps, the cardinality would be the number of steps in that sequence.  If the algorithm has five steps, the c