### RAG System with Agents : For Indian Railways Annual Report 2023-2024 & Train Details using DataSet
-----------------

In [1]:
## Importing necessary libraries
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb


In [2]:
from llama_index.readers.file import PDFReader

In [3]:
import dotenv
dotenv.load_dotenv()

True

In [4]:
reader = PDFReader()
docs = reader.load_data("data/Indian_Railways_Annual_Report _23_24.pdf")

In [5]:
# 2. Create a text splitter with chunk size & overlap
splitter = SentenceSplitter(chunk_size=500, chunk_overlap=50)

In [6]:
# Convert docs into chunks (nodes)
nodes = splitter.get_nodes_from_documents(docs)

In [7]:
# 3. Setup Chroma client for persistence
chroma_client = chromadb.PersistentClient(path="./chroma_store")
chroma_collection = chroma_client.get_or_create_collection("my_documents")

In [None]:
# 4. Create Chroma vector store
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [9]:
# 5. Build storage context with Chroma backend
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# 6. Store the chunked nodes in Chroma
index = VectorStoreIndex(nodes, storage_context=storage_context)

In [11]:
query_engine = index.as_query_engine()


In [176]:
response = query_engine.query("At What Year Indian Railways have conducted full scale disaster management exercise?")
print(response)

Indian Railways conducted a full scale disaster management exercise in the year 2023.


In [13]:
resp = query_engine.query("What ia the  Cost of Extension of Harbour Line between Goregaon-Borivali?")
print(resp)

The cost of the Extension of Harbour Line between Goregaon-Borivali is 826.


#### Extracting Table Details from PDF using PDFPlumber

In [15]:
import pdfplumber
from llama_index.core.schema import Document
import pandas as pd

def extract_tables_to_documents(pdf_path):
    """
    Extracts tables from a PDF and converts them into a list of LlamaIndex Document objects.
    Each document represents a single table.
    """
    documents = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            page_tables = page.extract_tables()
            if page_tables:
                for table_num, table in enumerate(page_tables):
                    # Convert the list of lists into a more readable string format
                    # This helps the LLM understand the table structure
                    table_str = f"Table on page {page_num + 1}, table {table_num + 1}:\n"
                    # Add a header row
                    header = table
                    header_row = [str(cell) if cell is not None else "" for cell in header]
                    table_str += "| " + " | ".join(header_row) + " |\n"
                    table_str += "| " + " | ".join(["---"] * len(header_row)) + " |\n"
                    # Add data rows
                    for row in table[1:]:
                        # Filter out None values to prevent errors
                        clean_row = [str(cell) if cell is not None else "" for cell in row]
                        table_str += "| " + " | ".join(clean_row) + " |\n"
                    
                    # Create a Document object
                    doc = Document(text=table_str, metadata={"page_number": page_num + 1})
                    documents.append(doc)
    return documents


pdf_file = "data/Indian_Railways_Annual_Report _23_24.pdf"
documents = extract_tables_to_documents(pdf_file)

In [16]:
# Convert docs into chunks (nodes)
nodes_table = splitter.get_nodes_from_documents(documents)

In [17]:
index.insert_nodes(nodes_table, storage_context=storage_context)

In [18]:
query_engine = index.as_query_engine()

In [19]:
resp2 = query_engine.query("Ahmednagar - New Loni - Ashti belongs to which state?")
print(resp2)

Maharashtra


In [20]:
resp3 = query_engine.query("What is New Bongaigaon- Kamakhya via Rangiya project length and anticipated cost?")
print(resp3)

The New Bongaigaon- Kamakhya via Rangiya project length is 176 km and the anticipated cost is 4,060.


In [21]:
resp4 = query_engine.query("Summarize theKey Financial Highlights?")
print(resp4)

The key financial highlights include Gross Traffic Receipts of 2,55,272.63 crore, Ordinary Working Expenses of 1,91,093.61 crore, Contribution to Reserve Funds of 59,800.00 crore, Net Traffic Receipts (Operating Profit) of 250,893.61 crore, Miscellaneous Transactions (Net) of 4,379.02 crore, Net Revenue (Gross Profit) of 3,259.68 crore.


### Using Train DataSet to extract general Train Info

In [22]:
import pandas as pd
data_frames = pd.read_csv("data_set/train_info.csv")

In [23]:
data_frames.head()  # Display the first few rows of the DataFrame

Unnamed: 0,Train_No,Train_Name,Source_Station_Name,Destination_Station_Name,days
0,107,SWV-MAO-VLNK,SAWANTWADI ROAD,MADGOAN JN.,Saturday
1,108,VLNK-MAO-SWV,MADGOAN JN.,SAWANTWADI ROAD,Friday
2,128,MAO-KOP SPEC,MADGOAN JN.,CHHATRAPATI SHAHU MAHARAJ TERMINUS,Friday
3,290,PALACE ON WH,DELHI-SAFDAR JANG,DELHI-SAFDAR JANG,Wednesday
4,401,BSB BHARATDA,AURANGABAD,VARANASI JN.,Saturday


In [24]:
data_frames.to_json("data_set/train_info.json", orient="records")  # Save as JSON

In [25]:
from llama_index.core.indices.struct_store import JSONQueryEngine

In [26]:
train_info_schema = {
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "Train_No": {
        "type": "integer",
        "description": "The unique number of the train."
      },
      "Train_Name": {
        "type": "string",
        "description": "The name of the train."
      },
      "Source_Station_Name": {
        "type": "string",
        "description": "The starting station of the train's journey."
      },
      "Destination_Station_Name": {
        "type": "string",
        "description": "The final destination of the train's journey."
      },
      "days": {
        "type": "string",
        "description": "The day of the week the train operates."
      }
    },
    "required": ["Train_No", "Train_Name", "Source_Station_Name", "Destination_Station_Name", "days"]
  }
}

In [27]:
import json

with open("data_set/train_info.json", "r", encoding="utf-8") as f:
    train_info_json_obj = json.load(f)

In [28]:
nl_query_engine = JSONQueryEngine(
    json_value=train_info_json_obj,
    json_schema=train_info_schema
)

In [193]:
nl_response = nl_query_engine.query(
    "Give the details for the train no 107?",
)
print(nl_response)

Train Number: 107
Train Name: SWV-MAO-VLNK
Source Station: SAWANTWADI ROAD
Destination Station: MADGOAN JN.
Operating Day: Saturday


In [196]:
nl_response = nl_query_engine.query(
    "Give the details for the train no 108?",
)
print(nl_response)

The train with number 108 is named VLNK-MAO-SWV. It starts its journey from MADGOAN JN. and reaches its final destination at SAWANTWADI ROAD. This train operates on Fridays.


In [198]:
nl_response = nl_query_engine.query(
    "Give the details for the train whose name is 'PNBE-ASR FTR'?",
)
print(nl_response)

The train with the name 'PNBE-ASR FTR' has the following details:
- Train Number: 604
- Source Station: PATNA JN.
- Destination Station: AMRITSAR JN.
- Operating Day: Thursday


In [199]:
nl_response = nl_query_engine.query(
    "Give the details for the train whose destination station is 'SIRSA'?",
)
print(nl_response)

The details for the trains with the destination station 'SIRSA' are as follows:
1. Train No: 477, Train Name: FTR TRAIN NO, Source Station: SIRSA, Destination Station: SIRSA, Operating Day: Sunday
2. Train No: 14085, Train Name: HARYANA EXPR, Source Station: TILAK BRIDGE, Destination Station: SIRSA, Operating Day: Saturday
3. Train No: 54632, Train Name: DHURI -SSA P, Source Station: DHURI JN., Destination Station: SIRSA, Operating Day: Tuesday


### Creating Agents using QueryEngines

In [31]:
from llama_index.core.agent import AgentRunner

In [32]:
from llama_index.core.tools import QueryEngineTool

QueryEngine:Tool1: Retrive Train Info Details

In [83]:
query_engine_tool_1 = QueryEngineTool.from_defaults(
    query_engine=nl_query_engine,
    name="train_info_query_tool",
    description=(
        "This tool can answer questions related to Train details such as train journey from which place it starts and where it ends, and operations and it accept train no and other train details. "
        "Use it when the user asks about specific information that this query engine can handle."
        "Use this tool to get specific train information based on the train number or other details such as starting point and end point or destination.Do not use this tool for general queries or unrelated information."
    )
)

Query Engine Tool 2 : Indian Railways Station Annual Report 23-24

In [34]:
query_engine_tool_2 = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="train_annual_report_tool",
    description=(
        "This tool can answer questions related to Train Annual Report such as financial highlights, project details, and other operational information, Also" \
        "it can handle queries about the annual report's content and structure.Differnt set of question related to status and all" \
        "Use it when the user asks about specific information that this query engine can handle."
        "Do not use this tool for general queries or unrelated information which is outside of this context."
    )
)

In [None]:
from llama_index.core.agent import AgentRunner, ReActAgentWorker

In [82]:
from llama_index.llms.openai import OpenAI

In [84]:
from llama_index.core.agent.workflow import FunctionAgent
from llama_index.core.workflow import Context

agent = FunctionAgent(tools=[query_engine_tool_1,query_engine_tool_2], llm=OpenAI(model="gpt-4o"))

# context to hold the session/state
ctx = Context(agent)

1. Running Query 1 from Agent: Give the details for the train no 108?

In [85]:
from llama_index.core.agent.workflow import ToolCallResult, AgentStream

handler = agent.run("Give the details for the train no 108?", ctx=ctx)

async for ev in handler.stream_events():
    if isinstance(ev, ToolCallResult):
        print(
            f"Call {ev.tool_name} with args {ev.tool_kwargs}\nReturned: {ev.tool_output}"
        )
    elif isinstance(ev, AgentStream):
        print(ev.delta, end="", flush=True)

response = await handler

Call train_info_query_tool with args {'input': 'train no 108'}
Returned: Train number 108 is VLNK-MAO-SWV, which operates from MADGOAN JN. to SAWANTWADI ROAD on Fridays.
Train number 108, known as VLNK-MAO-SWV, operates from Madgaon Junction (MADGOAN JN.) to Sawantwadi Road (SAWANTWADI ROAD) on Fridays.

In [86]:
response.response.blocks[0].text if response.response.blocks else "No response received."

'Train number 108, known as VLNK-MAO-SWV, operates from Madgaon Junction (MADGOAN JN.) to Sawantwadi Road (SAWANTWADI ROAD) on Fridays.'

2. Running Query 2 from Agent: Status of level crossings on IR as on 01.04.2024?

In [87]:
handler2 = agent.run("Status of level crossings on IR as on 01.04.2024?", ctx=ctx)

In [88]:
rr = await handler2

In [89]:
print(rr.response.blocks[0].text if rr.response.blocks else "No response received.")

As of April 1, 2024, the status of level crossings on Indian Railways is as follows:

- Total number of level crossings: 17,777
- Number of manned level crossings: 17,260 (97%)
- Number of unmanned level crossings: 513 (3%)


3. Running Query 3 from Agent: Provide details about train no 504

In [90]:
handler3 = agent.run("provide details about train no 504", ctx=ctx)
trains = await handler3

In [91]:
trains.response.blocks[0].text if trains.response.blocks else "No response received."

'Train number 504, known as the PNBE-BTI FTR, operates from Patna Junction (PATNA JN.) to Bathinda Junction (BATHINDA JN) on Wednesdays.'

4. Running Query 4 from Agent: provide details about train whose name is VLNK-MAO-SWV

In [101]:
handler4 = agent.run("provide details about train whose name is VLNK-MAO-SWV", ctx=ctx)
trains = await handler4

In [102]:
trains.response.blocks[0].text if trains.response.blocks else "No response received."

'The train named VLNK-MAO-SWV, with train number 108, operates from Madgaon Junction (MADGOAN JN.) to Sawantwadi Road (SAWANTWADI ROAD) on Fridays.'

5. Running Query 5 from Agent: provide details about train whose name is VLNK-MAO-SWV

In [201]:
handler5 = agent.run("Total Number of Gazetted Staff Trained During 2023-24", ctx=ctx)
res = await handler5

In [203]:
res.response.blocks[0].text if res.response.blocks else "No response received."

'During the year 2023-24, a total of 64,338 Gazetted Staff members were trained.'

### Evaluation
----------------------------

Evaluation can be done using multiple Evaluator Such as Releveancy, Failthfulness and Correctness
We are choosing to evaluate using Relevency here with llama RelevancyEvaluator, we are performing this only for QueryEngine for PDF Files,
skipping the JSONEngine.

In [117]:
from llama_index.core.evaluation import RelevancyEvaluator
from llama_index.core import Settings
from llama_index.core.base.response.schema import Response

Evaluating Query Engine 2: query_engine_tool_2 (Annual Report)

In [204]:
# define evaluator
evaluator = RelevancyEvaluator()
your_eval_dataset = ["Ahmednagar - New Loni - Ashti belongs to which state?", 
                     "What is New Bongaigaon- Kamakhya via Rangiya project length and anticipated cost?",
                     "Total Number of Gazetted Staff Trained During 2023-24?"]
# query index
for query in your_eval_dataset:
    print(f"Evaluating Query : {query}")
    print('*'*50)
    response = query_engine_tool_2.query_engine.query(query)
    eval_result = evaluator.evaluate_response(query=query, response=response)
    print('Response', response)
    print('Evaluation Score:',str(eval_result.score))
    print('\n')

Evaluating Query : Ahmednagar - New Loni - Ashti belongs to which state?
**************************************************
Response Maharashtra
Evaluation Score: 1.0


Evaluating Query : What is New Bongaigaon- Kamakhya via Rangiya project length and anticipated cost?
**************************************************
Response The New Bongaigaon- Kamakhya via Rangiya project length is 176 km and the anticipated cost is 4,060 crore.
Evaluation Score: 1.0


Evaluating Query : Total Number of Gazetted Staff Trained During 2023-24?
**************************************************
Response The total number of Gazetted Staff trained during 2023-24 is 64,338.
Evaluation Score: 1.0


