# RAG Evaluation – Airport Surveillance Knowledge Base

## Objective
This notebook evaluates the retrieval accuracy of the RAG system
built for scenario-aware airport surveillance network orchestration.

The evaluation focuses on:
- Semantic retrieval quality
- Embedding model comparison
- Robustness to query variations


In [None]:
!pip install -q sentence-transformers chromadb


In [None]:
import json
import random
import re

import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.api.types import EmbeddingFunction


In [None]:
from chromadb.api.types import EmbeddingFunction

class SentenceTransformerEmbeddingFunction(EmbeddingFunction):
    """Embedding function for Chroma that delegates to a SentenceTransformer."""

    def __init__(self, model_name):
        """Instantiate the SentenceTransformer identified by ``model_name``."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def __call__(self, input):
        """Encode ``input`` with the wrapped model and return the result via ``.tolist()``."""
        vectors = self.model.encode(input)
        return vectors.tolist()


In [None]:
# Number of results requested per query (default `k` of retrieve_ids and the
# divisor used for precision in the evaluation loop).
TOP_K = 3
# Number of generated queries kept after shuffling.
NUM_QUERIES = 15
# Seed applied to both global random generators below.
RANDOM_SEED = 42

# Seed the `random` and NumPy global generators so the shuffled query sample
# is repeatable on a fresh kernel.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


In [None]:
SCENARIO_FILE = "scenarios.jsonl"

# JSON Lines input: one scenario object per line.
# - The encoding is given explicitly because the platform default varies.
# - Blank lines (e.g. a trailing newline at end of file) are skipped; passing
#   them to json.loads would raise JSONDecodeError.
with open(SCENARIO_FILE, "r", encoding="utf-8") as f:
    scenarios = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(scenarios)} scenarios")



Loaded 300 scenarios


In [None]:
scenarios[0].keys(), scenarios[0]["metadata"]


(dict_keys(['metadata', 'structured_facts', 'zone_analysis', 'sla_requirements', 'recommendations', 'text_summary']),
 {'scenario_id': 204,
  'intent': 'failure_recovery',
  'time_profile': 'night',
  'label': 'failure',
  'processed_at': '2026-01-05T17:17:20.954885'})

In [None]:
scenarios[0]["metadata"]



{'scenario_id': 204,
 'intent': 'failure_recovery',
 'time_profile': 'night',
 'label': 'failure',
 'processed_at': '2026-01-05T17:17:20.954885'}

In [None]:
from collections import Counter
import numpy as np

def extract_decision_vector(s):
    """Condense a scenario record into a four-field decision vector.

    Parameters
    ----------
    s : dict
        Scenario record. The function reads ``sla_requirements["max_latency_ms"]``
        and, for every entry of ``zone_analysis``, the ``configuration`` keys
        ``fps``, ``data_rate_mbps`` and ``processing_location``. Sections that
        are missing or ``None`` are treated as empty.

    Returns
    -------
    dict
        ``fps``        -- median of the per-zone fps values, truncated to int
        ``bandwidth``  -- sum of the per-zone data rates, rounded to 2 decimals
        ``processing`` -- most frequent per-zone processing location
        ``latency``    -- ``max_latency_ms`` from the SLA section
        A field is ``None`` when the scenario provides no value for it.
    """
    decision = {
        "fps": None,
        "bandwidth": None,
        "processing": None,
        "latency": None
    }

    # `or {}` also covers keys that are present but explicitly None.
    sla = s.get("sla_requirements") or {}
    decision["latency"] = sla.get("max_latency_ms")

    zone_analysis = s.get("zone_analysis") or {}

    fps_values = []
    bandwidth_values = []
    processing_locations = []

    for zone in zone_analysis.values():
        config = (zone or {}).get("configuration") or {}

        if config.get("fps") is not None:
            fps_values.append(config["fps"])

        if config.get("data_rate_mbps") is not None:
            bandwidth_values.append(config["data_rate_mbps"])

        # Truthiness check: empty strings are ignored as well as None.
        if config.get("processing_location"):
            processing_locations.append(config["processing_location"])

    # Median rather than mean, so a single outlier zone does not dominate.
    if fps_values:
        decision["fps"] = int(np.median(fps_values))

    # Bandwidth is the total across zones.
    if bandwidth_values:
        decision["bandwidth"] = round(sum(bandwidth_values), 2)

    # Dominant processing tier = most frequent location.
    if processing_locations:
        decision["processing"] = Counter(processing_locations).most_common(1)[0][0]

    return decision


In [None]:
def build_vector_db(model_name):
    """Create (or reset) a Chroma collection holding one document per scenario.

    Parameters
    ----------
    model_name : str
        SentenceTransformer model name; it is also embedded in the collection
        name, so each model gets its own collection.

    Returns
    -------
    The populated Chroma collection. Each record's id is the scenario id as a
    string, and its metadata carries the same id under ``scenario_id``.

    Notes
    -----
    Reads the module-level ``scenarios`` list. Documents are added in batches
    so the embedding model encodes many texts per call instead of one.
    """
    client = chromadb.Client()
    embedding_fn = SentenceTransformerEmbeddingFunction(model_name)

    collection = client.create_collection(
        name=f"rag_decision_eval_{model_name}",
        embedding_function=embedding_fn,
        get_or_create=True
    )

    # get_or_create may return a collection populated by an earlier run of this
    # cell; empty it so the contents always reflect the current `scenarios`.
    existing = collection.get()
    if existing and existing["ids"]:
        collection.delete(ids=existing["ids"])

    documents = []
    metadatas = []
    ids = []
    seen_ids = set()

    for s in scenarios:
        meta = s["metadata"]
        record_id = str(meta["scenario_id"])

        # Keep the first record per id; a batch containing the same id twice
        # would be rejected. NOTE(review): confirm duplicates cannot occur in
        # the data, in which case this guard is a no-op.
        if record_id in seen_ids:
            continue
        seen_ids.add(record_id)

        dv = extract_decision_vector(s)

        retrieval_text = (
            f"Operation: {meta.get('label','normal')}. "
            f"Time profile: {meta.get('time_profile','unknown')}. "
            f"Processing tier: {dv['processing']}. "
            f"Bandwidth: {dv['bandwidth']} Mbps. "
            f"FPS range: {dv['fps']}. "
            f"Latency: {dv['latency']} ms. "
            f"{s['text_summary'][:200]}"
        )

        documents.append(retrieval_text)
        metadatas.append({"scenario_id": meta["scenario_id"]})
        ids.append(record_id)

    # Bounded batches: large enough to amortise encoding, small enough to stay
    # under the store's per-call limit (presumably well above 256 -- verify for
    # the installed Chroma version). An empty input performs no add at all.
    batch_size = 256
    for start in range(0, len(ids), batch_size):
        stop = start + batch_size
        collection.add(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop]
        )

    return collection


In [None]:
def range_overlap(a, b, threshold=0.5):
    """Return True when range ``a`` covers at least ``threshold`` of range ``b``.

    Parameters
    ----------
    a, b : sequence of two numbers, or a falsy value
        ``(low, high)`` bounds. ``b`` is the reference range whose width is the
        denominator of the overlap ratio.
    threshold : float
        Minimum fraction of ``b`` that must be covered by ``a``.

    Returns
    -------
    bool
        False if either range is missing/empty. For a zero-width ``b`` (a
        single point) the result is whether that point lies inside ``a``.
    """
    if not a or not b:
        return False
    low = max(a[0], b[0])
    high = min(a[1], b[1])
    width = b[1] - b[0]
    if width == 0:
        # A point has no width to divide by; it is "covered" iff it falls
        # within a, which is exactly when the intersection is non-empty.
        return low <= high
    # Disjoint ranges give a negative numerator and therefore False.
    return (high - low) / width >= threshold


def close_enough(a, b, tol=0.2):
    """Return True when ``a`` is within a relative tolerance of reference ``b``.

    Parameters
    ----------
    a, b : number or None
        Value to test and reference value.
    tol : float
        Maximum allowed ``|a - b| / |b|``.

    Returns
    -------
    bool
        False if either value is None. With a zero reference the relative
        error is undefined, so only an exact match (``a == 0``) passes.
    """
    if a is None or b is None:
        return False
    if b == 0:
        return a == 0
    # Divide by |b|: a negative reference would otherwise make the ratio
    # negative and pass the check regardless of the distance.
    return abs(a - b) / abs(b) <= tol


In [None]:
def fps_compatible(a, b, tol=0.2):
    """Return True when fps value ``a`` is within ``tol`` (relative) of ``b``.

    Parameters
    ----------
    a, b : number or None
        Retrieved and reference fps values.
    tol : float
        Maximum allowed ``|a - b| / |b|``.

    Returns
    -------
    bool
        False if either value is None. A zero reference matches only a zero
        value, since the relative error is undefined there.
    """
    if a is None or b is None:
        return False
    if b == 0:
        return a == 0
    # |b| keeps the ratio non-negative whatever the sign of the reference.
    return abs(a - b) / abs(b) <= tol


In [None]:
def decision_compatible(retrieved_s, expected_s, min_matches=2):
    """Decide whether a retrieved scenario is compatible with the expected one.

    Both scenarios are reduced with ``extract_decision_vector`` and compared on
    four fields: fps, bandwidth, processing tier and latency.

    Parameters
    ----------
    retrieved_s, expected_s : dict
        Scenario records.
    min_matches : int
        Number of fields (out of four) that must agree. Defaults to 2.

    Returns
    -------
    bool
        True when at least ``min_matches`` fields agree.
    """
    r = extract_decision_vector(retrieved_s)
    e = extract_decision_vector(expected_s)

    field_matches = [
        fps_compatible(r["fps"], e["fps"]),
        close_enough(r["bandwidth"], e["bandwidth"]),
        # A tier that is missing on both sides is not evidence of agreement;
        # the numeric fields already treat missing values as a mismatch.
        r["processing"] is not None and r["processing"] == e["processing"],
        close_enough(r["latency"], e["latency"]),
    ]

    return sum(field_matches) >= min_matches


In [None]:
def generate_queries(s):
    """Build the natural-language test queries for one scenario.

    The first query is always the base phrase made from the scenario's label
    and time profile. One extra variant is appended for each decision-vector
    field (bandwidth, latency, fps -- in that order) that is not None.
    """
    meta = s["metadata"]
    vector = extract_decision_vector(s)

    label = meta.get('label','normal')
    time_profile = meta.get('time_profile','normal')
    base = f"{label} operation during {time_profile}"

    # (decision-vector field, phrase appended to the base query)
    variants = (
        ("bandwidth", "with limited bandwidth"),
        ("latency", "requiring low latency"),
        ("fps", "with reduced FPS"),
    )

    extras = [
        f"{base} {phrase}"
        for field, phrase in variants
        if vector[field] is not None
    ]
    return [base, *extras]


In [None]:
# Every (query text, originating scenario id) pair, in scenario order.
all_queries = [
    {"query": q, "source_id": s["metadata"]["scenario_id"]}
    for s in scenarios
    for q in generate_queries(s)
]

# Shuffle with a dedicated generator seeded from RANDOM_SEED rather than the
# global `random` state: re-running this cell then always selects the same
# queries instead of drawing a new sample each time.
query_rng = random.Random(RANDOM_SEED)
query_rng.shuffle(all_queries)

queries = all_queries[:NUM_QUERIES]

queries[:3]


[{'query': 'success operation during evening_rush with limited bandwidth',
  'source_id': 34},
 {'query': 'failure operation during early_morning requiring low latency',
  'source_id': 17},
 {'query': 'success operation during evening_rush with limited bandwidth',
  'source_id': 193}]

In [None]:
def retrieve_ids(collection, query, k=TOP_K):
    """Run one text query against ``collection`` and return the hit ids as ints.

    ``k`` is the number of results requested. Only the id list belonging to
    the single submitted query is returned.
    """
    response = collection.query(query_texts=[query], n_results=k)
    ids_for_query = response["ids"][0]
    return list(map(int, ids_for_query))


In [None]:
models = ["all-MiniLM-L6-v2", "all-mpnet-base-v2"]
results = []

# Index scenarios by id once instead of scanning the whole list for every
# lookup. Iterating in reverse makes the first record win if an id repeats.
scenario_by_id = {
    s["metadata"]["scenario_id"]: s
    for s in reversed(scenarios)
}

for model in models:
    collection = build_vector_db(model)

    for item in queries:
        retrieved_ids = retrieve_ids(collection, item["query"])
        expected = scenario_by_id[item["source_id"]]

        hits = 0  # retrieved scenarios judged compatible with the expected one
        rr = 0    # reciprocal rank of the first compatible result (0 if none)

        for rank, rid in enumerate(retrieved_ids, 1):
            if decision_compatible(scenario_by_id[rid], expected):
                hits += 1
                if rr == 0:
                    rr = 1 / rank

        # One row per (model, query). The "@3" in the column names matches the
        # TOP_K configured above; "recall@3" records whether at least one
        # compatible scenario was retrieved.
        results.append({
            "model": model,
            "precision@3": hits / TOP_K,
            "recall@3": int(hits > 0),
            "mrr": rr
        })


In [None]:
# One row per (model, query) evaluation record.
df = pd.DataFrame(results)

# Average each metric per model, then turn the model index back into a column.
mean_scores = df.groupby("model")[["precision@3", "recall@3", "mrr"]].agg("mean")
summary = mean_scores.reset_index()

summary


Unnamed: 0,model,precision@3,recall@3,mrr
0,all-MiniLM-L6-v2,0.533333,0.8,0.6
1,all-mpnet-base-v2,0.488889,0.866667,0.655556


In [None]:
# ================= INTERACTIVE RAG DEMO =================
# Builds the vector store used by the interactive loop in this cell.

print("üîß Initializing RAG system (this runs once)...")

# Rebinds the module-level `collection` (previously assigned inside the
# evaluation loop) to a store embedded with all-MiniLM-L6-v2; show_answer
# reads this global.
collection = build_vector_db("all-MiniLM-L6-v2")

print("‚úÖ RAG system ready.")
print("Type your question and press Enter.")
# The main loop additionally accepts 'stop' as an exit keyword.
print("Type 'exit' or 'quit' to stop.\n")


def show_answer(question, k=3):
    """Print the top ``k`` scenarios retrieved for ``question``.

    Queries the module-level ``collection``, resolves each returned id to its
    record in the module-level ``scenarios`` list, and prints the scenario's
    metadata, its decision vector and the first 300 characters of its summary.
    """
    response = collection.query(query_texts=[question], n_results=k)

    for position, raw_id in enumerate(response["ids"][0], start=1):
        wanted_id = int(raw_id)
        scenario = next(
            candidate for candidate in scenarios
            if candidate["metadata"]["scenario_id"] == wanted_id
        )

        info = scenario["metadata"]
        decision = extract_decision_vector(scenario)

        # Assemble the report first, then emit it line by line.
        report = [
            f"\n--- Result {position} ---",
            f"Scenario ID : {info['scenario_id']}",
            f"Type        : {info.get('label','normal')}",
            f"Time        : {info.get('time_profile','unknown')}",
            "\nRecommended Configuration:",
            f"  FPS range       : {decision['fps']}",
            f"  Bandwidth (Mbps): {decision['bandwidth']}",
            f"  Latency (ms)    : {decision['latency']}",
            f"  Processing tier : {decision['processing']}",
            "\nExplanation:",
            scenario["text_summary"][:300] + "...",
        ]
        for line in report:
            print(line)


# -------- MAIN LOOP --------
# Read questions until the user enters an exit keyword. The loop also ends
# cleanly when no further input can be read, so a non-interactive run of the
# notebook does not abort with a traceback here.
while True:
    try:
        question = input("\nAsk a question > ").strip()
    except (EOFError, KeyboardInterrupt, NotImplementedError):
        # EOFError: stdin closed; KeyboardInterrupt: cell interrupted;
        # NotImplementedError: presumably what the kernel raises when stdin
        # is not available (e.g. headless execution) -- TODO confirm.
        print("\nüõë Session ended.")
        break

    if question.lower() in ["exit", "quit", "stop"]:
        print("\nüõë Session ended.")
        break

    if not question:
        print("‚ö†Ô∏è Please enter a valid question.")
        continue

    print("\nüß† Question:")
    print(question)
    print("\nüîç Retrieving relevant scenarios...")

    show_answer(question)


üîß Initializing RAG system (this runs once)...
‚úÖ RAG system ready.
Type your question and press Enter.
Type 'exit' or 'quit' to stop.


üß† Question:
normal daytime operation with bandwidth constraints

üîç Retrieving relevant scenarios...

--- Result 1 ---
Scenario ID : 57
Type        : success
Time        : night

Recommended Configuration:
  FPS range       : 10
  Bandwidth (Mbps): 392.49
  Latency (ms)    : 41.0
  Processing tier : edge

Explanation:
Scenario 57 represents a testing network congestion limits deployment during night operations (8pm-11pm) with reduced traffic. The system operated with 123 active cameras achieving 249.5 Mbps total throughput with an average network delay of 0.269ms. Processing was distributed with 6 zones processed...

--- Result 2 ---
Scenario ID : 38
Type        : success
Time        : night

Recommended Configuration:
  FPS range       : 14
  Bandwidth (Mbps): 742.4
  Latency (ms)    : 42.0
  Processing tier : core

Explanation:
Scenario 38 r