### Python code for calculating nDCG

In [9]:
import math
import itertools
from typing import List

def retrieval_ndcg(gt: List[List[str]], pred: List[str]) -> float:
    """
    Calculates the nDCG (normalized Discounted Cumulative Gain) score
    based on the given ground truth (gt) and predicted (pred) lists.

    Args:
        gt: A list of lists of ground truth document IDs. Each inner list
            represents a set of relevant documents for a single query.
            For a single query evaluation, this is typically in the format:
            [[relevant_doc1, relevant_doc2, ...]]
        pred: An ordered list of document IDs predicted by the LLM agent.
              The order is important. For calculating nDCG@k, this list
              should contain the top k predictions.

    Returns:
        The nDCG score (float).
    """
    gt_sets = [frozenset(g) for g in gt]
    pred_set = set(pred)

    # Calculate relevance scores for each document in the prediction list. A document is considered relevant (1) if it is present in any of the gt_sets.
    relevance_scores = {
        pred_id: 1 if any(pred_id in gt_set for gt_set in gt_sets) else 0
        for pred_id in pred_set
    }
    print(f"relevance_scores : {relevance_scores}")

    # Calculate DCG. Penalizes documents ranked lower, so relevant documents ranked higher get better scores.
    dcg = sum(
        (2 ** relevance_scores.get(doc_id, 0) - 1) / math.log2(i + 2)
        for i, doc_id in enumerate(pred)
    )
    # Calculate IDCG. The DCG score for the best possible ranking
    len_flatten_gt = len(list(itertools.chain.from_iterable(gt)))
    len_pred = len(pred)

    # Create an ideal prediction relevance list: Fill with 1s up to the minimum of the number of relevant documents and the prediction list length, and the rest with 0s.
    ideal_pred_relevances = [1] * min(len_flatten_gt, len_pred) + [0] * max(0, len_pred - len_flatten_gt)
    idcg = sum(
        relevance / math.log2(i + 2)
        for i, relevance in enumerate(ideal_pred_relevances)
    )

    # Calculate nDCG (DCG / IDCG)
    ndcg = dcg / idcg if idcg > 0 else 0.0
    return ndcg


# Example Usage

# user query: "How to troubleshoot robot arm vibration issues?"

# 1. Define the Ground Truth document ID list
ground_truth_documents = [[
    "RG-VIB-001",
    "RG-MAINT-003"
    "RG-CALIB-002",
    "RG-ERR-005",  
    "PM-ROBOT-007" 
]]

# 2. LLM Agent's Retrieval Results (Top 5 document IDs) - 5 items for nDCG@5
retrieval_documents_top5 = [
    "RG-VIB-001",      # Highly relevant (correct)
    "GEN-MANUAL-002",  # General Equipment Manual (incorrect)
    "RG-MAINT-003",    # Moderately relevant (correct)
    "RG-CALIB-002",    # Moderately relevant (correct)
    "CTRL-UPDATE-001"  # Control System Update Log (incorrect)
]

# 3. Calculate nDCG@5 score
ndcg_score_at_5 = retrieval_ndcg(gt=ground_truth_documents, pred=retrieval_documents_top5)

print(f"nDCG@5 score: {ndcg_score_at_5:.4f}\n")

# example2
retrieval_documents_top5_better = [
    "RG-VIB-001",   # Highly relevant (correct)
    "RG-MAINT-003", # Highly relevant (correct)
    "PM-ROBOT-007", # Highly relevant (correct)
    "RG-CALIB-002", # Moderately relevant (correct)
    "RG-ERR-005"    # Less relevant (correct)
]
ndcg_score_at_5_better = retrieval_ndcg(gt=ground_truth_documents, pred=retrieval_documents_top5_better)
print("Better retrieval Example")
print(f"nDCG@5 score (better predictions): {ndcg_score_at_5_better:.4f}\n")

# example3
retrieval_documents_top5_worse = [
    "GEN-SAFETY-001",     # General Safety Rules (incorrect)
    "GEN-MANUAL-002",     # General Equipment Manual (incorrect)
    "CTRL-UPDATE-001",    # Control System Update Log (incorrect)
    "HR-POLICY-003",      # HR Policy (incorrect)
    "CAFETERIA-MENU-001"  # Cafeteria Menu (incorrect)
]
ndcg_score_at_5_worse = retrieval_ndcg(gt=ground_truth_documents, pred=retrieval_documents_top5_worse)
print("Very poor retrieval Example")
print(f"nDCG@5 score (very poor predictions): {ndcg_score_at_5_worse:.4f}\n")

# example4
ground_truth_documents_less = [[
    "RG-VIB-001",
    "RG-MAINT-003"
]]
retrieval_documents_for_less_gt = [
    "RG-VIB-001",
    "GEN-MANUAL-002",
    "RG-MAINT-003",
    "RG-CALIB-002", # This document is not in ground_truth_documents_less
    "CTRL-UPDATE-001"
]
ndcg_score_for_less_gt = retrieval_ndcg(gt=ground_truth_documents_less, pred=retrieval_documents_for_less_gt)
print("Less ground truth Example")
print(f"nDCG@5 score (2 ground truth docs): {ndcg_score_for_less_gt:.4f}")

relevance_scores : {'CTRL-UPDATE-001': 0, 'RG-VIB-001': 1, 'RG-MAINT-003': 0, 'RG-CALIB-002': 0, 'GEN-MANUAL-002': 0}
nDCG@5 score: 0.3904

relevance_scores : {'PM-ROBOT-007': 1, 'RG-ERR-005': 1, 'RG-VIB-001': 1, 'RG-MAINT-003': 0, 'RG-CALIB-002': 0}
Better retrieval Example
nDCG@5 score (better predictions): 0.7366

relevance_scores : {'CTRL-UPDATE-001': 0, 'HR-POLICY-003': 0, 'CAFETERIA-MENU-001': 0, 'GEN-SAFETY-001': 0, 'GEN-MANUAL-002': 0}
Very poor retrieval Example
nDCG@5 score (very poor predictions): 0.0000

relevance_scores : {'CTRL-UPDATE-001': 0, 'RG-VIB-001': 1, 'RG-MAINT-003': 1, 'RG-CALIB-002': 0, 'GEN-MANUAL-002': 0}
Less ground truth Example
nDCG@5 score (2 ground truth docs): 0.9197


### Evaluation Dataset Example (JSON format)

In [12]:
[
  {
    "query_id": "MQ001",
    "query_text": "What are the inspection points if abnormal noise occurs in the CNC milling machine spindle?",
    "ground_truth_document_ids": [
      "CNC-SPINDLE-NOISE-TRBL-V1.2.pdf", 
      "CNC-MAINT-CHECKLIST-SPINDLE.docx",
      "BEARING-SPEC-XYZ-SPINDLE.pdf",  
      "LUBRICATION-GUIDE-CNC.html" 
    ]
  },
  {
    "query_id": "MQ002",
    "query_text": "How do I set up the TCP for a robotic welder?",
    "ground_truth_document_ids": [
      "ROBOT-WELD-TCP-SETUP-MANUAL.pdf", 
      "ROBOT-CALIBRATION-PROC-V3.pdf",
      "WELDING-TIP-GUIDE.docx" 
    ]
  },
  {
    "query_id": "MQ003",
    "query_text": "The hydraulic system pressure suddenly dropped. What could be the cause?",
    "ground_truth_document_ids": [
      "HYDRAULIC-PRESS-LOW-PRESSURE-TRBL.pdf",
      "HYDRAULIC-SYSTEM-DIAGRAM-MODEL-A.png", 
      "PUMP-MAINT-HYD-001.docx",  
      "HYDRAULIC-FLUID-SPEC.pdf" 
    ]
  },
  {
    "query_id": "MQ004",
    "query_text": "The conveyor belt keeps shifting to one side. How do I adjust it?",
    "ground_truth_document_ids": [
      "CONVEYOR-BELT-ALIGNMENT-GUIDE.pdf", 
      "CONVEYOR-MAINT-COMMON-ISSUES.html", 
      "ROLLER-INSPECTION-CONVEYOR.docx"
    ]
  },
  {
    "query_id": "MQ005",
    "query_text": "Please tell me the PLC program backup procedure. The model is ABC-123.",
    "ground_truth_document_ids": [
      "PLC-BACKUP-PROC-ABC-123.pdf",
      "PLC-SOFTWARE-MANUAL-V2.html", 
      "DATA-MANAGEMENT-POLICY-FACTORY.docx"
    ]
  }
]

[{'query_id': 'MQ001',
  'query_text': 'What are the inspection points if abnormal noise occurs in the CNC milling machine spindle?',
  'ground_truth_document_ids': ['CNC-SPINDLE-NOISE-TRBL-V1.2.pdf',
   'CNC-MAINT-CHECKLIST-SPINDLE.docx',
   'BEARING-SPEC-XYZ-SPINDLE.pdf',
   'LUBRICATION-GUIDE-CNC.html']},
 {'query_id': 'MQ002',
  'query_text': 'How do I set up the TCP for a robotic welder?',
  'ground_truth_document_ids': ['ROBOT-WELD-TCP-SETUP-MANUAL.pdf',
   'ROBOT-CALIBRATION-PROC-V3.pdf',
   'WELDING-TIP-GUIDE.docx']},
 {'query_id': 'MQ003',
  'query_text': 'The hydraulic system pressure suddenly dropped. What could be the cause?',
  'ground_truth_document_ids': ['HYDRAULIC-PRESS-LOW-PRESSURE-TRBL.pdf',
   'HYDRAULIC-SYSTEM-DIAGRAM-MODEL-A.png',
   'PUMP-MAINT-HYD-001.docx',
   'HYDRAULIC-FLUID-SPEC.pdf']},
 {'query_id': 'MQ004',
  'query_text': 'The conveyor belt keeps shifting to one side. How do I adjust it?',
  'ground_truth_document_ids': ['CONVEYOR-BELT-ALIGNMENT-GUIDE.pdf