In [73]:
import os
from datetime import datetime
import json
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display
from agents import Agent, Runner, trace
from agents.extensions.models.litellm_model import LitellmModel
from pydantic import BaseModel, Field
from typing import Literal
from IPython.display import Markdown, display
from pathlib import Path

In [16]:
load_dotenv(override=True)

True

In [17]:
openai_api_key = os.getenv('OPENAI_API_KEY')
openrouter_api_key = os.getenv('OPENROUTER_API_KEY')

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")
    
if openrouter_api_key:
    print(f"OpenRouter API Key exists and begins {openrouter_api_key[:8]}")
else:
    print("OpenRouter API Key not set")

OpenAI API Key exists and begins sk-proj-
OpenRouter API Key exists and begins sk-or-v1


In [18]:
agent = Agent(name="Jokester", instructions="You are a joke teller", model="gpt-4.1-mini")

with trace("Telling a joke"):
    result = await Runner.run(agent, "Tell a joke about Autonomous AI Agents")
    print(result.final_output)

Why did the autonomous AI agent bring a ladder to work?

Because it wanted to reach the cloud on its own!


In [82]:
agent2 = Agent(
       name="Assistant",
       model=LitellmModel(model="openrouter/x-ai/grok-code-fast-1", api_key=openrouter_api_key),
       instructions="You are helpful."
   )

with trace("Telling a joke"):
    result = await Runner.run(agent, "Tell a joke about Autonomous AI Agents")
    print(result.final_output)

Why did the autonomous AI agent bring a suitcase to work?

Because it was ready to *take its tasks* to the next level!


In [49]:
class BugFinding(BaseModel):
    title: str = Field(description="Brief name for the bug")
    description: str = Field(description="Detailed explanation")
    severity: int = Field(description="Severity 1-10")
    file: str = Field(description="File path")
    relevant_lines: list[int] = Field(description="Line numbers")
    suggested_fix: str = Field(description="Recommended solution")

class VulnerabilityFinding(BaseModel):
    title: str = Field(description="Brief name for the vulnerability")
    description: str = Field(description="Detailed explanation")
    severity: int = Field(description="Severity 1-10")
    file: str = Field(description="File path")
    relevant_lines: list[int] = Field(description="Line numbers")
    suggested_fix: str = Field(description="Recommended solution")
    cve_reference: str | None = Field(default=None, description="CVE ID if applicable")

class BestPracticeFinding(BaseModel):
    title: str = Field(description="Brief name for the best practice violation")
    description: str = Field(description="Detailed explanation")
    severity: int = Field(description="Severity 1-10")
    file: str = Field(description="File path")
    relevant_lines: list[int] = Field(description="Line numbers")
    suggested_fix: str = Field(description="Recommended solution")
    
class TestGap(BaseModel):
    function_name: str = Field(description="Name of the function/method lacking tests")
    file: str = Field(description="File containing the untested code")
    lines: list[int] = Field(description="Line numbers of the untested code")
    missing_scenarios: list[str] = Field(description="Specific test cases that should be added, e.g., ['edge case: empty input', 'error handling: invalid type']")
    priority: int = Field(description="Priority 1-10, based on code criticality")
    suggested_test_approach: str = Field(description="How to test this (unit test, integration test, etc.)")
    
class CodeAnalyzerOutput(BaseModel):
    findings: list[BugFinding] = Field(description="Bugs and anti-patterns found")

class SecurityOutput(BaseModel):
    findings: list[VulnerabilityFinding] = Field(description="Security vulnerabilities found")

class BestPracticesOutput(BaseModel):
    findings: list[BestPracticeFinding] = Field(description="Style and best practice violations")

class TestCoverageOutput(BaseModel):
    findings: list[TestGap] = Field(description="Testing gaps found")

In [50]:

code_analyzer_instructions = """You are a Code Analyzer agent reviewing a pull request diff. 
Identify bugs and anti-patterns including: logic errors, unhandled edge cases, null/undefined access, type mismatches, off-by-one errors, resource leaks, infinite loops, incorrect error handling, code duplication, and overly complex functions. 
For each issue found, specify the exact lines, severity (1-10), and a clear fix."""

code_analyzer = Agent(
    name="Code Analyzer",
    instructions=code_analyzer_instructions,
    model="gpt-4.1-mini",
    output_type=CodeAnalyzerOutput
)

In [51]:
sample_diff = """
diff --git a/user_service.py b/user_service.py
index 1234567..abcdefg 100644
--- a/user_service.py
+++ b/user_service.py
@@ -10,6 +10,15 @@ class UserService:
     def __init__(self):
         self.users = []
     
+    def get_user_by_id(self, user_id):
+        for i in range(len(self.users) + 1):
+            if self.users[i]['id'] == user_id:
+                return self.users[i]
+        return None
+    
+    def calculate_discount(self, price, discount_percent):
+        return price - (price * discount_percent / 100)
+    
     def process_payment(self, amount):
         if amount > 0:
             return True
"""

result = await Runner.run(code_analyzer, sample_diff)
print(result.final_output)

findings=[BugFinding(title='Off-by-one error causing IndexError', description='In get_user_by_id method, the loop iterates from 0 to len(users) inclusive, leading to an IndexError when i equals len(users). The correct range should be range(len(users)) to avoid out-of-range access.', severity=7, file='user_service.py', relevant_lines=[12], suggested_fix='Change the loop to: for i in range(len(self.users)):')]


In [54]:
from IPython.display import display, Markdown

# Get the output
output = result.final_output

# Format it nicely
markdown_text = "# Code Analysis Report\n\n"

if not output.findings:
    markdown_text += "*No issues found.*"
else:
    for i, finding in enumerate(output.findings, 1):
        markdown_text += f"## Issue {i}: {finding.title}\n\n"
        markdown_text += f"**Severity:** {finding.severity}/10  \n"
        markdown_text += f"**File:** `{finding.file}`  \n"
        markdown_text += f"**Lines:** {', '.join(map(str, finding.relevant_lines))}  \n\n"
        markdown_text += f"**Description:**  \n{finding.description}\n\n"
        markdown_text += f"**Suggested Fix:**  \n{finding.suggested_fix}\n\n"
        markdown_text += "---\n\n"

display(Markdown(markdown_text))

# Code Analysis Report

## Issue 1: Off-by-one error causing IndexError

**Severity:** 7/10  
**File:** `user_service.py`  
**Lines:** 12  

**Description:**  
In get_user_by_id method, the loop iterates from 0 to len(users) inclusive, leading to an IndexError when i equals len(users). The correct range should be range(len(users)) to avoid out-of-range access.

**Suggested Fix:**  
Change the loop to: for i in range(len(self.users)):

---



In [56]:
security_instructions = """You are a Security agent reviewing a pull request diff. 
Identify security vulnerabilities including: SQL injection, command injection, XSS vulnerabilities, hardcoded secrets/credentials, insecure authentication, path traversal, insecure deserialization, and improper input validation.
For each issue found, specify the exact lines, severity (1-10), clear fix, and CVE reference if applicable."""

best_practices_instructions = """You are a Best Practices agent reviewing a pull request diff. 
Identify code quality issues including: unclear variable names, functions exceeding 50 lines, nested complexity over 3 levels, missing docstrings, inconsistent formatting, magic numbers without explanation, and violations of DRY principle.
For each issue found, specify the exact lines, severity (1-10), and a clear fix."""

test_coverage_instructions = """You are a Test Coverage agent reviewing a pull request diff. 
For each new or modified function, suggest test cases covering: normal input cases, edge cases (empty, null, boundary values), error conditions, and integration scenarios.
For each gap found, specify the function name, lines, missing test scenarios, priority (1-10), and whether unit or integration tests are needed."""

security_agent = Agent(
    name="Security Agent",
    instructions=security_instructions,
    model="gpt-4.1-mini",
    output_type=SecurityOutput
)

best_practices_agent = Agent(
    name="Best Practices Agent",
    instructions=best_practices_instructions,
    model="gpt-4.1-mini",
    output_type=BestPracticesOutput
)

test_coverage_agent = Agent(
    name="Test Coverage Agent",
    instructions=test_coverage_instructions,
    model="gpt-4.1-mini",
    output_type=TestCoverageOutput
)

In [57]:
sample_diff = """
diff --git a/auth_service.py b/auth_service.py
index 1234567..abcdefg 100644
--- a/auth_service.py
+++ b/auth_service.py
@@ -1,5 +1,30 @@
 import sqlite3
+import os
 
 class AuthService:
-    pass
+    def __init__(self):
+        self.db_password = "admin123"
+        self.connection = sqlite3.connect('users.db')
+    
+    def authenticate_user(self, username, password):
+        query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'"
+        cursor = self.connection.cursor()
+        cursor.execute(query)
+        result = cursor.fetchone()
+        if result:
+            return True
+        return False
+    
+    def get_user_file(self, filename):
+        filepath = "/var/data/" + filename
+        with open(filepath, 'r') as f:
+            return f.read()
+    
+    def calculate_price(self, base_price, discount):
+        for i in range(100):
+            for j in range(100):
+                for k in range(100):
+                    x = i * j * k
+        final = base_price - discount
+        return final
"""

import asyncio

async def run_all_agents(diff):
    results = await asyncio.gather(
        Runner.run(code_analyzer, diff),
        Runner.run(security_agent, diff),
        Runner.run(best_practices_agent, diff),
        Runner.run(test_coverage_agent, diff)
    )
    return results

results = await run_all_agents(sample_diff)

# Unpack results
code_result, security_result, best_practices_result, test_coverage_result = results

In [59]:
print("=== CODE ANALYZER ===")
for i, finding in enumerate(code_result.final_output.findings, 1):
    print(f"\n{i}. {finding.title}")
    # print(f"   Type: {finding.finding_type}")
    print(f"   File: {finding.file}")
    print(f"   Lines: {finding.relevant_lines}")
    print(f"   Severity: {finding.severity}")
    print(f"   Description: {finding.description}")
    print(f"   Fix: {finding.suggested_fix}")

print("\n=== SECURITY ===")
for i, finding in enumerate(security_result.final_output.findings, 1):
    print(f"\n{i}. {finding.title}")
    # print(f"   Type: {finding.finding_type}")
    print(f"   File: {finding.file}")
    print(f"   Lines: {finding.relevant_lines}")
    print(f"   Severity: {finding.severity}")
    print(f"   Fix: {finding.suggested_fix}")
    print(f"   CVE: {finding.cve_reference}")

print("\n=== BEST PRACTICES ===")
for i, finding in enumerate(best_practices_result.final_output.findings, 1):
    print(f"\n{i}. {finding.title}")
    # print(f"   Type: {finding.finding_type}")
    print(f"   File: {finding.file}")
    print(f"   Lines: {finding.relevant_lines}")
    print(f"   Severity: {finding.severity}")
    print(f"   Fix: {finding.suggested_fix}")

print("\n=== TEST COVERAGE ===")
for i, gap in enumerate(test_coverage_result.final_output.findings, 1):
    print(f"\n{i}. Function: {gap.function_name}")
    print(f"   File: {gap.file}")
    print(f"   Lines: {gap.lines}")
    print(f"   Priority: {gap.priority}")
    print(f"   Missing scenarios: {gap.missing_scenarios}")

=== CODE ANALYZER ===

1. SQL Injection Vulnerability
   File: auth_service.py
   Lines: [9, 10, 11]
   Severity: 9
   Description: The authenticate_user method constructs SQL queries by directly embedding user input without sanitization or parameterization, leading to SQL injection risks.
   Fix: Use parameterized queries with placeholders to safely include user inputs, for example:

query = "SELECT * FROM users WHERE username=? AND password=?"
cursor.execute(query, (username, password))

2. Potential Path Traversal in get_user_file
   File: auth_service.py
   Lines: [15, 16, 17]
   Severity: 8
   Description: The get_user_file method concatenates the filename parameter directly to a directory path without validation, which allows path traversal attacks if filename contains '../' sequences.
   Fix: Validate and sanitize the filename parameter to ensure it does not contain path traversal characters or use os.path methods to resolve and verify the final path is within the intended direc

In [62]:
code_result.final_output.findings[0].severity

9

In [None]:
def organize_findings(
    code_result,
    security_result, 
    best_practices_result,
    test_coverage_result
):
    """
    Organizes all findings by file.
    
    Returns:
        dict: {
            "file.py": [Finding, Finding, TestGap, ...]
        }
    """
    organized = {}
    for result in [code_result, security_result,  best_practices_result, test_coverage_result]:
        for finding in result.final_output.findings:
            file = finding.file
            if file not in organized:
                organized[file] = []
            organized[file].append(finding)
        
    return organized

# Usage:
organized = organize_findings(
    code_result,
    security_result,
    best_practices_result,
    test_coverage_result
)

In [64]:
organized

{'auth_service.py': [BugFinding(title='SQL Injection Vulnerability', description='The authenticate_user method constructs SQL queries by directly embedding user input without sanitization or parameterization, leading to SQL injection risks.', severity=9, file='auth_service.py', relevant_lines=[9, 10, 11], suggested_fix='Use parameterized queries with placeholders to safely include user inputs, for example:\n\nquery = "SELECT * FROM users WHERE username=? AND password=?"\ncursor.execute(query, (username, password))'),
  BugFinding(title='Potential Path Traversal in get_user_file', description="The get_user_file method concatenates the filename parameter directly to a directory path without validation, which allows path traversal attacks if filename contains '../' sequences.", severity=8, file='auth_service.py', relevant_lines=[15, 16, 17], suggested_fix='Validate and sanitize the filename parameter to ensure it does not contain path traversal characters or use os.path methods to resolve

In [66]:
aggregator_instructions = """You are a Code Review Aggregator. 
You receive findings from multiple code analysis agents (Code Analyzer, Security, Best Practices, Test Coverage).

Your job:
1. Identify duplicate findings (same file + overlapping lines flagged by multiple agents)
2. For duplicates, merge them and note which agents flagged the issue (increases confidence)
3. Organize findings by file
4. Within each file, prioritize by severity (highest first)
5. Write a professional, coherent executive summary at the top
6. Format the report clearly with sections for each file

Output a markdown-formatted report."""

aggregator = Agent(
    name="Aggregator",
    instructions=aggregator_instructions,
    model="gpt-4.1-mini",
)

with trace("Aggregator"):
    result = await Runner.run(aggregator, f"Aggregate these findings into a coherent report:\n\n{organized}")
    print(result.final_output)

# Code Analysis Aggregate Report for `auth_service.py`

---

## Executive Summary

The analysis of `auth_service.py` reveals several critical security vulnerabilities, significant best practice issues, inefficiencies, and notable test coverage gaps. The highest priority concerns cluster around **SQL Injection vulnerabilities** in the `authenticate_user` method and **Path Traversal risks** in the `get_user_file` method, both flagged by multiple agents with severity levels up to 10. There is also a critical issue of a **hardcoded database password** which poses a serious security risk.

Additionally, the `calculate_price` method contains inefficient and unnecessary nested loops that degrade performance without contributing to functionality. Best practices are further undermined by missing docstrings, reducing code maintainability and clarity.

Test coverage is insufficient, leaving important normal, edge, and error scenarios untested, especially for the methods handling authentication, f

In [67]:
async def aggregator_agent(organized):
    result = await Runner.run(aggregator, f"Aggregate these findings into a coherent report:\n\n{organized}")
    return result.final_output

In [None]:
async def review_code(diff: str, save_output: bool = True) -> str:
    """
    Complete code review pipeline.
    
    Args:
        diff: The code diff to review
        
    Returns:
        Markdown-formatted code review report
    """
    results = await run_all_agents(diff)
    code_result, security_result, best_practices_result, test_coverage_result = results    
    organized = organize_findings(code_result, security_result, best_practices_result, test_coverage_result)
    report = await aggregator_agent(organized)
    
    if save_output:
        os.makedirs("user-data", exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = f"user-data/code_review_{timestamp}.md"
        with open(filepath, "w") as f:
            f.write(report)
        print(f"Report saved to {filepath}")
    
    return report

report = await review_code(sample_diff)
print(report)


TEST COVERAGE AGENT RAW OUTPUT:

Function: authenticate_user
Missing scenarios: ['normal case: valid username and password', 'edge case: username or password is empty string', 'edge case: username or password is None', 'error handling: SQL injection attempt in username or password', 'error handling: database connection issue or query failure']

Function: get_user_file
Missing scenarios: ['normal case: file exists and is readable', 'error handling: file does not exist', 'error handling: file is not readable due to permissions', 'edge case: filename is empty or None', "security case: filename contains path traversal characters (e.g. '../')"]

Function: calculate_price
Missing scenarios: ['normal case: base_price and discount are positive numbers with discount less than base_price', 'edge case: discount is zero', 'edge case: discount equals base_price', 'edge case: discount greater than base_price', 'edge case: base_price or discount is zero', 'edge case: base_price or discount is negati

In [71]:
multi_file_diff = """
diff --git a/auth_service.py b/auth_service.py
index 1234567..abcdefg 100644
--- a/auth_service.py
+++ b/auth_service.py
@@ -1,5 +1,15 @@
 import sqlite3
 
 class AuthService:
-    pass
+    def __init__(self):
+        self.db_password = "admin123"
+        self.connection = sqlite3.connect('users.db')
+    
+    def login(self, username, password):
+        query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'"
+        cursor = self.connection.cursor()
+        cursor.execute(query)
+        return cursor.fetchone() is not None

diff --git a/payment_service.py b/payment_service.py
index 2345678..bcdefgh 100644
--- a/payment_service.py
+++ b/payment_service.py
@@ -1,3 +1,20 @@
+import os
+
 class PaymentService:
-    pass
+    def __init__(self):
+        self.api_key = "sk_live_abc123xyz"
+    
+    def process_payment(self, amount, user_input):
+        cmd = f"charge --amount {amount} --user {user_input}"
+        os.system(cmd)
+        return True
+    
+    def refund(self, amount):
+        if amount:
+            return amount
+        
diff --git a/logger.py b/logger.py
index 3456789..cdefghi 100644
--- a/logger.py
+++ b/logger.py
@@ -1,3 +1,12 @@
 class Logger:
-    pass
+    def log(self, msg):
+        f = open('/var/log/app.log', 'a')
+        f.write(msg)
+    
+    def get_logs(self, count):
+        logs = []
+        for i in range(count + 1):
+            logs.append(self.read_line(i))
+        return logs
"""


report = await review_code(multi_file_diff)
print(report)

Report saved to user-data/code_review_20251117_114129.md
# Code Analysis Aggregate Report

## Executive Summary

This report summarizes critical findings across three Python modules: `auth_service.py`, `logger.py`, and `payment_service.py`. The most severe security risks involve **SQL injection vulnerabilities** in `auth_service.py` and **command injection risks** in `payment_service.py`, both caused by unsanitized user inputs directly used in queries or shell commands. Both services also suffer from **hardcoded sensitive credentials** (database passwords and API keys), which pose significant exposure risks. 

The logging module (`logger.py`) exhibits multiple resource management issues, including file handles not being properly closed and an off-by-one bug in log retrieval, both of which degrade reliability and could cause runtime errors.

Across all files, missing or incomplete test coverage leaves critical functionality unverified, emphasizing an urgent need to build comprehensive u

## With real diffs from test-cases/

In [76]:
judge_instructions = """You are an evaluation judge for code review systems.
Your task: Compare expected findings (ground truth) against actual findings from a code review report.
For each expected finding, determine if it was detected in the actual report. A match means the report identified the same issue, even if worded differently.
Count the total number of distinct issues found in the actual report and how many expected findings were matched."""

class MatchedFinding(BaseModel):
    expected: str = Field(description="the expected finding text")
    matched: bool = Field(description="true if the expected finding is present, else false")
    actual_finding: str | None = Field(default=None, description="the matching text from report (if matched)")

class EvaluationResult(BaseModel):
    matched_findings: list[MatchedFinding]
    total_expected: int = Field(description="Total number of expected findings from ground truth")
    total_actual: int = Field(description="Total number of distinct issues found in the actual report")
    matches: int = Field(description="Number of expected findings that were successfully matched in the actual report.")

judge_agent = Agent(
    name="Evaluation Judge",
    instructions=judge_instructions,
    model="gpt-4.1-mini",
    output_type=EvaluationResult
)

async def evaluate_report(report: str, ground_truth_content: str) -> dict:
    """
    Evaluates a code review report against ground truth.
    Ground truth can be any format - the judge figures it out.
    """
    prompt = f"""
You are evaluating a code review system.

GROUND TRUTH (expected findings):
{ground_truth_content}

ACTUAL REPORT (what the system found):
{report}

Compare them and determine:
1. Which expected findings were detected in the actual report (even if worded differently)
2. Total number of distinct issues in the actual report
3. How many matches occurred

Be flexible - the ground truth format may vary.
"""
    
    result = await Runner.run(judge_agent, prompt)
    eval_result = result.final_output
    
    # Calculate metrics
    recall = eval_result.matches / eval_result.total_expected if eval_result.total_expected > 0 else 0
    precision = eval_result.matches / eval_result.total_actual if eval_result.total_actual > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        "recall": recall,
        "precision": precision,
        "f1": f1,
        "matches": eval_result.matches,
        "total_expected": eval_result.total_expected,
        "total_actual": eval_result.total_actual,
        "details": eval_result.matched_findings
    }

In [80]:
# Run test 1
test_dir = Path("test-cases")
diff_file = test_dir / "01_sql_injection.diff"

# Load files
diff_content = diff_file.read_text()
expected_file = diff_file.with_name("01_sql_injection_expected.json")
ground_truth_content = expected_file.read_text()

print("="*60)
print("TEST 1: SQL INJECTION")
print("="*60)

# Run review WITH saving
report = await review_code(diff_content, save_output=True)

print("\n" + "="*60)
print("GENERATED REPORT:")
print("="*60)
print(report)

# Evaluate
eval_result = await evaluate_report(report, ground_truth_content)

print("\n" + "="*60)
print("JUDGE OUTPUT:")
print("="*60)
print(f"total_expected: {eval_result['total_expected']}")
print(f"total_actual: {eval_result['total_actual']}")
print(f"matches: {eval_result['matches']}")
print(f"\nmatched_findings:")
for mf in eval_result['details']:
    print(f"\n  Expected: {mf.expected}")
    print(f"  Matched: {mf.matched}")
    if mf.actual_finding:
        print(f"  Actual: {mf.actual_finding[:100]}...")  # truncate if long

print("\n" + "="*60)
print("CALCULATED METRICS:")
print("="*60)
print(f"Recall: {eval_result['recall']:.2f}")
print(f"Precision: {eval_result['precision']:.2f}")
print(f"F1 Score: {eval_result['f1']:.2f}")

TEST 1: SQL INJECTION

TEST COVERAGE AGENT RAW OUTPUT:

Function: authenticate
Missing scenarios: ['normal input: valid username and password', 'normal input: invalid username or password', 'edge case: empty username and/or password', 'edge case: extremely long username or password', 'error handling: SQL injection attempt in username or password', 'error handling: database connection failure', 'integration: verify authentication updates session or security context as expected']

Report saved to user-data/code_review_20251117_153336.md

GENERATED REPORT:
# Code Analysis Summary Report

This report aggregates findings from multiple code analysis agents regarding the `user_auth.py` file. The core issues identified relate to a severe SQL Injection vulnerability in the `authenticate` method due to unsafe construction of SQL queries using unsanitized user inputs. This critical security flaw is consistently highlighted across Bug, Vulnerability, and Best Practice analyses, reinforcing its imp

In [74]:
async def run_test_suite():
    test_dir = Path("test-cases")
    diff_files = sorted(test_dir.glob("*.diff"))
    
    results = []
    
    for diff_file in diff_files:
        # Load files
        diff_content = diff_file.read_text()
        expected_file = diff_file.with_name(diff_file.stem + "_expected.json")
        ground_truth_content = expected_file.read_text()
        
        # Extract test name from filename
        test_name = diff_file.stem.replace("_", " ").title()
        
        print(f"\n{'='*60}")
        print(f"Testing: {test_name}")
        print('='*60)
        
        # Run review
        report = await review_code(diff_content, save_output=True)
        
        # Evaluate
        eval_result = await evaluate_report(report, ground_truth_content)
        
        print(f"Recall: {eval_result['recall']:.2f}")
        print(f"Precision: {eval_result['precision']:.2f}")
        print(f"F1 Score: {eval_result['f1']:.2f}")
        print(f"Matches: {eval_result['matches']}/{eval_result['total_expected']}")
        
        results.append({
            "test": test_name,
            **eval_result
        })
    
    # Summary
    print(f"\n{'='*60}")
    print("SUMMARY")
    print('='*60)
    avg_recall = sum(r['recall'] for r in results) / len(results)
    avg_precision = sum(r['precision'] for r in results) / len(results)
    avg_f1 = sum(r['f1'] for r in results) / len(results)
    print(f"Average Recall: {avg_recall:.2f}")
    print(f"Average Precision: {avg_precision:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")
    
    return results

# Run it
results = await run_test_suite()


Testing: 01 Sql Injection
Recall: 1.00
Precision: 1.33
F1 Score: 1.14
Matches: 4/4

Testing: 02 Logic Bug
Recall: 1.00
Precision: 1.20
F1 Score: 1.09
Matches: 6/6

Testing: 03 Code Quality
Recall: 0.88
Precision: 0.88
F1 Score: 0.88
Matches: 7/8

Testing: 04 Multi File Security
Recall: 1.00
Precision: 1.00
F1 Score: 1.00
Matches: 12/12

Testing: 05 Multi File Mixed
Recall: 0.92
Precision: 0.71
F1 Score: 0.80
Matches: 12/13

SUMMARY
Average Recall: 0.96
Average Precision: 1.02
Average F1 Score: 0.98
