In [2]:
import os
from datetime import datetime
import json
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display
from agents import Agent, Runner, trace
from agents.extensions.models.litellm_model import LitellmModel
from pydantic import BaseModel, Field
from typing import Literal
from IPython.display import Markdown, display
from pathlib import Path
import asyncio
from typing import Optional, List
import chromadb

In [3]:
# Load variables from a local .env file into the process environment.
# override=True asks python-dotenv to let .env values replace variables that are already set.
load_dotenv(override=True)

True

In [4]:
def _describe_key(provider, key):
    """Return the one-line status message for a provider's API key.

    Only the first 8 characters of a present key are included in the message.
    """
    if not key:
        return f"{provider} API Key not set"
    return f"{provider} API Key exists and begins {key[:8]}"


# Read both credentials from the environment (None when the variable is unset).
openai_api_key = os.getenv('OPENAI_API_KEY')
openrouter_api_key = os.getenv('OPENROUTER_API_KEY')

print(_describe_key("OpenAI", openai_api_key))
print(_describe_key("OpenRouter", openrouter_api_key))

OpenAI API Key exists and begins sk-proj-
OpenRouter API Key exists and begins sk-or-v1


In [5]:
# Grok models addressed through OpenRouter, wrapped in LitellmModel so they can be passed as an Agent's `model`.
# Both use the OpenRouter key read from the environment.
# NOTE(review): grok_code_fast_1 is not referenced anywhere in the visible part of this notebook — confirm it is still needed.
grok_code_fast_1=LitellmModel(model="openrouter/x-ai/grok-code-fast-1", api_key=openrouter_api_key)
grok_4_fast=LitellmModel(model="openrouter/x-ai/grok-4-fast", api_key=openrouter_api_key)

In [6]:
# One bug / anti-pattern reported by the Code Analyzer agent (element type of CodeAnalyzerOutput.findings).
# The `file` attribute is what organize_findings() groups by.
class BugFinding(BaseModel):
    title: str = Field(description="Brief name for the bug")
    description: str = Field(description="Detailed explanation")
    # Documented as 1-10 in the description only; no range validation is declared on the field.
    severity: int = Field(description="Severity 1-10")
    file: str = Field(description="File path")
    relevant_lines: list[int] = Field(description="Line numbers")
    suggested_fix: str = Field(description="Recommended solution")

# One security vulnerability reported by a security agent (element type of SecurityOutput.findings).
# Same fields as BugFinding plus an optional CVE reference.
class VulnerabilityFinding(BaseModel):
    title: str = Field(description="Brief name for the vulnerability")
    description: str = Field(description="Detailed explanation")
    # Documented as 1-10 in the description only; no range validation is declared on the field.
    severity: int = Field(description="Severity 1-10")
    file: str = Field(description="File path")
    relevant_lines: list[int] = Field(description="Line numbers")
    suggested_fix: str = Field(description="Recommended solution")
    # Optional: defaults to None when no CVE applies.
    cve_reference: str | None = Field(default=None, description="CVE ID if applicable")

# One style / best-practice violation (element type of BestPracticesOutput.findings).
# Field set is identical to BugFinding; only the title description differs.
class BestPracticeFinding(BaseModel):
    title: str = Field(description="Brief name for the best practice violation")
    description: str = Field(description="Detailed explanation")
    # Documented as 1-10 in the description only; no range validation is declared on the field.
    severity: int = Field(description="Severity 1-10")
    file: str = Field(description="File path")
    relevant_lines: list[int] = Field(description="Line numbers")
    suggested_fix: str = Field(description="Recommended solution")
    
# One missing-test report for a function (element type of TestCoverageOutput.findings).
# Unlike the other finding models it uses `lines`/`priority` rather than `relevant_lines`/`severity`,
# but it still exposes `file`, which organize_findings() groups by.
class TestGap(BaseModel):
    function_name: str = Field(description="Name of the function/method lacking tests")
    file: str = Field(description="File containing the untested code")
    lines: list[int] = Field(description="Line numbers of the untested code")
    missing_scenarios: list[str] = Field(description="Specific test cases that should be added, e.g., ['edge case: empty input', 'error handling: invalid type']")
    # Documented as 1-10 in the description only; no range validation is declared on the field.
    priority: int = Field(description="Priority 1-10, based on code criticality")
    suggested_test_approach: str = Field(description="How to test this (unit test, integration test, etc.)")
    
# Structured output of the Code Analyzer agent: a list of BugFinding records.
class CodeAnalyzerOutput(BaseModel):
    findings: list[BugFinding] = Field(description="Bugs and anti-patterns found")

# Structured output of the security agents (plain and RAG-enhanced): a list of VulnerabilityFinding records.
class SecurityOutput(BaseModel):
    findings: list[VulnerabilityFinding] = Field(description="Security vulnerabilities found")

# Structured output of the Best Practices agent: a list of BestPracticeFinding records.
class BestPracticesOutput(BaseModel):
    findings: list[BestPracticeFinding] = Field(description="Style and best practice violations")

# Structured output of the Test Coverage agent: a list of TestGap records.
class TestCoverageOutput(BaseModel):
    findings: list[TestGap] = Field(description="Testing gaps found")

In [19]:

# System prompt for the Code Analyzer agent (bugs and anti-patterns).
code_analyzer_instructions = """You are a Code Analyzer agent reviewing a pull request diff. 
Identify bugs and anti-patterns including: logic errors, unhandled edge cases, null/undefined access, type mismatches, off-by-one errors, resource leaks (unclosed files/cursors/connections), infinite loops, missing error handling (no try-except blocks), code duplication, and overly complex functions. 
For each issue found, specify the exact lines, severity (1-10), and a clear fix."""

# System prompt for the Security agent; run_all_agents() also reuses it as the base of its RAG-enhanced prompt.
security_instructions = """You are a Security agent reviewing a pull request diff. 
Identify security vulnerabilities including: SQL injection, command injection, XSS vulnerabilities, hardcoded secrets/credentials, insecure authentication, path traversal, insecure deserialization, improper input validation, and missing error handling that could expose sensitive information.
For each issue found, specify the exact lines, severity (1-10), clear fix, and CVE reference if applicable."""

# System prompt for the Best Practices agent (code quality and style).
best_practices_instructions = """You are a Best Practices agent reviewing a pull request diff. 
Identify code quality issues including: unclear variable names, functions exceeding 50 lines, nested complexity over 3 levels, missing docstrings, inconsistent formatting, magic numbers without explanation, violations of DRY principle, unclosed resources (files, database cursors, connections), and missing try-except blocks for error-prone operations.
For each issue found, specify the exact lines, severity (1-10), and a clear fix."""

# System prompt for the Test Coverage agent (missing test scenarios per function).
test_coverage_instructions = """You are a Test Coverage agent reviewing a pull request diff. 
For each new or modified function, suggest test cases covering: normal input cases, edge cases (empty, null, boundary values), error conditions (exceptions, failures, timeouts), and integration scenarios.
For each gap found, specify the function name, lines, missing test scenarios, priority (1-10), and whether unit or integration tests are needed."""

def _make_reviewer(name, instructions, output_type):
    """Build one reviewer Agent on gpt-4.1-mini with a structured output type."""
    return Agent(
        name=name,
        instructions=instructions,
        model="gpt-4.1-mini",
        output_type=output_type,
    )


# The four specialist reviewers; each pairs a prompt with its structured output model.
code_analyzer = _make_reviewer("Code Analyzer", code_analyzer_instructions, CodeAnalyzerOutput)
security_agent = _make_reviewer("Security Agent", security_instructions, SecurityOutput)
best_practices_agent = _make_reviewer("Best Practices Agent", best_practices_instructions, BestPracticesOutput)
test_coverage_agent = _make_reviewer("Test Coverage Agent", test_coverage_instructions, TestCoverageOutput)

In [20]:
def get_relevant_security_patterns(code_diff: str, n_results: int = 5) -> str:
    """Retrieve security patterns relevant to a diff from the local Chroma store.

    Opens the persistent Chroma database at ``./chroma_db``, queries the
    ``security_patterns`` collection with the diff as the query text, and
    joins the returned documents with blank lines.

    Args:
        code_diff: Diff text used as the similarity query.
        n_results: Maximum number of documents to request from the collection.

    Returns:
        The retrieved documents separated by blank lines, or "" when the diff
        is empty/whitespace-only or the query yields no documents.

    Note:
        Exceptions raised by chromadb (for example when the collection is
        missing) are not caught here and propagate to the caller.
    """
    # Nothing to search for: skip the database round-trip instead of
    # retrieving arbitrary patterns for an empty query.
    if not code_diff or not code_diff.strip():
        return ""

    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    security_collection = chroma_client.get_collection(name="security_patterns")
    results = security_collection.query(query_texts=[code_diff], n_results=n_results)

    # `documents` holds one list per query text; tolerate it being absent,
    # None, or empty rather than failing on the [0] lookup.
    document_groups = results.get("documents") or []
    documents = document_groups[0] if document_groups else []
    return "\n\n".join(doc for doc in documents if doc)


async def run_security_agent_with_rag(code_diff: str):
    """
    Review a diff with a security agent whose prompt embeds retrieved patterns.

    Looks up five security patterns for the diff, builds a one-off
    "Security Agent (RAG)" with those patterns inlined in its instructions,
    and returns the run result of that agent on the diff.
    """
    retrieved_patterns = get_relevant_security_patterns(code_diff, n_results=5)

    # Prompt = role line, retrieved patterns, then the vulnerability checklist.
    rag_prompt = f"""You are a Security agent reviewing a pull request diff.

{retrieved_patterns}

Based on these patterns and your expertise, identify security vulnerabilities including: SQL injection, command injection, XSS vulnerabilities, hardcoded secrets/credentials, insecure authentication, path traversal, insecure deserialization, improper input validation, and missing error handling that could expose sensitive information.

For each issue found, specify the exact lines, severity (1-10), clear fix, and CVE reference if applicable."""

    rag_agent = Agent(
        output_type=SecurityOutput,
        model="gpt-4.1-mini",
        instructions=rag_prompt,
        name="Security Agent (RAG)",
    )

    return await Runner.run(rag_agent, code_diff)

In [12]:
subtle_test = """
diff --git a/api.py b/api.py
+import pickle
+
+def load_user_data(data_file):
+    with open(data_file, 'rb') as f:
+        return pickle.load(f)
+
+def api_endpoint(request):
+    filename = request.get('file')
+    user_data = load_user_data(filename)
+    return user_data
"""

print("="*60)
print("TESTING SUBTLE VULNERABILITY")
print("="*60)

# With RAG
result_rag = await run_security_agent_with_rag(subtle_test)
print("\nWITH RAG:")
print(f"Found {len(result_rag.final_output.findings)} issues")
for f in result_rag.final_output.findings:
    print(f"- {f.title} (severity {f.severity})")

# Without RAG  
result_no_rag = await Runner.run(security_agent, subtle_test)
print("\nWITHOUT RAG:")
print(f"Found {len(result_no_rag.final_output.findings)} issues")
for f in result_no_rag.final_output.findings:
    print(f"- {f.title} (severity {f.severity})")

TESTING SUBTLE VULNERABILITY

WITH RAG:
Found 2 issues
- Insecure Deserialization via pickle.load (severity 9)
- Path Traversal via Unvalidated Filename Parameter (severity 8)

WITHOUT RAG:
Found 1 issues
- Insecure Deserialization Using pickle.load (severity 9)


In [21]:
# async def run_all_agents(diff):
#     results = await asyncio.gather(
#         Runner.run(code_analyzer, diff),
#         Runner.run(security_agent, diff),
#         Runner.run(best_practices_agent, diff),
#         Runner.run(test_coverage_agent, diff)
#     )
#     return results

async def run_all_agents(diff):
    """
    Run the four reviewer agents concurrently on one diff.

    The security reviewer is rebuilt per call so that its instructions carry
    the security patterns retrieved for this diff. Results come back in the
    order: code analyzer, security, best practices, test coverage.
    """
    retrieved = get_relevant_security_patterns(diff, n_results=5)

    # Base security prompt followed by the retrieved patterns.
    rag_instructions = f"{security_instructions}\n\nRELEVANT SECURITY PATTERNS TO CHECK:\n{retrieved}"

    reviewers = (
        code_analyzer,
        Agent(
            name="Security Agent",
            instructions=rag_instructions,
            model="gpt-4.1-mini",
            output_type=SecurityOutput,
        ),  # RAG-enhanced replacement for the module-level security agent
        best_practices_agent,
        test_coverage_agent,
    )

    # gather() preserves the order of `reviewers` in its result list.
    return await asyncio.gather(*(Runner.run(reviewer, diff) for reviewer in reviewers))

In [22]:
def organize_findings(
    code_result,
    security_result,
    best_practices_result,
    test_coverage_result
):
    """
    Group every agent's findings under the file they refer to.

    Each argument is a run result exposing ``final_output.findings``; every
    finding must have a ``file`` attribute. Files appear in first-seen order
    and findings keep the agent order code -> security -> best practices ->
    test coverage.

    Returns:
        dict: {"file.py": [finding, finding, ...], ...}
    """
    by_file = {}
    agent_results = (code_result, security_result, best_practices_result, test_coverage_result)
    for agent_result in agent_results:
        for item in agent_result.final_output.findings:
            # setdefault creates the bucket the first time a file is seen
            by_file.setdefault(item.file, []).append(item)
    return by_file

In [23]:
# System prompt for the aggregator agent.
# This is a plain string (not an f-string and never .format()-ed), so it must not
# contain template placeholders: the findings themselves are sent in the user
# message built by aggregator_agent().
aggregator_instructions = """You are a Code Review Aggregator tasked with creating a deduplicated summary report. Your goal is to merge duplicate findings from multiple agents into a clear, actionable report.

You will be provided with findings from multiple agents in the user message, organized by file path.

When creating the report, follow these guidelines:

1. IDENTIFY DUPLICATES: Group findings that describe the same root issue
   - Look for overlapping line numbers and similar descriptions
   - When multiple agents flag the same problem, merge into one issue
   - Use the HIGHEST severity when merging

2. PRESERVE INFORMATION: 
   - Keep agent names: Code Analyzer, Security, Best Practices, Test Coverage
   - Include file paths and line numbers
   - Maintain the most comprehensive description from merged findings

3. CATEGORIZE each issue as:
   - Bug: Logic errors, crashes, incorrect behavior  
   - Security: Vulnerabilities, unsafe code
   - Performance: Inefficient algorithms, resource issues
   - Style: Naming, formatting, documentation
   - Test Gap: Missing test coverage

4. CREATE SUMMARY TABLE with these columns:
   | Issue | File | Lines | Severity | Category | Fix | Found By |

5. SEPARATE CONCERNS: Test coverage gaps are distinct from code issues

Present your report in this format:

# Code Review Report

## Executive Summary
[2-3 sentences highlighting the most critical findings]

## Summary of Actions
| Issue | File | Lines | Severity | Category | Fix | Found By |
|-------|------|-------|----------|----------|-----|----------|
[One row per unique issue]

**Total Distinct Issues: [count]**

CRITICAL REQUIREMENT: 
- EVERY finding from EVERY agent must appear in the summary table
- This includes ALL test coverage gaps reported by the Test Coverage agent
- Test gaps should be listed as separate rows (one per function needing tests)
- Do NOT omit any findings, especially test coverage gaps
- The Total Distinct Issues count must match the number of rows in the table."""

# Aggregator agent: merges the reviewers' findings into one report.
# No output_type is set here (unlike the reviewer agents); aggregator_agent() returns its final_output as the report.
# Runs on the OpenRouter-hosted grok_4_fast model rather than gpt-4.1-mini.
aggregator = Agent(
    name="Aggregator",
    instructions=aggregator_instructions,
    model=grok_4_fast,
)

In [24]:
async def aggregator_agent(organized):
    """Send the organized findings to the aggregator agent and return its final output.

    `organized` is interpolated into the user message with its default string
    form, so whatever is passed in is what the aggregator sees.
    """
    user_message = f"Aggregate these findings into a structured report:\n\n{organized}"
    run_result = await Runner.run(aggregator, user_message)
    return run_result.final_output

In [25]:
# async def review_code(diff: str, save_output: bool = True) -> str:
#     """
#     Complete code review pipeline.
    
#     Args:
#         diff: The code diff to review
        
#     Returns:
#         Markdown-formatted code review report
#     """
#     results = await run_all_agents(diff)
#     code_result, security_result, best_practices_result, test_coverage_result = results    
#     organized = organize_findings(code_result, security_result, best_practices_result, test_coverage_result)
#     report = await aggregator_agent(organized)
    
#     if save_output:
#         os.makedirs("user-data", exist_ok=True)
#         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
#         filepath = f"user-data/code_review_{timestamp}.md"
#         with open(filepath, "w") as f:
#             f.write(report)
#         print(f"Report saved to {filepath}")
    
#     return report


async def review_code(diff: str, save_output: bool = True) -> str:
    """
    Complete code review pipeline.

    Runs all reviewer agents on the diff, groups their findings by file,
    asks the aggregator for a report, prints that report, and optionally
    writes it to disk.

    Args:
        diff: The code diff to review
        save_output: When True (default), write the report to
            ``user-data/code_review_<YYYYmmdd_HHMMSS>.md`` (the directory is
            created if needed). When False, nothing is written.

    Returns:
        Markdown-formatted code review report
    """
    # run_all_agents returns results in this fixed order.
    code_result, security_result, best_practices_result, test_coverage_result = await run_all_agents(diff)

    organized = organize_findings(code_result, security_result, best_practices_result, test_coverage_result)

    print("\n" + "="*60)
    print("CALLING AGGREGATOR...")
    print("="*60)

    report = await aggregator_agent(organized)

    print("\n" + "="*60)
    print("AGGREGATOR OUTPUT:")
    print("="*60)
    print(report)
    print("="*60 + "\n")

    if save_output:
        os.makedirs("user-data", exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = f"user-data/code_review_{timestamp}.md"
        # Explicit UTF-8: the report is model-generated text and may contain
        # non-ASCII characters; the platform default encoding (e.g. cp1252)
        # could fail or corrupt them.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(report)
        print(f"Report saved to {filepath}")

    return report

In [38]:
# Review the SQL-injection sample diff. The path is relative to the notebook's working directory.
diff_file = Path("test-cases/01_sql_injection.diff")
diff_content = diff_file.read_text()

# save_output=False: the report is printed and returned but not written to user-data/.
report = await review_code(diff_content, save_output=False)


CODE ANALYZER RAW OUTPUT:

Title: SQL Injection Vulnerability
Severity: 9
Lines: [8, 9, 10]
Description: The authenticate method constructs SQL queries by directly concatenating user inputs (username and password) into the query string. This allows an att...

SECURITY AGENT RAW OUTPUT:

Title: SQL Injection in authenticate method
Severity: 9
Lines: [7, 8, 9, 10]
Description: The authenticate method constructs an SQL query by directly concatenating user inputs (username and password) into the query string. This allows an at...

BEST PRACTICES AGENT RAW OUTPUT:

Title: SQL Injection Vulnerability
Severity: 9
Lines: [7, 8, 9]
Description: The authenticate method builds the SQL query by concatenating user inputs directly into the query string. This approach is vulnerable to SQL injection...

Title: Missing Cursor Closure
Severity: 5
Lines: [8, 9]
Description: The database cursor created in the authenticate method is not explicitly closed, which can lead to resource leaks....

Title: No Ex

In [26]:
# System prompt for the evaluation judge used by evaluate_report().
# It asks for one-to-one matching between expected and actual findings and tells the judge
# to read total_actual from the "Total Distinct Issues: X" line that the aggregator prompt requires.
judge_instructions = """You are an evaluation judge for code review systems comparing expected findings (ground truth) against actual findings.

CRITICAL MATCHING RULES:
1. Each actual finding can match AT MOST ONE expected finding
2. Each expected finding can match AT MOST ONE actual finding
3. Once an actual finding is matched, it CANNOT be used again
4. Only match within same category (bugs ≠ test gaps)

PROCESS:
1. Count total_actual from "Total Distinct Issues: X" in report
2. For EACH expected finding:
   - Find the BEST matching actual finding that hasn't been used yet
   - If good match exists: mark as matched=True, record which actual finding
   - If no match: mark as matched=False
   - NEVER reuse an actual finding for multiple expected findings

A match means the same type of issue was identified, even if worded differently.
"""

# Judge verdict for a single expected finding (element type of EvaluationResult.matched_findings).
class MatchedFinding(BaseModel):
    expected: str = Field(description="the expected finding text")
    matched: bool = Field(description="true if the expected finding is present, else false")
    # Optional: defaults to None; EvaluationResult uses this text to detect a report finding cited twice.
    actual_finding: Optional[str] = Field(default=None, description="the matching text from report (if matched)")

# Structured output of the evaluation judge: per-expected-finding verdicts plus the two totals
# that evaluate_report() uses as denominators for recall and precision.
class EvaluationResult(BaseModel):
    matched_findings: list[MatchedFinding]
    total_expected: int = Field(description="Total number of expected findings from ground truth")
    total_actual: int = Field(description="Count of distinct issues in the report's summary section")
    # The match count is deliberately not a field: it is derived from matched_findings.

    def model_post_init(self, __context):
        """Print sanity-check diagnostics about the judge's verdict.

        Diagnostics only: nothing is raised and no field is modified. Three
        conditions are reported:

        * the same actual finding text is cited by more than one match;
        * a finding is marked matched but carries no actual_finding text;
        * there are more matches than actual findings in the report.
        """
        matched = [mf for mf in self.matched_findings if mf.matched]
        matches = len(matched)

        # Only matches that cite report text can be checked for reuse.
        cited = [mf.actual_finding for mf in matched if mf.actual_finding]
        unique_actuals = len(set(cited))

        # Compare cited matches (not all matches) against the unique count, so
        # a match without text is not misreported as a duplicate.
        if len(cited) > unique_actuals:
            print(f"ERROR: {len(cited)} matches but only {unique_actuals} unique actual findings used!")
            print("The judge matched the same actual finding multiple times.")

        uncited = matches - len(cited)
        if uncited > 0:
            print(f"WARNING: {uncited} matched finding(s) have no actual_finding text")

        if matches > self.total_actual:
            print(f"WARNING: Matches ({matches}) > Total Actual ({self.total_actual})")



async def evaluate_report(report: str, ground_truth_content: str) -> dict:
    """
    Score a review report against ground truth with an LLM judge.

    A judge agent compares the expected findings with the report and returns
    an EvaluationResult. The number of matches is counted here from the
    per-finding verdicts; the two totals are taken from the judge's output.

    Returns:
        dict with keys "recall", "precision", "f1", "matches",
        "total_expected", "total_actual" and "details" (the list of
        per-finding verdicts). A metric is 0 when its denominator is not
        positive.
    """
    judge = Agent(
        name="Evaluation Judge",
        instructions=judge_instructions,
        model="gpt-5.1",
        output_type=EvaluationResult
    )

    judge_input = f"""
GROUND TRUTH (expected findings):
{ground_truth_content}

ACTUAL REPORT (what the system found):
{report}

For each expected finding, determine if it matches any actual finding.
Output matched_findings list, total_expected, and total_actual.
"""

    verdict = (await Runner.run(judge, judge_input)).final_output

    # Count matches locally rather than relying on a number produced by the model.
    matches = len([mf for mf in verdict.matched_findings if mf.matched])
    expected_total = verdict.total_expected
    actual_total = verdict.total_actual

    recall = 0 if expected_total <= 0 else matches / expected_total
    precision = 0 if actual_total <= 0 else matches / actual_total
    metric_sum = precision + recall
    f1 = 0 if metric_sum <= 0 else 2 * (precision * recall) / metric_sum

    return {
        "recall": recall,
        "precision": precision,
        "f1": f1,
        "matches": matches,
        "total_expected": expected_total,
        "total_actual": actual_total,
        "details": verdict.matched_findings
    }

In [50]:
# Run a single test case end-to-end (here: 01_sql_injection) and print the judge's verdict
test_dir = Path("test-cases")
diff_file = test_dir / "01_sql_injection.diff"

# Load the diff and its ground-truth file (kept as raw JSON text for the judge prompt)
diff_content = diff_file.read_text()
expected_file = diff_file.with_name("01_sql_injection_expected.json")
ground_truth_content = expected_file.read_text()

# Run the review; save_output=False means the report is NOT written to disk
report = await review_code(diff_content, save_output=False)

# Evaluate the report against the ground truth; returns a dict of metrics and details
eval_result = await evaluate_report(report, ground_truth_content)

print("\n" + "="*60)
print("JUDGE OUTPUT:")
print("="*60)
print(f"total_expected: {eval_result['total_expected']}")
print(f"total_actual: {eval_result['total_actual']}")
print(f"matches: {eval_result['matches']}")
print(f"\nmatched_findings:")
for mf in eval_result['details']:
    print(f"\n  Expected: {mf.expected}")
    print(f"  Matched: {mf.matched}")
    if mf.actual_finding:
        print(f"  Actual: {mf.actual_finding[:100]}...")  # truncate if long

print("\n" + "="*60)
print("CALCULATED METRICS:")
print("="*60)
print(f"Recall: {eval_result['recall']:.2f}")
print(f"Precision: {eval_result['precision']:.2f}")
print(f"F1 Score: {eval_result['f1']:.2f}")


CALLING AGGREGATOR...

AGGREGATOR OUTPUT:
# Code Review Report

## Executive Summary
The most critical issue is a high-severity SQL injection vulnerability in the `authenticate` method of `user_auth.py`, flagged by multiple agents, which could allow attackers to bypass authentication or access sensitive data; it must be addressed immediately using parameterized queries. Additional bugs include missing exception handling that risks application crashes, while best practices highlight resource leaks from unclosed cursors and lack of documentation. Test coverage for the `authenticate` function is severely lacking, with no tests for normal, edge, security, and error scenarios, requiring comprehensive unit and integration tests.

## Summary of Actions
| Issue | File | Lines | Severity | Category | Fix | Found By |
|-------|------|-------|----------|----------|-----|----------|
| SQL Injection Vulnerability | user_auth.py | 7-12 | 9 | Security | Use parameterized queries or prepared statemen

In [65]:
# # Run the same test twice to check consistency
# test_dir = Path("test-cases")
# diff_file = test_dir / "04_multi_file_security.diff"

# # Load files
# diff_content = diff_file.read_text()
# expected_file = test_dir / "04_multi_file_security_expected.json"
# ground_truth_content = expected_file.read_text()

# # Run twice
# for i in range(2):
#     print(f"\n=== RUN {i+1} ===")
    
#     # Run review
#     report = await review_code(diff_content, save_output=False)
    
#     # Evaluate
#     eval_result = await evaluate_report(report, ground_truth_content)
    
#     print("\n" + "="*60)
#     print("JUDGE OUTPUT:")
#     print("="*60)
#     print(f"total_expected: {eval_result['total_expected']}")
#     print(f"total_actual: {eval_result['total_actual']}")
#     print(f"matches: {eval_result['matches']}")
#     print(f"\nmatched_findings:")
#     for mf in eval_result['details']:
#         print(f"\n  Expected: {mf.expected}")
#         print(f"  Matched: {mf.matched}")
#         if mf.actual_finding:
#             print(f"  Actual: {mf.actual_finding[:100]}...")  # truncate if long

#     print("\n" + "="*60)
#     print("CALCULATED METRICS:")
#     print("="*60)
#     print(f"Recall: {eval_result['recall']:.2f}")
#     print(f"Precision: {eval_result['precision']:.2f}")
#     print(f"F1 Score: {eval_result['f1']:.2f}")

In [27]:
# Names of all test cases to run.
# run_all_tests() reads "<name>.diff" and "<name>_expected.json" from the test-cases directory for each entry.

test_cases = [
    "01_sql_injection",
    "02_logic_bug",
    "03_code_quality",
    "04_multi_file_security",
    "05_multi_file_mixed"
]

async def run_all_tests(min_recall: float = 0.80, min_precision: float = 0.85, min_f1: float = 0.82):
    """
    Run the review pipeline on every entry of `test_cases` and score each one.

    For each case the diff and its ground-truth JSON are read from the
    ``test-cases`` directory, the diff is reviewed (without saving the
    report), and the report is evaluated by the judge. Detailed judge output
    and metrics are printed per case, followed by an overall summary.

    Args:
        min_recall: Minimum recall for a case to pass.
        min_precision: Minimum precision for a case to pass.
        min_f1: Minimum F1 score for a case to pass.

    Returns:
        A list with one dict per test case containing 'test_name', 'recall',
        'precision', 'f1' and 'passed' (True only when all three thresholds
        are met).
    """
    test_dir = Path("test-cases")
    results = []

    for test_name in test_cases:
        print(f"\n{'='*60}")
        print(f"TESTING: {test_name}")
        print('='*60)

        # Load files. Decode explicitly as UTF-8 so fixtures read the same on
        # every platform instead of depending on the locale's default encoding.
        diff_file = test_dir / f"{test_name}.diff"
        diff_content = diff_file.read_text(encoding="utf-8")
        expected_file = test_dir / f"{test_name}_expected.json"
        ground_truth_content = expected_file.read_text(encoding="utf-8")

        # Run review
        report = await review_code(diff_content, save_output=False)

        # Evaluate
        eval_result = await evaluate_report(report, ground_truth_content)

        # Print detailed judge output
        print("\n" + "="*60)
        print("JUDGE OUTPUT:")
        print("="*60)
        print(f"total_expected: {eval_result['total_expected']}")
        print(f"total_actual: {eval_result['total_actual']}")
        print(f"matches: {eval_result['matches']}")
        print(f"\nmatched_findings:")
        for mf in eval_result['details']:
            print(f"\n  Expected: {mf.expected}")
            print(f"  Matched: {mf.matched}")
            if mf.actual_finding:
                print(f"  Actual: {mf.actual_finding[:100]}...")

        # A case passes only when every metric reaches its threshold.
        passed = (
            eval_result['recall'] >= min_recall
            and eval_result['precision'] >= min_precision
            and eval_result['f1'] >= min_f1
        )

        # Store results
        results.append({
            'test_name': test_name,
            'recall': eval_result['recall'],
            'precision': eval_result['precision'],
            'f1': eval_result['f1'],
            'passed': passed
        })

        # Print calculated metrics
        print("\n" + "="*60)
        print("CALCULATED METRICS:")
        print("="*60)
        print(f"Recall: {eval_result['recall']:.2f}")
        print(f"Precision: {eval_result['precision']:.2f}")
        print(f"F1 Score: {eval_result['f1']:.2f}")
        print(f"Status: {'✓ PASSED' if passed else '✗ FAILED'}")

    # Print overall summary
    print(f"\n\n{'='*60}")
    print("OVERALL SUMMARY")
    print('='*60)
    for result in results:
        status = '✓' if result['passed'] else '✗'
        print(f"{status} {result['test_name']}: R={result['recall']:.2f} P={result['precision']:.2f} F1={result['f1']:.2f}")

    passed_count = sum(1 for r in results if r['passed'])
    print(f"\nPassed: {passed_count}/{len(results)}")

    return results

# Run every test case; `results` holds one metrics dict per case as returned by run_all_tests().
results = await run_all_tests()


TESTING: 01_sql_injection

CALLING AGGREGATOR...

AGGREGATOR OUTPUT:
# Code Review Report

## Executive Summary
The most critical issue identified is a severe SQL injection vulnerability in the `authenticate` method of `user_auth.py`, flagged by multiple agents, which could allow attackers to bypass authentication or manipulate the database. Additional high-severity concerns include storing plaintext passwords and missing exception handling for database operations, both posing significant security and reliability risks. The file also has resource leaks, style issues, and comprehensive test coverage gaps that need addressing to ensure robust code quality.

## Summary of Actions
| Issue | File | Lines | Severity | Category | Fix | Found By |
|-------|------|-------|----------|----------|-----|----------|
| SQL Injection Vulnerability: The 'authenticate' method constructs the SQL query by directly concatenating user input 'username' and 'password' without sanitation or parameterization. 