In [1]:
import os
from datetime import datetime
import json
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display
from agents import Agent, Runner, trace
from agents.extensions.models.litellm_model import LitellmModel
from pydantic import BaseModel, Field
from typing import Literal
from IPython.display import Markdown, display
from pathlib import Path
import asyncio
from typing import Optional, List
import chromadb

In [2]:
# Load variables from a .env file into the process environment before any
# keys are read; override=True asks python-dotenv to replace variables that
# are already set in the environment.
load_dotenv(override=True)

True

In [3]:
# Read the provider credentials from the environment.
openai_api_key = os.environ.get('OPENAI_API_KEY')
openrouter_api_key = os.environ.get('OPENROUTER_API_KEY')

# Report whether each key is present, echoing only its first 8 characters.
for _provider, _key in (("OpenAI", openai_api_key), ("OpenRouter", openrouter_api_key)):
    _status = f"exists and begins {_key[:8]}" if _key else "not set"
    print(f"{_provider} API Key {_status}")

OpenAI API Key exists and begins sk-proj-
OpenRouter API Key exists and begins sk-or-v1


In [4]:
# Grok models addressed through OpenRouter via LiteLLM, authenticated with the
# OpenRouter key read from the environment.
grok_code_fast_1 = LitellmModel(
    model="openrouter/x-ai/grok-code-fast-1",
    api_key=openrouter_api_key,
)
grok_4_fast = LitellmModel(
    model="openrouter/x-ai/grok-4-fast",
    api_key=openrouter_api_key,
)

In [5]:
class BugFinding(BaseModel):
    # One bug or anti-pattern record; CodeAnalyzerOutput carries a list of these.
    title: str = Field(
        description="Brief name for the bug",
    )
    description: str = Field(
        description="Detailed explanation",
    )
    severity: int = Field(
        description="Severity 1-10",
    )
    file: str = Field(
        description="File path",
    )
    relevant_lines: List[int] = Field(
        description="Line numbers",
    )
    suggested_fix: str = Field(
        description="Recommended solution",
    )

class VulnerabilityFinding(BaseModel):
    # One security vulnerability record; SecurityOutput carries a list of these.
    title: str = Field(
        description="Brief name for the vulnerability",
    )
    description: str = Field(
        description="Detailed explanation",
    )
    severity: int = Field(
        description="Severity 1-10",
    )
    file: str = Field(
        description="File path",
    )
    relevant_lines: List[int] = Field(
        description="Line numbers",
    )
    suggested_fix: str = Field(
        description="Recommended solution",
    )
    # Optional: stays None when no CVE applies.
    cve_reference: Optional[str] = Field(
        default=None,
        description="CVE ID if applicable",
    )

class BestPracticeFinding(BaseModel):
    # One style / best-practice violation record; BestPracticesOutput carries a list of these.
    title: str = Field(
        description="Brief name for the best practice violation",
    )
    description: str = Field(
        description="Detailed explanation",
    )
    severity: int = Field(
        description="Severity 1-10",
    )
    file: str = Field(
        description="File path",
    )
    relevant_lines: List[int] = Field(
        description="Line numbers",
    )
    suggested_fix: str = Field(
        description="Recommended solution",
    )
    
class TestGap(BaseModel):
    # One missing-test record; TestCoverageOutput carries a list of these.
    function_name: str = Field(
        description="Name of the function/method lacking tests",
    )
    file: str = Field(
        description="File containing the untested code",
    )
    lines: List[int] = Field(
        description="Line numbers of the untested code",
    )
    missing_scenarios: List[str] = Field(
        description="Specific test cases that should be added, e.g., ['edge case: empty input', 'error handling: invalid type']",
    )
    priority: int = Field(
        description="Priority 1-10, based on code criticality",
    )
    suggested_test_approach: str = Field(
        description="How to test this (unit test, integration test, etc.)",
    )
    
class CodeAnalyzerOutput(BaseModel):
    # Structured output type of the Code Analyzer agent.
    findings: List[BugFinding] = Field(
        description="Bugs and anti-patterns found",
    )

class SecurityOutput(BaseModel):
    # Structured output type of the Security agent.
    findings: List[VulnerabilityFinding] = Field(
        description="Security vulnerabilities found",
    )

class BestPracticesOutput(BaseModel):
    # Structured output type of the Best Practices agent.
    findings: List[BestPracticeFinding] = Field(
        description="Style and best practice violations",
    )

class TestCoverageOutput(BaseModel):
    # Structured output type of the Test Coverage agent.
    findings: List[TestGap] = Field(
        description="Testing gaps found",
    )

In [6]:

# System prompts for the four specialist reviewer agents defined below.
# Each prompt names the agent's role, lists the issue categories it should look
# for in the diff, and states which details to report for every issue.

# Prompt for the Code Analyzer agent (bugs and anti-patterns).
code_analyzer_instructions = """You are a Code Analyzer agent reviewing a pull request diff. 
Identify bugs and anti-patterns including: logic errors, unhandled edge cases, null/undefined access, type mismatches, off-by-one errors, resource leaks (unclosed files/cursors/connections), infinite loops, missing error handling (no try-except blocks), code duplication, and overly complex functions. 
For each issue found, specify the exact lines, severity (1-10), and a clear fix."""

# Prompt for the Security agent; run_all_agents appends retrieved patterns to it.
security_instructions = """You are a Security agent reviewing a pull request diff. 
Identify security vulnerabilities including: SQL injection, command injection, XSS vulnerabilities, hardcoded secrets/credentials, insecure authentication, path traversal, insecure deserialization, improper input validation, and missing error handling that could expose sensitive information.
For each issue found, specify the exact lines, severity (1-10), clear fix, and CVE reference if applicable."""

# Prompt for the Best Practices agent; run_all_agents appends retrieved patterns to it.
best_practices_instructions = """You are a Best Practices agent reviewing a pull request diff. 
Identify code quality issues including: unclear variable names, functions exceeding 50 lines, nested complexity over 3 levels, missing docstrings, inconsistent formatting, magic numbers without explanation, violations of DRY principle, unclosed resources (files, database cursors, connections), and missing try-except blocks for error-prone operations.
For each issue found, specify the exact lines, severity (1-10), and a clear fix."""

# Prompt for the Test Coverage agent (missing test scenarios per function).
test_coverage_instructions = """You are a Test Coverage agent reviewing a pull request diff. 
For each new or modified function, suggest test cases covering: normal input cases, edge cases (empty, null, boundary values), error conditions (exceptions, failures, timeouts), and integration scenarios.
For each gap found, specify the function name, lines, missing test scenarios, priority (1-10), and whether unit or integration tests are needed."""

def _build_reviewer(name, instructions, output_type):
    """Create a gpt-4.1-mini reviewer agent that returns `output_type`."""
    return Agent(
        name=name,
        instructions=instructions,
        model="gpt-4.1-mini",
        output_type=output_type,
    )


# The four specialist reviewers; each pairs a prompt with its structured output model.
code_analyzer = _build_reviewer(
    "Code Analyzer", code_analyzer_instructions, CodeAnalyzerOutput
)
security_agent = _build_reviewer(
    "Security Agent", security_instructions, SecurityOutput
)
best_practices_agent = _build_reviewer(
    "Best Practices Agent", best_practices_instructions, BestPracticesOutput
)
test_coverage_agent = _build_reviewer(
    "Test Coverage Agent", test_coverage_instructions, TestCoverageOutput
)

In [7]:
def _query_pattern_collection(collection_name: str, code_diff: str, n_results: int) -> str:
    """Return the documents of one ChromaDB collection that best match a diff.

    Opens the persistent store at ./chroma_db, queries `collection_name` with
    `code_diff` as the single query text, and joins the returned documents with
    blank lines.

    Args:
        collection_name: Name of an existing collection in ./chroma_db.
        code_diff: Diff text used as the query.
        n_results: Maximum number of documents to retrieve.

    Returns:
        The matched documents separated by blank lines, or "" when the query
        returns no documents.
    """
    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    collection = chroma_client.get_collection(name=collection_name)
    results = collection.query(query_texts=[code_diff], n_results=n_results)
    # `documents` holds one list per query text. Guard against it being None
    # or empty instead of indexing [0] unconditionally.
    documents = results['documents'] or [[]]
    matches = documents[0] or []
    return "\n\n".join(matches)


def get_relevant_security_patterns(code_diff: str, n_results: int = 5) -> str:
    """Retrieve relevant security patterns from ChromaDB"""
    return _query_pattern_collection("security_patterns", code_diff, n_results)


def get_relevant_best_practices_patterns(code_diff: str, n_results: int = 5) -> str:
    """Retrieve relevant best practices patterns from ChromaDB"""
    return _query_pattern_collection("best_practices_patterns", code_diff, n_results)

In [8]:
# async def run_all_agents(diff):
#     results = await asyncio.gather(
#         Runner.run(code_analyzer, diff),
#         Runner.run(security_agent, diff),
#         Runner.run(best_practices_agent, diff),
#         Runner.run(test_coverage_agent, diff)
#     )
#     return results

async def run_all_agents(diff):
    """Run the four reviewer agents on a diff concurrently.

    The Security and Best Practices agents are cloned from the module-level
    agents with their instructions extended by patterns retrieved from
    ChromaDB. Cloning keeps name, model and output type in one place rather
    than repeating them here.

    Args:
        diff: The diff text passed to every agent as input.

    Returns:
        The four run results in the order: code analyzer, security,
        best practices, test coverage.
    """
    # Retrieve RAG context for the two pattern-aware agents.
    security_patterns = get_relevant_security_patterns(diff, n_results=5)
    best_practices_patterns = get_relevant_best_practices_patterns(diff, n_results=5)

    enhanced_security_instructions = f"""{security_instructions}

RELEVANT SECURITY PATTERNS TO CHECK:
{security_patterns}"""

    enhanced_best_practices_instructions = f"""{best_practices_instructions}

RELEVANT BEST PRACTICES PATTERNS TO CHECK:
{best_practices_patterns}"""

    # clone() copies the base agent and replaces only the given fields.
    security_agent_rag = security_agent.clone(
        instructions=enhanced_security_instructions,
    )
    best_practices_agent_rag = best_practices_agent.clone(
        instructions=enhanced_best_practices_instructions,
    )

    # Run all agents in parallel; results come back in argument order.
    results = await asyncio.gather(
        Runner.run(code_analyzer, diff),
        Runner.run(security_agent_rag, diff),
        Runner.run(best_practices_agent_rag, diff),
        Runner.run(test_coverage_agent, diff)
    )
    return results

In [9]:
def organize_findings(
    code_result,
    security_result, 
    best_practices_result,
    test_coverage_result
):
    """
    Group every agent's findings under the file they refer to.

    Results are visited in the order code, security, best practices, test
    coverage, and each finding is appended to the list for its `file`.

    Returns:
        dict: {
            "file.py": [Finding, Finding, TestGap, ...]
        }
    """
    by_file = {}
    agent_results = (code_result, security_result, best_practices_result, test_coverage_result)
    for agent_result in agent_results:
        for item in agent_result.final_output.findings:
            # setdefault creates the per-file list on first sight of a path.
            by_file.setdefault(item.file, []).append(item)
    return by_file

In [10]:
# System prompt for the Aggregator agent.
# This is a plain string, not a template: the findings reach the model in the
# user message built by aggregator_agent, so the prompt describes that input
# instead of carrying a placeholder that is never substituted.
aggregator_instructions = """You are a Code Review Aggregator tasked with creating a deduplicated summary report. Your goal is to merge duplicate findings from multiple agents into a clear, actionable report.

The findings from all agents are supplied in the user message, grouped by file path.
Each finding's record type identifies the agent that produced it:
   - BugFinding: Code Analyzer
   - VulnerabilityFinding: Security
   - BestPracticeFinding: Best Practices
   - TestGap: Test Coverage

When creating the report, follow these guidelines:

1. IDENTIFY DUPLICATES: Group findings that describe the same root issue
   - Look for overlapping line numbers and similar descriptions
   - When multiple agents flag the same problem, merge into one issue
   - Use the HIGHEST severity when merging

2. PRESERVE INFORMATION: 
   - Keep agent names: Code Analyzer, Security, Best Practices, Test Coverage
   - Include file paths and line numbers
   - Maintain the most comprehensive description from merged findings

3. CATEGORIZE each issue as:
   - Bug: Logic errors, crashes, incorrect behavior  
   - Security: Vulnerabilities, unsafe code
   - Performance: Inefficient algorithms, resource issues
   - Style: Naming, formatting, documentation
   - Test Gap: Missing test coverage

4. CREATE SUMMARY TABLE with these columns:
   | Issue | File | Lines | Severity | Category | Fix | Found By |

5. SEPARATE CONCERNS: Test coverage gaps are distinct from code issues

Present your report in this format:

# Code Review Report

## Executive Summary
[2-3 sentences highlighting the most critical findings]

## Summary of Actions
| Issue | File | Lines | Severity | Category | Fix | Found By |
|-------|------|-------|----------|----------|-----|----------|
[One row per unique issue]

**Total Distinct Issues: [count]**

CRITICAL REQUIREMENT: 
- EVERY finding from EVERY agent must appear in the summary table
- This includes ALL test coverage gaps reported by the Test Coverage agent
- Test gaps should be listed as separate rows (one per function needing tests)
- Do NOT omit any findings, especially test coverage gaps
- The Total Distinct Issues count must match the number of rows in the table."""

# The aggregator has no output_type; it is run on the grok_4_fast model.
aggregator = Agent(
    name="Aggregator",
    instructions=aggregator_instructions,
    model=grok_4_fast,
)

In [11]:
async def aggregator_agent(organized):
    """Run the Aggregator agent on the organized findings and return its final output."""
    # The findings mapping is embedded using its default string form.
    request = f"Aggregate these findings into a structured report:\n\n{organized}"
    run_result = await Runner.run(aggregator, request)
    return run_result.final_output

In [12]:
async def review_code(diff: str, save_output: bool = True) -> str:
    """
    Complete code review pipeline.

    Runs the four reviewer agents, groups their findings by file, asks the
    aggregator for a combined report, prints it, and optionally saves it.

    Args:
        diff: The code diff to review
        save_output: When True, write the report to
            user-data/code_review_<timestamp>.md (directory created if needed)

    Returns:
        Markdown-formatted code review report
    """
    results = await run_all_agents(diff)
    code_result, security_result, best_practices_result, test_coverage_result = results

    organized = organize_findings(code_result, security_result, best_practices_result, test_coverage_result)

    print("\n" + "="*60)
    print("CALLING AGGREGATOR...")
    print("="*60)

    report = await aggregator_agent(organized)

    print("\n" + "="*60)
    print("AGGREGATOR OUTPUT:")
    print("="*60)
    print(report)
    print("="*60 + "\n")

    if save_output:
        os.makedirs("user-data", exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filepath = f"user-data/code_review_{timestamp}.md"
        # Explicit UTF-8: the report is LLM-written Markdown and may contain
        # characters that the platform default encoding cannot represent.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(report)
        print(f"Report saved to {filepath}")

    return report

In [13]:
# Hybrid Evaluation (Option 4): Models and Utilities
import re

class HybridEvaluation(BaseModel):
    """Hybrid evaluation combining location metrics + LLM relevance."""
    # Location-overlap metrics.
    file_recall: float = Field(
        description="Recall at file level (0.0-1.0)",
    )
    line_precision: float = Field(
        description="Precision at line level (0.0-1.0)",
    )
    line_recall: float = Field(
        description="Recall at line level (0.0-1.0)",
    )
    # Semantic metric and the combined score.
    llm_relevance: float = Field(
        description="LLM-judged relevance score (0.0-1.0)",
    )
    composite_score: float = Field(
        description="Combined score: (line_recall + llm_relevance) / 2",
    )

def reverse_diff(bug_patch: str) -> str:
    """Swap added and removed lines of a patch so it shows the bug being introduced.

    Lines starting with '---' or '+++' are passed through unchanged, as are
    hunk headers and context lines; only the leading '+'/'-' marker of the
    remaining lines is flipped.
    """
    def _flip(line: str) -> str:
        # Lines with a triple marker pass through untouched.
        if line.startswith(('---', '+++')):
            return line
        if line.startswith('-'):
            return '+' + line[1:]
        if line.startswith('+'):
            return '-' + line[1:]
        return line

    return '\n'.join(_flip(line) for line in bug_patch.split('\n'))

def parse_changed_locations(bug_patch: str) -> dict:
    """Extract the files touched by a unified diff and their new-side hunk ranges.

    A file is recognised from a '+++ b/<path>' header. For every following
    '@@ -a,b +c,d @@' hunk header, lines c .. c+d-1 (d defaults to 1 when
    omitted) are recorded for that file. The recorded range is the whole hunk
    on the new side, context lines included.

    Args:
        bug_patch: Unified diff text.

    Returns:
        {'files': set of paths, 'lines': {path: set of line numbers}}
    """
    changed_files = set()
    changed_lines = {}

    current_file = None
    prev_line = ''
    for line in bug_patch.split('\n'):
        if line.startswith('+++'):
            match = re.search(r'\+\+\+ b/(.+)', line)
            if match:
                # strip() drops a trailing '\r' left over from CRLF patches.
                current_file = match.group(1).strip() or None
                if current_file:
                    changed_files.add(current_file)
                    # setdefault keeps lines already collected if the same
                    # file shows up in more than one diff section.
                    changed_lines.setdefault(current_file, set())
            elif prev_line.startswith('---'):
                # A genuine '---'/'+++' header pair whose path we cannot parse
                # (e.g. '/dev/null' or no 'b/' prefix). Stop attributing the
                # following hunks to the previously seen file.
                current_file = None

        # Extract line numbers from @@ hunk headers
        elif line.startswith('@@') and current_file:
            match = re.search(r'@@ -\d+,?\d* \+(\d+),?(\d*)', line)
            if match:
                start = int(match.group(1))
                count = int(match.group(2)) if match.group(2) else 1
                changed_lines[current_file].update(range(start, start + count))

        prev_line = line

    return {'files': changed_files, 'lines': changed_lines}

def parse_flagged_locations(report: str) -> dict:
    """Extract the files and line numbers listed in the report's summary table.

    Rows are read from the Markdown table whose header contains
    '| Issue | File | Lines |'. The second column is taken as the file path
    (surrounding back-ticks / asterisks removed) and the third as the line
    list. The report is LLM-written, so parsing is tolerant: every number or
    'a-b' range found in the Lines cell is used, and text such as 'N/A' simply
    contributes nothing instead of raising.

    Args:
        report: Markdown report text.

    Returns:
        {'files': set of paths, 'lines': {path: set of line numbers}}
    """
    flagged_files = set()
    flagged_lines = {}

    in_table = False
    for line in report.split('\n'):
        stripped = line.strip()
        if '| Issue | File | Lines |' in line:
            in_table = True
            continue
        if not in_table:
            continue
        if not stripped.startswith('|'):
            # Non-blank prose ends the table, so later unrelated tables are
            # not mistaken for findings. Blank lines are tolerated.
            if stripped:
                in_table = False
            continue
        # Skip the header/body separator row in any of its spellings.
        if stripped.startswith('|---') or set(stripped) <= set('|-: '):
            continue

        parts = [p.strip() for p in line.split('|')]
        if len(parts) <= 3:
            continue

        # Drop Markdown code/bold markers so the path compares equal to the
        # path found in the patch.
        file_path = parts[2].strip('`*').strip()
        lines_str = parts[3]
        if not file_path or file_path == 'File':
            continue

        flagged_files.add(file_path)
        file_lines = flagged_lines.setdefault(file_path, set())

        # Accept "7-10", "24-25", "9", en/em-dash ranges, and numbers with
        # surrounding text such as "L219" or "219-225 (approx)".
        for found in re.finditer(r'(\d+)(?:\s*[-\u2013\u2014]\s*(\d+))?', lines_str):
            start = int(found.group(1))
            end = int(found.group(2)) if found.group(2) else start
            if end < start:
                start, end = end, start
            file_lines.update(range(start, end + 1))

    return {'files': flagged_files, 'lines': flagged_lines}

def calculate_location_metrics(actual: dict, flagged: dict, tolerance: int = 5) -> dict:
    """
    Calculate location-based overlap metrics.

    Recall: Of all actual changed lines, how many have a flagged line in the
        same file within `tolerance` lines?
    Precision: Of all flagged lines (in any file), how many have an actual
        changed line in the same file within `tolerance` lines?

    Args:
        actual: {'files': set, 'lines': {file: set of ints}} from the patch.
        flagged: Same structure, taken from the review report.
        tolerance: Maximum line distance that still counts as a match.

    Returns:
        dict with 'file_recall', 'line_recall' and 'line_precision'; each is
        0.0 when its denominator is empty.
    """
    # File-level recall
    file_recall = len(flagged['files'] & actual['files']) / len(actual['files']) if actual['files'] else 0.0

    total_actual_lines = sum(len(actual['lines'].get(file, set())) for file in actual['files'])
    # Every flagged line counts towards the precision denominator, including
    # lines in files the patch never touched. Those can never match, so
    # leaving them out would overstate precision.
    total_flagged_lines = sum(len(lines) for lines in flagged['lines'].values())

    actual_lines_matched = 0   # recall numerator
    flagged_lines_matched = 0  # precision numerator

    for file in actual['files']:
        actual_lines = actual['lines'].get(file, set())
        flagged_lines_in_file = flagged['lines'].get(file, set())

        actual_lines_matched += sum(
            1 for actual_line in actual_lines
            if any(abs(actual_line - flagged_line) <= tolerance for flagged_line in flagged_lines_in_file)
        )
        flagged_lines_matched += sum(
            1 for flagged_line in flagged_lines_in_file
            if any(abs(flagged_line - actual_line) <= tolerance for actual_line in actual_lines)
        )

    line_recall = actual_lines_matched / total_actual_lines if total_actual_lines > 0 else 0.0
    line_precision = flagged_lines_matched / total_flagged_lines if total_flagged_lines > 0 else 0.0

    return {
        'file_recall': file_recall,
        'line_recall': line_recall,
        'line_precision': line_precision
    }

In [14]:
# Hybrid Evaluation Function

class LLMRelevance(BaseModel):
    """LLM's assessment of how relevant the review findings are to the actual fix."""
    relevance_score: float = Field(
        description="0.0-1.0: How well the review findings align with the actual fix",
    )
    explanation: str = Field(
        description="Brief explanation of the score",
    )

async def evaluate_hybrid(report: str, bug_patch: str) -> dict:
    """
    Score a review report against the real fix patch.

    First the automated location overlap is computed. Then, only when at
    least one patched file was flagged (file_recall > 0), an LLM judge rates
    how relevant the findings are to the fix; otherwise relevance stays 0.0.

    Returns:
        dict with file_recall, line_precision, line_recall, llm_relevance, composite_score
    """
    # Stage 1: automated location metrics.
    metrics = calculate_location_metrics(
        parse_changed_locations(bug_patch),
        parse_flagged_locations(report),
    )

    # Stage 2: semantic relevance, skipped when no patched file was flagged.
    relevance = 0.0
    has_file_overlap = metrics['file_recall'] > 0
    if has_file_overlap:
        judge = Agent(
            name="Relevance Judge",
            instructions="""You are evaluating the semantic relevance of code review findings to an actual bug fix.

Given:
1. ACTUAL FIX PATCH: The changes that were made to fix bugs
2. CODE REVIEW REPORT: What the review system found

Rate the relevance (0.0 to 1.0) of the review findings:
- 1.0: Findings directly identify the bugs that were fixed
- 0.7-0.9: Findings flag related issues that would lead to discovering the bugs
- 0.4-0.6: Findings flag the general area but miss specific bugs
- 0.1-0.3: Findings are tangentially related
- 0.0: No relevant findings

Be objective and strict in your assessment.""",
            model="gpt-5.1",
            output_type=LLMRelevance
        )

        judge_prompt = f"""
ACTUAL FIX PATCH:
{bug_patch}

CODE REVIEW REPORT:
{report}

Rate the semantic relevance of the review findings to the actual fix.
"""

        verdict = await Runner.run(judge, judge_prompt)
        relevance = verdict.final_output.relevance_score

    # Composite: plain average of line recall and the judged relevance.
    return {
        'file_recall': metrics['file_recall'],
        'line_precision': metrics['line_precision'],
        'line_recall': metrics['line_recall'],
        'llm_relevance': relevance,
        'composite_score': (metrics['line_recall'] + relevance) / 2
    }

In [15]:
# Hybrid Batch Testing

async def test_bugsinpy_hybrid(
    bugs_to_test: list[tuple[str, int]],
    pass_threshold: float = 0.60,
) -> list[dict]:
    """
    Test multiple BugsInPy bugs with hybrid evaluation.

    For each bug the fix patch is read from
    BugsInPy/projects/<project>/bugs/<bug_id>/bug_patch.txt, reversed so it
    shows the bug being introduced, reviewed, and scored with evaluate_hybrid.
    A failure on one bug is printed and recorded; the remaining bugs still run.

    Args:
        bugs_to_test: List of (project, bug_id) tuples
        pass_threshold: Minimum composite score for a bug to count as passed

    Returns:
        List of evaluation results with hybrid metrics; failed runs contain
        'error' and 'passed': False instead of the metrics
    """
    import traceback

    results = []

    for project, bug_id in bugs_to_test:
        print(f"\n{'='*60}")
        print(f"TESTING: {project} bug {bug_id}")
        print('='*60)

        try:
            # Load bug patch. Explicit UTF-8 with replacement keeps the read
            # independent of the platform default encoding.
            bug_patch_path = Path(f"BugsInPy/projects/{project}/bugs/{bug_id}/bug_patch.txt")
            bug_patch = bug_patch_path.read_text(encoding="utf-8", errors="replace")

            # Reverse diff to show bug introduction
            reversed_diff = reverse_diff(bug_patch)

            # Run code review on buggy code
            report = await review_code(reversed_diff, save_output=False)

            # Hybrid evaluation against the original (fix) patch
            eval_result = await evaluate_hybrid(report, bug_patch)

            result = {
                'project': project,
                'bug_id': bug_id,
                'file_recall': eval_result['file_recall'],
                'line_precision': eval_result['line_precision'],
                'line_recall': eval_result['line_recall'],
                'llm_relevance': eval_result['llm_relevance'],
                'composite_score': eval_result['composite_score'],
                'passed': eval_result['composite_score'] >= pass_threshold
            }
            results.append(result)

            # Print metrics
            print(f"\n📍 LOCATION METRICS (Automated):")
            print(f"  File Recall: {eval_result['file_recall']:.2%}")
            print(f"  Line Precision: {eval_result['line_precision']:.2%}")
            print(f"  Line Recall: {eval_result['line_recall']:.2%}")
            print(f"\n🤖 LLM RELEVANCE (Semantic):")
            print(f"  Relevance Score: {eval_result['llm_relevance']:.2%}")
            print(f"\n🎯 COMPOSITE SCORE:")
            print(f"  Score: {eval_result['composite_score']:.2%}")
            print(f"  Status: {'✓ PASSED' if result['passed'] else '✗ FAILED'}")

        except Exception as e:
            # Deliberately broad: one failing bug must not abort the batch.
            print(f"ERROR: {e}")
            traceback.print_exc()
            results.append({
                'project': project,
                'bug_id': bug_id,
                'error': str(e),
                'passed': False
            })

    # Print overall summary
    print(f"\n\n{'='*60}")
    print("OVERALL SUMMARY")
    print('='*60)
    for result in results:
        if 'error' in result:
            print(f"✗ {result['project']}/{result['bug_id']}: ERROR - {result['error']}")
        else:
            status = '✓' if result['passed'] else '✗'
            print(f"{status} {result['project']}/{result['bug_id']}: "
                  f"FileRec={result['file_recall']:.0%}, "
                  f"LineRec={result['line_recall']:.0%}, "
                  f"LLM={result['llm_relevance']:.0%}, "
                  f"Composite={result['composite_score']:.0%}")

    passed = sum(1 for r in results if r.get('passed', False))
    print(f"\nPassed: {passed}/{len(results)}")
    # Avoid dividing by zero when no bugs were supplied.
    if results:
        print(f"Success Rate: {passed/len(results):.0%}")
    else:
        print("Success Rate: n/a (no bugs were tested)")

    return results

In [16]:
# Quick test with 4 bugs (commented out - see 10-bug test below)
# bugs_to_test = [
#     ("luigi", 2),
#     ("black", 4),
#     ("keras", 1),
#     ("pandas", 1),
# ]
# 
# results = await test_bugsinpy_hybrid(bugs_to_test)

In [17]:
# Enhanced test with 10 bugs - showing what agents missed

def _format_line_ranges(lines: list[int]) -> str:
    """Collapse sorted line numbers into text such as '3-5, 9'."""
    if not lines:
        return ""
    ranges = []
    start = end = lines[0]
    for number in lines[1:]:
        if number == end + 1:
            end = number
            continue
        ranges.append(f"{start}-{end}" if start != end else str(start))
        start = end = number
    ranges.append(f"{start}-{end}" if start != end else str(start))
    return ', '.join(ranges)


async def test_bugsinpy_with_miss_analysis(
    bugs_to_test: list[tuple[str, int]],
    pass_threshold: float = 0.60,
) -> list[dict]:
    """
    Test multiple BugsInPy bugs with detailed miss analysis.
    Shows what the agents caught vs. what they missed.

    For each bug the fix patch is read from
    BugsInPy/projects/<project>/bugs/<bug_id>/bug_patch.txt, reversed,
    reviewed and scored with evaluate_hybrid. Patched files that were never
    flagged, and patched lines with no flagged line within 5 lines, are
    reported as misses. A failure on one bug is printed and recorded; the
    remaining bugs still run.

    Args:
        bugs_to_test: List of (project, bug_id) tuples
        pass_threshold: Minimum composite score for a bug to count as passed

    Returns:
        List of result dicts with the hybrid metrics plus 'missed_files' and
        'missed_lines'; failed runs contain 'error' and 'passed': False
    """
    import traceback

    results = []

    for project, bug_id in bugs_to_test:
        print(f"\n{'='*60}")
        print(f"TESTING: {project} bug {bug_id}")
        print('='*60)

        try:
            # Load bug patch. Explicit UTF-8 with replacement keeps the read
            # independent of the platform default encoding.
            bug_patch_path = Path(f"BugsInPy/projects/{project}/bugs/{bug_id}/bug_patch.txt")
            bug_patch = bug_patch_path.read_text(encoding="utf-8", errors="replace")

            print("\nACTUAL FIX (first 500 chars):")
            print(bug_patch[:500])
            print("..." if len(bug_patch) > 500 else "")

            # Reverse diff
            reversed_diff = reverse_diff(bug_patch)

            # Run review
            report = await review_code(reversed_diff, save_output=False)

            # Hybrid evaluation
            eval_result = await evaluate_hybrid(report, bug_patch)

            # Parse locations again to show what was missed
            actual_locations = parse_changed_locations(bug_patch)
            flagged_locations = parse_flagged_locations(report)

            # Sorted so the printed and stored order is stable across runs.
            missed_files = sorted(actual_locations['files'] - flagged_locations['files'])

            # Patched lines with no flagged line within 5 lines, per file
            missed_lines = {}
            for file in actual_locations['files']:
                actual_lines = actual_locations['lines'].get(file, set())
                flagged_lines_in_file = flagged_locations['lines'].get(file, set())
                uncaught = sorted(
                    actual_line for actual_line in actual_lines
                    if not any(abs(actual_line - flagged_line) <= 5 for flagged_line in flagged_lines_in_file)
                )
                if uncaught:
                    missed_lines[file] = uncaught

            result = {
                'project': project,
                'bug_id': bug_id,
                'file_recall': eval_result['file_recall'],
                'line_precision': eval_result['line_precision'],
                'line_recall': eval_result['line_recall'],
                'llm_relevance': eval_result['llm_relevance'],
                'composite_score': eval_result['composite_score'],
                'passed': eval_result['composite_score'] >= pass_threshold,
                'missed_files': missed_files,
                'missed_lines': missed_lines
            }
            results.append(result)

            # Print metrics
            print(f"\n📍 LOCATION METRICS:")
            print(f"  File Recall: {eval_result['file_recall']:.0%}")
            print(f"  Line Precision: {eval_result['line_precision']:.0%}")
            print(f"  Line Recall: {eval_result['line_recall']:.0%}")
            print(f"\n🤖 LLM RELEVANCE: {eval_result['llm_relevance']:.0%}")
            print(f"🎯 COMPOSITE: {eval_result['composite_score']:.0%}")

            # Show what was missed
            if missed_files:
                print(f"\n❌ MISSED FILES: {', '.join(missed_files)}")

            if missed_lines:
                print(f"\n❌ MISSED LINES:")
                for file, lines in missed_lines.items():
                    print(f"  {file}: lines {_format_line_ranges(lines)}")

            print(f"\n{'✓ PASSED' if result['passed'] else '✗ FAILED'}")

        except Exception as e:
            # Deliberately broad: one failing bug must not abort the batch.
            print(f"ERROR: {e}")
            traceback.print_exc()
            results.append({
                'project': project,
                'bug_id': bug_id,
                'error': str(e),
                'passed': False
            })

    # Print overall summary
    print(f"\n\n{'='*60}")
    print("OVERALL SUMMARY")
    print('='*60)
    for result in results:
        if 'error' in result:
            print(f"✗ {result['project']}/{result['bug_id']}: ERROR")
        else:
            status = '✓' if result['passed'] else '✗'
            missed_info = ""
            if result['missed_files']:
                missed_info += f" | Missed files: {len(result['missed_files'])}"
            if result['missed_lines']:
                total_missed = sum(len(lines) for lines in result['missed_lines'].values())
                missed_info += f" | Missed lines: {total_missed}"

            print(f"{status} {result['project']}/{result['bug_id']}: "
                  f"Composite={result['composite_score']:.0%} "
                  f"(LineRec={result['line_recall']:.0%}, LLM={result['llm_relevance']:.0%})"
                  f"{missed_info}")

    passed = sum(1 for r in results if r.get('passed', False))
    # Avoid dividing by zero when no bugs were supplied.
    if results:
        print(f"\nPassed: {passed}/{len(results)} ({passed/len(results):.0%})")
    else:
        print(f"\nPassed: {passed}/{len(results)} (no bugs were tested)")

    return results

In [18]:
# Test 10 bugs from different projects
# Each entry is a (project, bug_id) pair; test_bugsinpy_with_miss_analysis reads
# BugsInPy/projects/<project>/bugs/<bug_id>/bug_patch.txt for each one.
bugs_to_test_10 = [
    ("luigi", 2),
    ("black", 4),
    ("pandas", 1),
    ("scrapy", 1),
    ("thefuck", 1),
    ("matplotlib", 1),
    ("tqdm", 1),
    ("tornado", 1),
    ("httpie", 1),
    ("cookiecutter", 1),
]

# Top-level await of the batch run; every bug triggers a full review plus an
# evaluation, so this cell makes many LLM calls.
results_10 = await test_bugsinpy_with_miss_analysis(bugs_to_test_10)


TESTING: luigi bug 2

ACTUAL FIX (first 500 chars):
diff --git a/luigi/contrib/beam_dataflow.py b/luigi/contrib/beam_dataflow.py
index dd510786..42cdc742 100644
--- a/luigi/contrib/beam_dataflow.py
+++ b/luigi/contrib/beam_dataflow.py
@@ -219,6 +219,7 @@ class BeamDataflowJobTask(MixinNaiveBulkComplete, luigi.Task):
     def __init__(self):
         if not isinstance(self.dataflow_params, DataflowParamKeys):
             raise ValueError("dataflow_params must be of type DataflowParamKeys")
+        super(BeamDataflowJobTask, self).__init__()
 
  
...

CALLING AGGREGATOR...

AGGREGATOR OUTPUT:
# Code Review Report

## Executive Summary
The code in luigi/contrib/beam_dataflow.py has critical bugs related to incomplete class initialization and incorrect handling in the get_target_path method, which could lead to runtime errors or unexpected behavior. Documentation and style issues, such as missing docstrings and inconsistent formatting, reduce code maintainability. Additionally, signific