In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.agents import tool

import pickle
import networkx as nx
import ast
import re
import os
from datasets import load_dataset
from dotenv import load_dotenv
from langchain_openai import OpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback
load_dotenv()
import commit

def neighbors_by_relation(G, node, relation_type):
    """Collect the nodes joined to ``node`` by edges of a given relation.

    Args:
        G: Graph object exposing ``edges(node, data=True)`` that yields
            ``(u, v, data)`` triples, where ``data`` is a dict of edge attributes.
        node: The node whose incident edges are inspected.
        relation_type: Value the edge's ``'relation'`` attribute must equal.

    Returns:
        A list, in iteration order and without de-duplication, of the endpoint
        opposite ``node`` for every matching edge.
    """
    return [
        # The edge may list `node` in either position; take the other endpoint.
        target if source == node else source
        for source, target, attrs in G.edges(node, data=True)
        if attrs.get('relation') == relation_type
    ]

def load_graph(pickle_path):
    """Unpickle and return the object stored at ``pickle_path``.

    The notebook uses this to load graph pickles named ``graph_<name>.pkl``.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file, so only call this on pickles produced by a trusted source.
    """
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)

# Load the "train" split of the lahirum/SWE_Experimental dataset; later cells
# read 'base_commit', 'instance_id' and 'problem_statement' from its records.
dataset = load_dataset("lahirum/SWE_Experimental", split="train")
# Disabled: restrict the run to the first ten records.
# filter = [0, 1, 2, 3, 4,5, 6, 7, 8, 9]
# dataset = dataset.select(filter)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_function_classes(file_path):
    """List the class and function names defined anywhere in a Python file.

    Args:
        file_path: Path of the Python source file to parse.

    Returns:
        A flat list of names of every class, function and async function found
        by walking the whole syntax tree (nested definitions included).
        ``__init__`` is skipped. If the file cannot be read or parsed, the
        error is printed and an empty list is returned, so the return type is
        the same on success and on failure.
    """
    try:
        # UTF-8 matches the other source readers in this notebook.
        with open(file_path, "r", encoding="utf-8") as file:
            parsed_data = ast.parse(file.read())
    except Exception as e:  # Best effort: unreadable/unparsable files yield no names
        print(f"Error in file {file_path}: {e}")
        return []

    info = []
    for node in ast.walk(parsed_data):
        if isinstance(node, ast.ClassDef):
            info.append(node.name)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            # Constructors carry no distinguishing name, so they are omitted.
            if node.name != "__init__":
                info.append(node.name)
    return info

import os

def get_file_structure(root_dir: str) -> dict:
    """Map each directory under ``root_dir`` to the ``.py`` files it contains.

    Args:
        root_dir: Directory to walk recursively with ``os.walk``.

    Returns:
        A dict whose keys are directory paths exactly as ``os.walk`` reports
        them and whose values are lists of ``"<dirpath>/<filename>"`` strings
        for the ``.py`` files directly inside that directory. Directories whose
        path contains the substring ``"test"`` and directories without any
        ``.py`` file are omitted.
    """
    file_structure = {}

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        # Substring match on the whole path: this drops "tests/", "testing/",
        # and also any path that merely contains "test" (e.g. "contest").
        if "test" in dirpath:
            continue

        python_files = [
            dirpath + "/" + file for file in filenames if file.endswith('.py')
        ]
        if python_files:
            file_structure[dirpath] = python_files

    return file_structure


In [3]:
import prompts
import schema
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_deepseek import ChatDeepSeek

# Small OpenAI chat model; the selection/filter chains in the next cell are
# bound to this client.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
    #max_retries=2,
)

# DeepSeek reasoning model; used unbound (plain text output) by the
# deep-reasoning chain in the next cell.
llm_deepseek = ChatDeepSeek(
    model="deepseek-reasoner",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)
# Larger OpenAI chat model; the extraction chain in the next cell is bound to
# this client.
llm_large = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_retries=2,
)


In [4]:
parser = JsonOutputFunctionsParser()


def _bind_output_schema(model, output_schema):
    """Return ``model`` bound to ``output_schema`` as an OpenAI function.

    ``function_call="auto"`` lets the model decide whether to call the
    function. Downstream code reads
    ``additional_kwargs['function_call']['arguments']`` and therefore assumes
    the model did call it.

    NOTE(review): the recorded run emitted a warning pointing at the
    ``convert_pydantic_to_openai_function`` call — presumably a deprecation
    notice; confirm and migrate if so.
    """
    return model.bind(
        functions=[convert_pydantic_to_openai_function(output_schema)],
        function_call="auto",
    )


# Problem description -> suspicious file / class-or-function name (large model).
model_extract = _bind_output_schema(llm_large, schema.SuspiciousComponentOutput)
extract_chain = prompts.prompt_extract | model_extract

# Pick one suspicious file out of a list of candidate paths.
model_select = _bind_output_schema(llm, schema.FileSuspicionOutput)
select_chain = prompts.file_path_filter_prompt | model_select

# Reduce a list of files to a list of suspicious files.
model_filter_list = _bind_output_schema(llm, schema.SuspiciousFilesOutputList)
filter_list_chain = prompts.get_suspicious_file_list_from_list_of_files_prompt | model_filter_list

# Same output schema as above, driven by per-file class/function name listings.
model_select_list = _bind_output_schema(llm, schema.SuspiciousFilesOutputList)
select_list_chain = prompts.suspicious_files_filter_list_usingclfn_prompt | model_select_list

# Suspicious files together with a reason for each.
model_select_with_reason = _bind_output_schema(llm, schema.SuspiciousFileReasoningOutput)
select_with_reason_chain = prompts.suspicious_files_with_reason_prompt | model_select_with_reason

# Pick the most suspicious directory.
model_select_directory = _bind_output_schema(llm, schema.SuspiciousDirectoryOutput)
select_directory_chain = prompts.suspicious_directory_prompt | model_select_directory

# Final free-text ranking over all candidates; no function schema is bound.
deep_reasoning_chain = prompts.deep_reasoning_prompt | llm_deepseek

  functions=[convert_pydantic_to_openai_function(schema.SuspiciousComponentOutput)],


In [42]:
# Index of the dataset record analysed by the cells below.
i = 0
commit_id = dataset[i]['base_commit']
# Text before the first "__" in the instance id; also used to build the graph
# pickle filename and, in later cells, the FAISS index / source directory names.
name = dataset[i]['instance_id'].split("__")[0]
problem_description = dataset[i]['problem_statement']
# Expects a file named graph_<name>.pkl in the current working directory.
graph = load_graph(f"graph_{name}.pkl")

In [6]:
# Run the extraction chain once on the selected problem and print the
# token/cost summary collected by the OpenAI callback.
with get_openai_callback() as callback:
    result = extract_chain.invoke({"problem_description": problem_description})
    # print(result)
    print(callback)
    


Tokens Used: 751
	Prompt Tokens: 718
		Prompt Tokens Cached: 0
	Completion Tokens: 33
		Reasoning Tokens: 0
Successful Requests: 1
Total Cost (USD): $0.002125


In [7]:
# Show the raw chain response; the structured answer is the JSON string under
# additional_kwargs['function_call']['arguments'].
print(result)

content='' additional_kwargs={'function_call': {'arguments': '{"file":"django/views/debug.py","class_function_name":"SafeExceptionReporterFilter.get_safe_settings"}', 'name': 'SuspiciousComponentOutput'}, 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 718, 'total_tokens': 751, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_90122d973c', 'id': 'chatcmpl-BVm26VRoIxjAE2fqfW8rH7GpqHb0V', 'finish_reason': 'function_call', 'logprobs': None} id='run-e5cce633-5578-484e-97fd-73ba6e64972b-0' usage_metadata={'input_tokens': 718, 'output_tokens': 33, 'total_tokens': 751, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}


In [8]:
import json
from commit import update
def start(inputs):
    """First pipeline stage: sync the repository and extract the initial suspect.

    Args:
        inputs: Dict with the keys ``'problem_description'``, ``'name'``,
            ``'graph'`` and ``'commit_id'``.

    Returns:
        The function-call arguments produced by ``extract_chain`` parsed from
        JSON into a dict (in the recorded run: ``'file'`` and
        ``'class_function_name'``), extended with ``'name'``,
        ``'problem_description'`` and ``'graph'`` copied from ``inputs``.

    Raises:
        KeyError: If an input key is missing or the model response carries no
            ``function_call`` entry.
    """
    problem_description = inputs['problem_description']
    name = inputs['name']
    graph = inputs['graph']
    commit_id = inputs['commit_id']

    # NOTE(review): presumably checks the repository out at `commit_id` (the run
    # log prints "Checked out to ..."). Its return value is ignored and the graph
    # passed in is forwarded unchanged — confirm that is intended.
    update(name, commit_id)

    with get_openai_callback() as callback:
        result = extract_chain.invoke({"problem_description": problem_description})
        print(callback)

    # The structured answer arrives as a JSON string in the function-call arguments.
    result = json.loads(result.additional_kwargs['function_call']['arguments'])
    result['name'] = name
    result['problem_description'] = problem_description
    result['graph'] = graph
    return result

In [9]:
# print(start(problem_description, name,graph, commit_id))

In [10]:
from utils.compress import get_skeleton
import json
def get_most_suspicious_files(inputs):
    """Rank suspicious files starting from the file name guessed by the extractor.

    Steps:
      1. Reduce ``inputs['file']`` to its module stem and look up the
         ``'path'`` neighbours of graph node ``"module_<stem>"``, skipping any
         neighbour whose name contains ``"test"``.
      2. Ask ``select_chain`` to pick one file from those paths.
      3. Gather that file plus its ``'imports'`` neighbours and list each
         one's class/function names.
      4. Ask ``select_list_chain`` to shortlist files from those listings.
      5. Build a skeleton of each shortlisted file with ``get_skeleton`` and
         ask ``select_with_reason_chain`` for the final answer.

    Args:
        inputs: Dict with the keys ``'problem_description'``, ``'graph'`` and
            ``'file'``.

    Returns:
        The ``'suspicious_files'`` value of the final model answer (in the
        recorded runs, a list of ``{'file': ..., 'reason': ...}`` dicts).
    """
    problem_description = inputs['problem_description']
    graph = inputs['graph']
    file = inputs['file']

    # Reduce e.g. "django/views/debug.py" to the module stem "debug".
    if "/" in file:
        file = file.split("/")[-1]
    if "." in file:
        file = file.split(".")[0]

    suspicious_files = [
        neighbor
        for neighbor in neighbors_by_relation(graph, "module_" + file, 'path')
        if "test" not in neighbor
    ]

    with get_openai_callback() as callback:
        filtered = select_chain.invoke({"problem_description": problem_description, "file_list": suspicious_files})
        print(callback)
    filtered = json.loads(filtered.additional_kwargs['function_call']['arguments'])
    selected_file = filtered['suspicious_file']

    candidate_structure = {}
    for neighbor in neighbors_by_relation(graph, selected_file, 'imports') + [selected_file]:
        try:
            candidate_structure[neighbor] = extract_function_classes(neighbor)
        except Exception as e:
            # Best effort: report and skip instead of failing the whole stage.
            print(f"Skipping {neighbor}: {e}")

    with get_openai_callback() as callback:
        filtered_list = select_list_chain.invoke({"problem_description": problem_description, "file_structure": candidate_structure})
        print(callback)
    filtered_list = json.loads(filtered_list.additional_kwargs['function_call']['arguments'])
    filtered_list = filtered_list['suspicious_files']

    filtered_candidate_structure = {}
    # The model may return paths that do not exist on disk; those are reported
    # and left out of the final prompt.
    for candidate_path in filtered_list:
        try:
            with open(candidate_path, "r", encoding="utf-8") as f:
                raw_code = f.read()
            filtered_candidate_structure[candidate_path] = get_skeleton(raw_code, keep_constant = False, keep_indent=True, total_lines =15, prefix_lines=5,suffix_lines=5)
        except Exception as e:
            print(f"Skipping {candidate_path}: {e}")

    with get_openai_callback() as callback:
        answer = select_with_reason_chain.invoke({"problem_description": problem_description, "file_structure": filtered_candidate_structure})
        print(callback)
    answer = json.loads(answer.additional_kwargs['function_call']['arguments'])

    return answer['suspicious_files']

In [11]:
def pass_problem_description(inputs):
    """Return the ``'problem_description'`` entry of ``inputs`` unchanged.

    Lets the parallel stage forward the problem text next to the results of
    the other branches.
    """
    return inputs['problem_description']

In [12]:
# Sanity check: list the 'path' neighbours of the "module_serializer" node.
neighbors_by_relation(graph, "module_serializer",  'path')

['django/django/db/migrations/serializer.py']

In [13]:
def get_most_suspicious_files_using_clfn(inputs):
    """Rank suspicious files starting from the class name guessed by the extractor.

    Takes the part of ``inputs['class_function_name']`` before the first
    ``"."``, looks up the ``'class_path'`` neighbours of graph node
    ``"class_<name>"`` (skipping names that contain ``"test"``), builds a
    skeleton of each file with ``get_skeleton`` and asks
    ``select_with_reason_chain`` which of them are suspicious.

    NOTE(review): when the extractor returns a bare function/method name (the
    recorded run shows ``__call__``), the lookup key becomes
    ``"class___call__"`` and matched a very long file list — confirm this is
    the intended behaviour for non-class names.

    Args:
        inputs: Dict with the keys ``'problem_description'``, ``'graph'`` and
            ``'class_function_name'``.

    Returns:
        The ``'suspicious_files'`` value of the model answer (in the recorded
        runs, a list of ``{'file': ..., 'reason': ...}`` dicts).
    """
    problem_description = inputs['problem_description']
    graph = inputs['graph']
    class_function_name = inputs['class_function_name']
    print(class_function_name)

    # "Class.method" -> "Class"
    if "." in class_function_name:
        class_function_name = class_function_name.split(".")[0]

    suspicious_files = [
        neighbor
        for neighbor in neighbors_by_relation(graph, "class_" + class_function_name, 'class_path')
        if "test" not in neighbor
    ]
    print("class", suspicious_files)

    filtered_candidate_structure = {}
    for candidate_path in suspicious_files:
        try:
            with open(candidate_path, "r", encoding="utf-8") as f:
                raw_code = f.read()
            filtered_candidate_structure[candidate_path] = get_skeleton(raw_code, keep_constant = False, keep_indent=True, total_lines =15, prefix_lines=5,suffix_lines=5)
        except Exception as e:
            # Best effort: report and skip instead of failing the whole stage.
            print(f"Skipping {candidate_path}: {e}")

    with get_openai_callback() as callback:
        answer = select_with_reason_chain.invoke({"problem_description": problem_description, "file_structure": filtered_candidate_structure})
        print(callback)
    answer = json.loads(answer.additional_kwargs['function_call']['arguments'])

    return answer['suspicious_files']

In [14]:
def get_most_suspicious_files_using_file_structure(inputs):
    """Rank suspicious files by drilling down through the repository layout.

    Steps:
      1. Build the directory -> ``.py`` files map for the checkout located at
         ``inputs['name']``.
      2. Ask ``select_directory_chain`` to pick one directory.
      3. List class/function names for each file in that directory and ask
         ``select_list_chain`` to shortlist files.
      4. Build a skeleton of each shortlisted file with ``get_skeleton`` and
         ask ``select_with_reason_chain`` for the final answer.

    Args:
        inputs: Dict with the keys ``'name'`` (repository directory) and
            ``'problem_description'``.

    Returns:
        The ``'suspicious_files'`` value of the final model answer, or ``None``
        when the directory chosen by the model is not present in the file
        structure (neither as returned nor prefixed with ``name + "/"``).
    """
    name = inputs['name']
    problem_description = inputs['problem_description']

    file_structure = get_file_structure(name)

    # Pass a real list: a dict_keys view would be rendered into the prompt as
    # "dict_keys([...])".
    directories = list(file_structure)
    with get_openai_callback() as callback:
        filtered = select_directory_chain.invoke({"problem_description": problem_description, "directory_list": directories})
        print(callback)
    filtered = json.loads(filtered.additional_kwargs['function_call']['arguments'])
    selected_directory = filtered['suspicious_directory']

    # The model may answer with or without the repository-name prefix.
    if selected_directory in file_structure:
        suspicious_files = file_structure[selected_directory]
    elif name + "/" + selected_directory in file_structure:
        suspicious_files = file_structure[name + "/" + selected_directory]
    else:
        return None

    print(len(suspicious_files))
    candidate_structure = {}
    for source_path in suspicious_files:
        try:
            candidate_structure[source_path] = extract_function_classes(source_path)
        except Exception as e:
            # Best effort: report and skip instead of failing the whole stage.
            print(f"Skipping {source_path}: {e}")

    with get_openai_callback() as callback:
        filtered_list = select_list_chain.invoke({"problem_description": problem_description, "file_structure": candidate_structure})
        print(callback)
    filtered_list = json.loads(filtered_list.additional_kwargs['function_call']['arguments'])
    filtered_list = filtered_list['suspicious_files']

    filtered_candidate_structure = {}
    for candidate_path in filtered_list:
        try:
            with open(candidate_path, "r", encoding="utf-8") as f:
                raw_code = f.read()
            filtered_candidate_structure[candidate_path] = get_skeleton(raw_code, keep_constant = False, keep_indent=True, total_lines =15, prefix_lines=5,suffix_lines=5)
        except Exception as e:
            print(f"Skipping {candidate_path}: {e}")

    with get_openai_callback() as callback:
        answer = select_with_reason_chain.invoke({"problem_description": problem_description, "file_structure": filtered_candidate_structure})
        print(callback)
    answer = json.loads(answer.additional_kwargs['function_call']['arguments'])
    print("file", answer['suspicious_files'])

    return answer['suspicious_files']

In [15]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.retrievers import BM25Retriever
import json

def embedding_retriever(inputs):
    """Rank suspicious files found by embedding search followed by BM25.

    Loads the FAISS index stored at ``"<name>_faiss_index"``, takes the 5
    documents most similar to the problem description, keeps 3 of them via a
    BM25 retriever built over those 5, and asks ``select_with_reason_chain``
    to judge the resulting ``filename -> page_content`` mapping.

    Args:
        inputs: Dict with the keys ``'problem_description'`` and ``'name'``.

    Returns:
        The ``'suspicious_files'`` value of the model answer (in the recorded
        run, a list of ``{'file': ..., 'reason': ...}`` dicts).
    """
    problem_description = inputs['problem_description']
    name = inputs['name']

    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized data from disk — presumably pickle-based; only load indexes
    # from a trusted source.
    index = FAISS.load_local(
        f"{name}_faiss_index",
        OpenAIEmbeddings(model="text-embedding-3-small"),
        allow_dangerous_deserialization=True,
    )
    nearest_docs = index.similarity_search(problem_description, k=5)
    reranked_docs = BM25Retriever.from_documents(nearest_docs, k=3).invoke(problem_description)

    # Documents sharing a filename overwrite each other; the last one wins.
    structure = {doc.metadata['filename']: doc.page_content for doc in reranked_docs}

    with get_openai_callback() as callback:
        response = select_with_reason_chain.invoke({"problem_description": problem_description, "file_structure": structure})
        print(callback)

    parsed = json.loads(response.additional_kwargs['function_call']['arguments'])
    return parsed['suspicious_files']
    
    

In [27]:
# Smoke-test the embedding retriever on the currently selected problem.
print(embedding_retriever({"problem_description":problem_description, "name": name}))

Tokens Used: 1797
	Prompt Tokens: 1527
		Prompt Tokens Cached: 1408
	Completion Tokens: 270
		Reasoning Tokens: 0
Successful Requests: 1
Total Cost (USD): $0.00028544999999999997
[{'file': 'django/tests/staticfiles_tests/storage.py', 'reason': "The issue relates to handling the 'If-Modified-Since' header, which is likely to involve storage mechanisms for static files. The classes in this file, particularly those extending 'storage.Storage', may have methods that interact with headers or caching mechanisms, making it a candidate for where the empty string handling might be incorrectly implemented."}, {'file': 'django/tests/view_tests/views.py', 'reason': "This file contains various view functions that could potentially handle HTTP requests and responses. Since the issue is about the 'If-Modified-Since' header, which is part of HTTP requests, the view functions may be responsible for processing this header. If any of these views are designed to handle conditional requests, they might be 

In [17]:
def final_reasoning(inputs):
    """Merge the candidates of all localisation branches and ask for a verdict.

    Args:
        inputs: Dict holding the result of each parallel branch under
            ``'get_suspicious_files'``, ``'get_suspicious_files_using_clfn'``,
            ``'get_suspicious_files_using_file_structure'`` and
            ``'embedding_retriever'`` (falsy results are skipped), plus
            ``'problem_description'``.

    Returns:
        The ``content`` of the ``deep_reasoning_chain`` response.
    """
    branch_keys = (
        'get_suspicious_files',
        'get_suspicious_files_using_clfn',
        'get_suspicious_files_using_file_structure',
        'embedding_retriever',
    )

    candidates = []
    for branch in branch_keys:
        # A branch may yield None or an empty list; only real results are merged.
        if inputs[branch]:
            candidates.extend(inputs[branch])

    for candidate in candidates:
        print(candidate)

    problem_description = inputs['problem_description']
    with get_openai_callback() as callback:
        response = deep_reasoning_chain.invoke({"problem_description": problem_description, "candidates": candidates})
        print(callback)

    return response.content

In [18]:
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnableSequence

# Wrap each pipeline stage so the stages can be composed with the `|` operator.
start_run = RunnableLambda(start)
pass_problem_description_run = RunnableLambda(pass_problem_description)
get_suspicious_files_run = RunnableLambda(get_most_suspicious_files)
get_suspicious_files_using_clfn_run = RunnableLambda(get_most_suspicious_files_using_clfn)
get_suspicious_files_using_file_structure_run = RunnableLambda(get_most_suspicious_files_using_file_structure)
embedding_retriever_run = RunnableLambda(embedding_retriever)
final_reasoning_run = RunnableLambda(final_reasoning)
# Misspelt original name, kept as an alias because later cells reference it.
final_reasoning_rub = final_reasoning_run

# Every branch receives the dict returned by `start`; the keys below are the
# ones `final_reasoning` reads from the combined result.
parallel_run = RunnableParallel(
    {
        "get_suspicious_files": get_suspicious_files_run,
        "get_suspicious_files_using_clfn": get_suspicious_files_using_clfn_run,
        "get_suspicious_files_using_file_structure": get_suspicious_files_using_file_structure_run,
        "embedding_retriever": embedding_retriever_run,
        'problem_description': pass_problem_description_run
    }
)

In [43]:
# End-to-end pipeline: extract the initial suspect, run the four localisation
# branches, then let the reasoning model rank the merged candidates.
full_flow = start_run | parallel_run | final_reasoning_rub
# Keys match what `start` reads from its input dict.
args = {"problem_description": problem_description, "name": name, "graph": graph, "commit_id": commit_id}
result = full_flow.invoke(args)

django
DiGraph with 28324 nodes and 41362 edges
Checked out to 4fd3044ca0135da903a70dfb66992293f529ecf1
DiGraph with 27538 nodes and 40138 edges
Tokens Used: 438
	Prompt Tokens: 408
		Prompt Tokens Cached: 0
	Completion Tokens: 30
		Reasoning Tokens: 0
Successful Requests: 1
Total Cost (USD): $0.0013199999999999998
__call__
class ['django/django/contrib/contenttypes/fields.py', 'django/django/contrib/gis/geos/libgeos.py', 'django/django/contrib/gis/geos/prototypes/threadsafe.py', 'django/django/contrib/gis/management/commands/ogrinspect.py', 'django/django/contrib/postgres/fields/array.py', 'django/django/contrib/postgres/fields/hstore.py', 'django/django/contrib/postgres/validators.py', 'django/django/contrib/staticfiles/handlers.py', 'django/django/contrib/syndication/views.py', 'django/django/core/handlers/wsgi.py', 'django/django/core/serializers/__init__.py', 'django/django/core/validators.py', 'django/django/db/models/fields/json.py', 'django/django/db/models/fields/related_descr

In [41]:
# NOTE(review): this cell's execution count (41) is lower than that of the
# pipeline cell above (43), so the output shown may come from an earlier run.
print(result)

**Answer:**

Let's evaluate each candidate file's relevance to the issue and assign confidence scores:

1. **`django/django/contrib/admin/options.py`**  
   **Confidence Score: 90/100**  
   **Reasoning:**  
   This file controls the admin's form and formset handling, including validation and re-rendering after errors. The issue arises because the admin reinitializes inline formsets with **submitted data** as the new "initial" values after a validation error. This causes the hidden input (`initial-relatedmodel_set-0-plop`) to reflect the submitted data (e.g., `"test"`) instead of the model's original default (`list`). On resubmission, the form compares the new submission (empty) against the updated initial (`"test"`), bypassing validation. The user’s fix (`show_hidden_initial=False`) avoids this by forcing the original default to persist. The logic for rebuilding formsets with incorrect initial data likely resides here.

2. **`django/django/forms/models.py`**  
   **Confidence Score: 7

In [None]:
#50 boundfield

In [26]:
# for r in result:
#     print(r)
#i =27

{'file': 'django/django/forms/fields.py', 'reason': 'The URLField class is defined here, which is responsible for handling URL inputs. The issue specifically mentions that the clean method of URLField throws a ValueError instead of a ValidationError. Since the clean method is part of the URLField class, this file is directly related to the problem.'}
{'file': 'django/django/core/validators.py', 'reason': "The URLValidator class is defined in this file, which is likely used by the URLField to validate the URL input. The issue indicates that the validation process is resulting in a ValueError, which suggests that the URLValidator's implementation may be involved in the error handling process. Therefore, this file is also suspicious."}
{'file': 'django/django/forms/fields.py', 'reason': 'The issue involves the `URLField` class, which is defined in this file. The `clean` method of `URLField` is responsible for validating the input URL, and the error message indicates that a `ValueError` is