In [45]:
import os
import openai
from typing import List
from pydantic import BaseModel, Field
import pickle
import networkx as nx
import ast
import re
import json
import chromadb
import cProfile
import pstats

from datasets import load_dataset
from pprint import pprint

from langchain_groq import ChatGroq
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.utils.function_calling import convert_pydantic_to_openai_function
from langchain.agents import tool
from langchain_openai import OpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback
from langsmith import traceable

from commit import update
from utils.utils import serialize_dict_to_json, deserialize_json_to_dict
from utils.chunk import SimpleFixedLengthChunker
from utils.compress import get_skeleton

from dotenv import load_dotenv, find_dotenv
# Populate os.environ from the .env file located by find_dotenv(); this must
# run before the API key lookup below.
_ = load_dotenv(find_dotenv()) # read local .env file

# Direct indexing: raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

# Shared chunker instance; embedding_retriever calls its dechunk_docs().
sflc = SimpleFixedLengthChunker()

def neighbors_by_relation(G, node, relation_type):
    """Return the nodes linked to ``node`` by edges tagged with ``relation_type``.

    Args:
        G: graph object exposing ``edges(node, data=True)`` that yields
            ``(u, v, attrs)`` triples.
        node: the node whose incident edges are inspected.
        relation_type: value compared against each edge's ``'relation'``
            attribute.

    Returns:
        list: for every matching edge, the endpoint that is not ``node``,
        in the order the graph yields the edges.
    """
    return [
        # Pick the far endpoint whichever side of the edge ``node`` sits on.
        target if source == node else source
        for source, target, attrs in G.edges(node, data=True)
        if attrs.get('relation') == relation_type
    ]

def load_graph(pickle_path):
    """Unpickle and return the object stored at ``pickle_path``.

    The notebook uses this to restore a previously saved graph.

    Args:
        pickle_path: path of the pickle file to read.

    Returns:
        Whatever object the file contains.
    """
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)

# Load the "train" split of the benchmark; later cells index it by position.
dataset = load_dataset("lahirum/SWE_Experimental", split="train")
# Optional: restrict the run to a subset of rows while iterating.
# filter = [0, 1, 2, 3, 4,5, 6, 7, 8, 9]
# dataset = dataset.select(filter)


In [46]:
def extract_function_classes(file_path):
    """List the class and function names defined anywhere in a Python file.

    Args:
        file_path: path of the Python source file to parse.

    Returns:
        list[str]: names of every class, function and async function found
        by ``ast.walk`` (nested definitions included), excluding
        ``__init__``. An empty list is returned when the file cannot be read
        or parsed, so callers always receive the same type.
    """
    try:
        # Explicit encoding, matching how the other cells read source files.
        with open(file_path, "r", encoding="utf-8") as file:
            parsed_data = ast.parse(file.read())
    except Exception as e:  # Best effort: unreadable/unparsable files are reported and skipped
        print(f"Error in file {file_path}: {e}")
        return []

    names = []
    for node in ast.walk(parsed_data):
        if isinstance(node, ast.ClassDef):
            names.append(node.name)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            # Constructors carry no distinguishing name, so they are left out.
            if node.name != "__init__":
                names.append(node.name)
    return names

import os

def get_file_structure(root_dir: str) -> dict:
    """Map each non-test directory under ``root_dir`` to its Python files.

    Args:
        root_dir: directory to walk.

    Returns:
        dict: ``{dirpath: [dirpath + "/" + filename, ...]}`` for every walked
        directory that directly contains at least one ``.py`` file. Any
        directory below ``root_dir`` whose name contains ``"test"`` is
        skipped together with everything beneath it.
    """
    file_structure = {}

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Prune in place so os.walk never descends into test directories.
        # Only names *below* root_dir are checked: a root whose own path
        # contains "test" (e.g. "pytest-dev") must not empty the result.
        dirnames[:] = [d for d in dirnames if "test" not in d]

        python_files = [dirpath + "/" + file for file in filenames if file.endswith('.py')]
        if python_files:
            file_structure[dirpath] = python_files
    return file_structure


In [47]:
import prompts
import schema
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_deepseek import ChatDeepSeek

# Default model: bound to the selection/filter chains and the description
# generator defined in the next cell.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
    #max_retries=2,
)

# Reasoning model: used only by deep_reasoning_chain.
llm_deepseek = ChatDeepSeek(
    model="deepseek-reasoner",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

# Larger OpenAI model: used only by extract_chain.
llm_large = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    max_retries=2,
)


In [48]:
parser = JsonOutputFunctionsParser()


def _bind_function_schema(model, output_schema):
    """Bind ``output_schema`` to ``model`` as its single OpenAI function.

    ``function_call="auto"`` leaves the model free to answer in plain text,
    so consumers must be prepared for a reply without a function call.
    """
    return model.bind(
        functions=[convert_pydantic_to_openai_function(output_schema)],
        function_call="auto",
    )


# Initial component extraction runs on llm_large; every other structured step
# below runs on llm.
model_extract = _bind_function_schema(llm_large, schema.SuspiciousComponentOutput)
extract_chain = prompts.prompt_extract | model_extract

# Pick one file out of a list of candidate paths.
model_select = _bind_function_schema(llm, schema.FileSuspicionOutput)
select_chain = prompts.file_path_filter_prompt | model_select

# Shortlist files from a plain list of files.
model_filter_list = _bind_function_schema(llm, schema.SuspiciousFilesOutputList)
filter_list_chain = prompts.get_suspicious_file_list_from_list_of_files_prompt | model_filter_list

# Shortlist files from a {file: [class/function names]} structure.
model_select_list = _bind_function_schema(llm, schema.SuspiciousFilesOutputList)
select_list_chain = prompts.suspicious_files_filter_list_usingclfn_prompt | model_select_list

# Select files, with reasoning, from a {file: skeleton} structure.
model_select_with_reason = _bind_function_schema(llm, schema.SuspiciousFileReasoningOutput)
select_with_reason_chain = prompts.suspicious_files_with_reason_prompt | model_select_with_reason

# Pick one directory out of the repository's directory list.
model_select_directory = _bind_function_schema(llm, schema.SuspiciousDirectoryOutput)
select_directory_chain = prompts.suspicious_directory_prompt | model_select_directory

# Free-text chains (no function binding): callers read ``.content``.
generate_multiple_descriptions = prompts.prompt_embedding_retriver | llm

deep_reasoning_chain = prompts.deep_reasoning_prompt | llm_deepseek

In [49]:
# Row of `dataset` to analyse in the cells below.
i = 1
commit_id = dataset[i]['base_commit']
# The text before "__" in the instance id is used as the project name; it
# also selects the pickled graph file loaded below.
name = dataset[i]['instance_id'].split("__")[0]
problem_description = dataset[i]['problem_statement']
graph = load_graph(f"graph_{name}.pkl")

In [50]:
# Run the extraction chain once on the selected issue and print the callback
# (token usage and cost, as shown in the recorded output).
with get_openai_callback() as callback:
    result = extract_chain.invoke({"problem_description": problem_description})
    # print(result)
    print(callback)

Tokens Used: 560
	Prompt Tokens: 528
		Prompt Tokens Cached: 0
	Completion Tokens: 32
		Reasoning Tokens: 0
Successful Requests: 1
Total Cost (USD): $0.00164


In [51]:
# Show the raw model message; the structured answer is the JSON string under
# additional_kwargs['function_call']['arguments'].
print(result)

content='' additional_kwargs={'function_call': {'arguments': '{"file":"django/db/migrations/operations/fields.py","class_function_name":"AddField"}', 'name': 'SuspiciousComponentOutput'}, 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 528, 'total_tokens': 560, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_a288987b44', 'id': 'chatcmpl-BdzEqVnJLhQn3YwqKrwBRqMXuGn7R', 'service_tier': 'default', 'finish_reason': 'function_call', 'logprobs': None} id='run--afee1f77-a916-4002-a1a3-bd1a411df094-0' usage_metadata={'input_tokens': 528, 'output_tokens': 32, 'total_tokens': 560, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}


In [52]:
import json
from commit import update

@traceable
def start(inputs):
    """Entry step of the pipeline: extract the initially suspected component.

    Calls ``update(name, commit_id)`` from the ``commit`` module first
    (presumably to sync the local checkout to that commit - confirm against
    ``commit.update``), then runs ``extract_chain`` on the issue text.

    Args:
        inputs: mapping with ``'problem_description'``, ``'name'``,
            ``'graph'`` and ``'commit_id'``.

    Returns:
        dict: the decoded function-call arguments of the model reply,
        extended with ``'name'``, ``'problem_description'`` and ``'graph'``
        so that downstream steps receive everything in one mapping.

    Raises:
        KeyError: if an input key is missing or the model reply carries no
            function call.
    """
    problem_description = inputs['problem_description']
    name = inputs['name']
    graph = inputs['graph']
    commit_id = inputs['commit_id']

    update(name, commit_id)

    with get_openai_callback() as callback:
        message = extract_chain.invoke({"problem_description": problem_description})
        print(callback)

    # The structured answer arrives as a JSON string in the function call.
    component = json.loads(message.additional_kwargs['function_call']['arguments'])
    component['name'] = name
    component['problem_description'] = problem_description
    component['graph'] = graph
    return component

In [53]:
# print(start({"problem_description": problem_description, "name": name, "graph": graph, "commit_id": commit_id}))

In [54]:
from utils.compress import get_skeleton
import json

@traceable
def get_most_suspicious_files(inputs):
    """Shortlist suspicious files starting from the initially suspected file.

    Steps:
        1. Reduce ``inputs['file']`` to a bare module name and collect the
           graph neighbours of ``"module_<name>"`` linked by a ``'path'``
           relation, dropping any that contain ``"test"``.
        2. Let ``select_chain`` choose one of those paths.
        3. For the chosen file and its ``'imports'`` neighbours, list the
           class/function names and let ``select_list_chain`` shortlist.
        4. Build a skeleton of each shortlisted file and let
           ``select_with_reason_chain`` make the final selection.

    Args:
        inputs: mapping with ``'problem_description'``, ``'graph'`` and
            ``'file'``.

    Returns:
        The ``'suspicious_files'`` entry of the final model answer.

    Raises:
        KeyError: if an input key is missing or a model reply carries no
            function call.
    """
    problem_description = inputs['problem_description']
    graph = inputs['graph']
    file = inputs['file']
    # "pkg/sub/mod.py" -> "mod": keep the last path segment, then the text
    # before its first dot.
    if "/" in file:
        file = file.split("/")[-1]
    if "." in file:
        file = file.split(".")[0]

    suspicious_files = [
        neighbor
        for neighbor in neighbors_by_relation(graph, "module_" + file, 'path')
        if "test" not in neighbor
    ]
    with get_openai_callback() as callback:
        filtered = select_chain.invoke({"problem_description": problem_description, "file_list": suspicious_files})
        print(callback)
    filtered = json.loads(filtered.additional_kwargs['function_call']['arguments'])
    selected_file = filtered['suspicious_file']

    candidate_structure = {}
    for neighbor in neighbors_by_relation(graph, selected_file, 'imports') + [selected_file]:
        try:
            candidate_structure[neighbor] = extract_function_classes(neighbor)
        except Exception as e:
            # Best effort: report the file that was dropped instead of hiding it.
            print(f"Skipping {neighbor}: {e}")

    with get_openai_callback() as callback:
        filtered_list = select_list_chain.invoke({"problem_description": problem_description, "file_structure": candidate_structure})
        print(callback)
    filtered_list = json.loads(filtered_list.additional_kwargs['function_call']['arguments'])
    filtered_list = filtered_list['suspicious_files']

    filtered_candidate_structure = {}
    for path in filtered_list:
        try:
            with open(path, "r", encoding="utf-8") as f:
                raw_code = f.read()
            filtered_candidate_structure[path] = get_skeleton(raw_code, keep_constant=False, keep_indent=True, total_lines=15, prefix_lines=5, suffix_lines=5)
        except Exception as e:
            # The model may return paths that do not exist locally; skip those
            # but make the skip visible.
            print(f"Skipping {path}: {e}")

    with get_openai_callback() as callback:
        answer = select_with_reason_chain.invoke({"problem_description": problem_description, "file_structure": filtered_candidate_structure})
        print(callback)
    answer = json.loads(answer.additional_kwargs['function_call']['arguments'])

    return answer['suspicious_files']

In [55]:
@traceable
def pass_problem_description(inputs):
    """Return ``inputs['problem_description']`` unchanged.

    Used as a branch of the parallel step so the issue text is still
    available to ``final_reasoning``.
    """
    return inputs['problem_description']

In [56]:
# Scratch check of the graph lookup used by get_most_suspicious_files; the
# recorded output below is an empty list for this module.
neighbors_by_relation(graph, "module_serializer",  'path')

[]

In [57]:
@traceable
def get_most_suspicious_files_using_clfn(inputs):
    """Shortlist suspicious files starting from the suspected class/function name.

    Looks up the graph neighbours of ``"class_<name>"`` linked by a
    ``'class_path'`` relation (dropping any that contain ``"test"``), builds
    a skeleton of each file and lets ``select_with_reason_chain`` choose.

    Args:
        inputs: mapping with ``'problem_description'``, ``'graph'`` and
            ``'class_function_name'``.

    Returns:
        The ``'suspicious_files'`` entry of the model answer.

    Raises:
        KeyError: if an input key is missing or the model reply carries no
            function call.
    """
    problem_description = inputs['problem_description']
    graph = inputs['graph']
    class_function_name = inputs['class_function_name']
    print(class_function_name)

    # "Class.method" -> "Class": only the text before the first dot is used
    # for the graph lookup.
    if "." in class_function_name:
        class_function_name = class_function_name.split(".")[0]

    suspicious_files = [
        neighbor
        for neighbor in neighbors_by_relation(graph, "class_" + class_function_name, 'class_path')
        if "test" not in neighbor
    ]
    print("class", suspicious_files)

    filtered_candidate_structure = {}
    for path in suspicious_files:
        try:
            with open(path, "r", encoding="utf-8") as f:
                raw_code = f.read()
            filtered_candidate_structure[path] = get_skeleton(raw_code, keep_constant=False, keep_indent=True, total_lines=15, prefix_lines=5, suffix_lines=5)
        except Exception as e:
            # Best effort: report the file that was dropped instead of hiding it.
            print(f"Skipping {path}: {e}")

    with get_openai_callback() as callback:
        answer = select_with_reason_chain.invoke({"problem_description": problem_description, "file_structure": filtered_candidate_structure})
        print(callback)
    answer = json.loads(answer.additional_kwargs['function_call']['arguments'])

    return answer['suspicious_files']

In [58]:
@traceable
def get_most_suspicious_files_using_file_structure(inputs):
    """Shortlist suspicious files by drilling down through the directory layout.

    Steps:
        1. Build the ``{directory: [python files]}`` map of the repository
           rooted at ``inputs['name']``.
        2. Let ``select_directory_chain`` choose one directory.
        3. List the class/function names of each file in that directory and
           let ``select_list_chain`` shortlist.
        4. Build a skeleton of each shortlisted file and let
           ``select_with_reason_chain`` make the final selection.

    Args:
        inputs: mapping with ``'name'`` and ``'problem_description'``.

    Returns:
        The ``'suspicious_files'`` entry of the final model answer, or
        ``None`` when the chosen directory is not part of the file structure
        (neither as returned nor prefixed with ``name + "/"``).

    Raises:
        KeyError: if an input key is missing or a model reply carries no
            function call.
    """
    name = inputs['name']
    problem_description = inputs['problem_description']

    file_structure = get_file_structure(name)

    # Materialise the keys: a dict_keys view would be interpolated into the
    # prompt as "dict_keys([...])" rather than as a plain list.
    directories = list(file_structure)
    with get_openai_callback() as callback:
        filtered = select_directory_chain.invoke({"problem_description": problem_description, "directory_list": directories})
        print(callback)
    filtered = json.loads(filtered.additional_kwargs['function_call']['arguments'])
    selected_directory = filtered['suspicious_directory']

    # The model may answer with or without the repository-name prefix.
    if selected_directory in file_structure:
        suspicious_files = file_structure[selected_directory]
    elif name + "/" + selected_directory in file_structure:
        suspicious_files = file_structure[name + "/" + selected_directory]
    else:
        # Unknown directory: final_reasoning treats a falsy result as
        # "no candidates from this retriever".
        return None

    print(len(suspicious_files))
    candidate_structure = {}
    for path in suspicious_files:
        try:
            candidate_structure[path] = extract_function_classes(path)
        except Exception as e:
            # Best effort: report the file that was dropped instead of hiding it.
            print(f"Skipping {path}: {e}")

    with get_openai_callback() as callback:
        filtered_list = select_list_chain.invoke({"problem_description": problem_description, "file_structure": candidate_structure})
        print(callback)
    filtered_list = json.loads(filtered_list.additional_kwargs['function_call']['arguments'])
    filtered_list = filtered_list['suspicious_files']

    filtered_candidate_structure = {}
    for path in filtered_list:
        try:
            with open(path, "r", encoding="utf-8") as f:
                raw_code = f.read()
            filtered_candidate_structure[path] = get_skeleton(raw_code, keep_constant=False, keep_indent=True, total_lines=15, prefix_lines=5, suffix_lines=5)
        except Exception as e:
            # The model may return paths that do not exist locally; skip those
            # but make the skip visible.
            print(f"Skipping {path}: {e}")

    with get_openai_callback() as callback:
        answer = select_with_reason_chain.invoke({"problem_description": problem_description, "file_structure": filtered_candidate_structure})
        print(callback)
    answer = json.loads(answer.additional_kwargs['function_call']['arguments'])
    print("file", answer['suspicious_files'])

    return answer['suspicious_files']

In [59]:
@traceable
def embedding_retriever(inputs):
    """Shortlist suspicious files via embedding similarity search.

    The issue text is expanded with model-generated descriptions, the
    combined text is used to query the ``"<name>_chroma_index"`` collection,
    each hit's file is rebuilt from its chunks and skeletonised, and
    ``select_with_reason_chain`` makes the final selection.

    Args:
        inputs: mapping with ``'problem_description'`` and ``'name'``.

    Returns:
        The ``'suspicious_files'`` entry of the model answer.

    Raises:
        KeyError: if an input key is missing, a hit's filename is absent
            from the file-id map, or the model reply carries no function
            call.
    """
    original_description = inputs['problem_description']
    name = inputs['name']

    with get_openai_callback() as callback:
        multiple_descriptions = generate_multiple_descriptions.invoke({"problem_description": original_description})
        print(callback)

    # The augmented text is used both as the search query and as the problem
    # description given to the final selection chain.
    query = \
        f"""## **Original GitHub issue description**:\n\n{original_description}\n\n\n## **Generated descriptions**:\n\n{multiple_descriptions.content}"""

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    chroma_client = chromadb.PersistentClient("chroma_db")
    collection_name = f"{name}_chroma_index"
    # Kept as an existence check: the result is unused, but get_collection
    # raises when the collection is missing.
    chroma_client.get_collection(name=collection_name)

    vector_store = Chroma(
        client=chroma_client,
        collection_name=collection_name,
        embedding_function=embeddings,
    )

    results = vector_store.similarity_search(query, k=10)

    # NOTE(review): the id map is hard-coded to django while the collection
    # follows `name` - confirm whether this should be derived from `name`.
    file_ids_by_name = deserialize_json_to_dict("django_file_ids.json")
    structure = {}
    for result in results:
        filename = result.metadata["filename"]
        if filename in structure:
            # Several hits can be chunks of the same file; rebuild it once.
            continue
        file_ids = file_ids_by_name[filename].split(":")
        chunk_docs_of_file = vector_store.get_by_ids(file_ids)
        structure[filename] = get_skeleton(sflc.dechunk_docs(chunk_docs_of_file))

    with get_openai_callback() as callback:
        answer = select_with_reason_chain.invoke({"problem_description": query, "file_structure": structure})
        print(callback)
    answer = json.loads(answer.additional_kwargs['function_call']['arguments'])

    return answer['suspicious_files']
    
    
    

In [60]:
# print(embedding_retriever({"problem_description":problem_description, "name": name}))

In [61]:
@traceable
def final_reasoning(inputs):
    """Merge every retriever's candidates and run the deep-reasoning chain.

    Args:
        inputs: mapping produced by the parallel step; holds one entry per
            retriever plus ``'problem_description'``. Retriever entries that
            are falsy (``None`` or empty) contribute nothing.

    Returns:
        The ``content`` of the deep-reasoning model reply.
    """
    retriever_keys = (
        'get_suspicious_files',
        'get_suspicious_files_using_clfn',
        'get_suspicious_files_using_file_structure',
        'embedding_retriever',
    )

    # Concatenate in a fixed retriever order; duplicates are kept as-is.
    candidates = []
    for key in retriever_keys:
        found = inputs[key]
        if found:
            candidates.extend(found)

    for candidate in candidates:
        print(candidate)

    problem_description = inputs['problem_description']
    with get_openai_callback() as callback:
        response = deep_reasoning_chain.invoke({"problem_description": problem_description, "candidates": candidates})
        print(callback)

    return response.content

In [62]:
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnableSequence

# Wrap each pipeline step so it can be composed with the `|` operator.
start_run = RunnableLambda(start)
pass_problem_description_run = RunnableLambda(pass_problem_description)
get_suspicious_files_run = RunnableLambda(get_most_suspicious_files)
get_suspicious_files_using_clfn_run = RunnableLambda(get_most_suspicious_files_using_clfn)
get_suspicious_files_using_file_structure_run = RunnableLambda(get_most_suspicious_files_using_file_structure)
embedding_retriever_run = RunnableLambda(embedding_retriever)
final_reasoning_run = RunnableLambda(final_reasoning)

# Fan-out over the four retrievers. The keys here are exactly the ones
# final_reasoning reads from its input, so they must stay in sync with it.
parallel_run = RunnableParallel(
    {
        "get_suspicious_files": get_suspicious_files_run,
        "get_suspicious_files_using_clfn": get_suspicious_files_using_clfn_run,
        "get_suspicious_files_using_file_structure": get_suspicious_files_using_file_structure_run,
        "embedding_retriever": embedding_retriever_run,
        # Carries the issue text through to final_reasoning.
        'problem_description': pass_problem_description_run
    }
)

In [63]:
# Pipeline: start -> parallel retrievers -> final reasoning.
full_flow = start_run | parallel_run | final_reasoning_run
# `start` reads exactly these four keys.
args = {"problem_description": problem_description, "name": name, "graph": graph, "commit_id": commit_id}
result = full_flow.invoke(args)

django
DiGraph with 26295 nodes and 33405 edges
Node 'django/django/apps/__init__.py' does not exist.
Node 'django/django/conf/locale/__init__.py' does not exist.
Node 'django/django/conf/locale/ar/formats.py' does not exist.
Node 'django/django/conf/locale/ar_DZ/formats.py' does not exist.
Node 'django/django/conf/locale/az/formats.py' does not exist.
Node 'django/django/conf/locale/bg/formats.py' does not exist.
Node 'django/django/conf/locale/bn/formats.py' does not exist.
Node 'django/django/conf/locale/bs/formats.py' does not exist.
Node 'django/django/conf/locale/ca/formats.py' does not exist.
Node 'django/django/contrib/sitemaps/management/__init__.py' does not exist.
Node 'django/django/conf/locale/cs/formats.py' does not exist.
Node 'django/django/conf/locale/cy/formats.py' does not exist.
Node 'django/django/conf/locale/da/formats.py' does not exist.
Node 'django/django/conf/locale/de/formats.py' does not exist.
Node 'django/django/conf/locale/de_CH/formats.py' does not exist

In [64]:
# `result` is the text returned by final_reasoning (the reasoning model's content).
print(result)

### Analysis of Candidate Files

Based on the issue description, the core problem is that Django's migration system generates an incorrect reference (`appname.models.Capability.default`) for a nested class method used as a `Field.default`. The correct reference should include the parent class (`appname.models.Profile.Capability.default`). Below is the evaluation of each candidate file:

```json
[
  {
    "file": "django/django/db/migrations/autodetector.py",
    "confidence": 90,
    "reasoning": "This file handles change detection and migration operation generation. The issue stems from improper path resolution for nested classes during default value serialization. Since the autodetector is responsible for introspecting model structures (including nested classes) and generating operation arguments, it's the most likely location where the parent class context is being dropped during reference construction. The high confidence comes from the fact that nested class handling must be imple

In [65]:
#50 boundfield

In [66]:
# for r in result:
#     print(r)
#i =27