In [1]:
import os
import openai
from typing import List
from pydantic import BaseModel, Field
import pickle
import networkx as nx
import ast
import re
import json
import chromadb
import cProfile
import pstats

from datasets import load_dataset
from pprint import pprint

from langchain_groq import ChatGroq
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.utils.function_calling import convert_pydantic_to_openai_function
from langchain.agents import tool
from langchain_openai import OpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback

from commit import update
from utils.utils import serialize_dict_to_json, deserialize_json_to_dict
from utils.chunk import SimpleFixedLengthChunker
from utils.compress import get_skeleton

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Indexing os.environ directly makes a missing key fail here with KeyError
# rather than later inside the first API call.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Shared chunker instance; its dechunk_docs() is used when rebuilding file
# text from stored chunk documents.
sflc = SimpleFixedLengthChunker()

# Evaluation instances (train split).
dataset = load_dataset("lahirum/SWE_Experimental", split="train")
# Uncomment to restrict the run to a handful of rows:
# filter = [0, 1, 2, 3, 4,5, 6, 7, 8, 9]
# dataset = dataset.select(filter)

In [2]:
def load_graph(pickle_path):
    """Deserialize and return the object stored in the pickle file at `pickle_path`.

    The file is expected to contain a NetworkX DiGraph, but whatever object was
    pickled is returned unchanged.

    Warning: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(pickle_path, mode="rb") as pickle_file:
        return pickle.load(pickle_file)

In [3]:
import prompts
import schema
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
# from langchain_deepseek import ChatDeepSeek

# Small model used for the file/directory selection chains and for generating
# alternative issue descriptions. max_retries is left at the library default
# (pass max_retries=2 to match llm_large).
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

# Disabled DeepSeek reasoning model; re-enabling it also needs
# `from langchain_deepseek import ChatDeepSeek`.
# llm_deepseek = ChatDeepSeek(
#     model="deepseek-reasoner",
#     temperature=0,
#     max_tokens=None,
#     timeout=None,
#     max_retries=2,
# )

# Larger model at temperature 0, used for the component-extraction chain.
llm_large = ChatOpenAI(model="gpt-4o", temperature=0, max_retries=2)


In [4]:
def _bind_function_schema(model, schema_cls):
    """Return `model` bound to the OpenAI function derived from `schema_cls`.

    function_call="auto" leaves the model free to answer without calling the
    function, so consumers must be prepared for a reply that has no
    `function_call` entry.
    """
    # NOTE(review): convert_pydantic_to_openai_function appears to be deprecated
    # in recent langchain-core in favour of convert_to_openai_function — confirm
    # before upgrading.
    return model.bind(
        functions=[convert_pydantic_to_openai_function(schema_cls)],
        function_call="auto",
    )


# Parser for function-call replies; the chains below do not pipe into it.
parser = JsonOutputFunctionsParser()

# Component extraction runs on the larger model.
model_extract = _bind_function_schema(llm_large, schema.SuspiciousComponentOutput)
extract_chain = prompts.prompt_extract | model_extract

# All remaining function-calling chains run on the small model.
model_select = _bind_function_schema(llm, schema.FileSuspicionOutput)
select_chain = prompts.file_path_filter_prompt | model_select

# model_filter_list and model_select_list bind the same output schema and
# differ only in the prompt they are paired with.
model_filter_list = _bind_function_schema(llm, schema.SuspiciousFilesOutputList)
filter_list_chain = prompts.get_suspicious_file_list_from_list_of_files_prompt | model_filter_list

model_select_list = _bind_function_schema(llm, schema.SuspiciousFilesOutputList)
select_list_chain = prompts.suspicious_files_filter_list_usingclfn_prompt | model_select_list

model_select_with_reason = _bind_function_schema(llm, schema.SuspiciousFileReasoningOutput)
select_with_reason_chain = prompts.suspicious_files_with_reason_prompt | model_select_with_reason

model_select_directory = _bind_function_schema(llm, schema.SuspiciousDirectoryOutput)
select_directory_chain = prompts.suspicious_directory_prompt | model_select_directory

# Plain-text chain (no function binding): its reply is read through `.content`.
generate_multiple_descriptions = prompts.prompt_embedding_retriver | llm

# deep_reasoning_chain = prompts.deep_reasoning_prompt | llm_deepseek

  functions=[convert_pydantic_to_openai_function(schema.SuspiciousComponentOutput)],


In [5]:
# def start(inputs):
#     problem_description = inputs['problem_description']
#     name = inputs['name']
#     graph = inputs['graph'] 
#     commit_id = inputs['commit_id']
#     graph = inputs['graph'] 

#     update(name, commit_id)

#     with get_openai_callback() as callback:
#         result = extract_chain.invoke({"problem_description": problem_description})
#         print(callback)

#     result = result.additional_kwargs['function_call']['arguments']
#     result = json.loads(result)
#     result['name'] = name
#     result['problem_description'] = problem_description
#     result['graph'] = graph
    
#     return result

In [6]:
def embedding_retriever(inputs):
    """Pick suspicious files for an issue via embedding retrieval plus an LLM pass.

    Steps:
      1. Open the persistent Chroma store in ``chroma_db`` and the collection
         ``{name}_chroma_index``.
      2. Retrieve the 10 chunks most similar to the problem description.
      3. For every distinct file among those chunks, fetch all of the file's
         chunks by id, reassemble them with ``sflc.dechunk_docs`` and pass the
         result through ``get_skeleton``.
      4. Send the per-file skeletons to ``select_with_reason_chain``.

    Args:
        inputs: dict with the keys
            ``problem_description``: query text for the similarity search and
                the LLM prompt.
            ``name``: repository name used to build the collection name.
            ``file_ids_path`` (optional): JSON file mapping a filename to a
                ":"-joined string of chunk ids. Defaults to
                ``"django_file_ids.json"``.

    Returns:
        The ``suspicious_files`` value from the arguments of the model's
        function call.

    Raises:
        KeyError: if the model replied without a function call, or if a
            retrieved filename is missing from the file-id mapping.
    """
    problem_description = inputs['problem_description']
    name = inputs['name']
    # NOTE(review): the default id map is Django-specific even though `name`
    # varies per instance — confirm whether per-repository id files exist and
    # pass `file_ids_path` accordingly.
    file_ids_path = inputs.get("file_ids_path", "django_file_ids.json")
    collection_name = f"{name}_chroma_index"

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    chroma_client = chromadb.PersistentClient("chroma_db")
    # Result intentionally unused: the call is kept so a missing collection is
    # reported here (presumably get_collection raises when it does not exist —
    # confirm against the installed chromadb version).
    chroma_client.get_collection(name=collection_name)

    vector_store = Chroma(
        client=chroma_client,
        collection_name=collection_name,
        embedding_function=embeddings,
    )

    results = vector_store.similarity_search(problem_description, k=10)
    # results = vector_store.max_marginal_relevance_search(problem_description, k=10, lambda_mult=0.5)

    file_ids_by_name = deserialize_json_to_dict(file_ids_path)
    structure = {}
    for result in results:
        filename = result.metadata["filename"]
        # Several retrieved chunks can belong to the same file; build each
        # file's skeleton only once (first-seen order is preserved).
        if filename in structure:
            continue
        chunk_ids = file_ids_by_name[filename].split(":")
        chunk_docs_of_file = vector_store.get_by_ids(chunk_ids)
        structure[filename] = get_skeleton(sflc.dechunk_docs(chunk_docs_of_file))

    with get_openai_callback() as callback:
        answer = select_with_reason_chain.invoke(
            {"problem_description": problem_description, "file_structure": structure}
        )
        # Prints the callback's usage summary for this call.
        print(callback)

    # The chain is bound with function_call="auto", so the model may answer in
    # plain text; report that explicitly instead of a bare KeyError.
    function_call = answer.additional_kwargs.get("function_call")
    if function_call is None:
        raise KeyError(
            "function_call: the model replied without calling the function; "
            f"content was: {answer.content!r}"
        )

    return json.loads(function_call["arguments"])["suspicious_files"]
    

In [None]:
# One entry per evaluated instance: {instance_id: {rank: {...}}}.
retrivals = []

# Rows 2..29 are evaluated; the stop index is clamped so that a dataset with
# fewer than 30 rows ends the loop instead of raising IndexError.
for i in range(2, min(30, len(dataset))):
    # Read the row once instead of indexing the dataset for every field.
    row = dataset[i]
    instance_id = row['instance_id']

    ips = {}

    ips["commit_id"] = row['base_commit']
    ips["name"] = instance_id.split("__")[0]
    ips["problem_description"] = row['problem_statement']
    # NOTE(review): embedding_retriever does not read "graph"; the load is kept
    # because other consumers of `ips` may rely on it — confirm.
    ips["graph"] = load_graph(f"graph_{ips['name']}.pkl")

    # Ask the small model for alternative descriptions of the issue and append
    # them to the original text so the similarity search has more to match on.
    answer = generate_multiple_descriptions.invoke({"problem_description": ips["problem_description"]})

    ips["problem_description"] = (
        f"## **Original GitHub issue description**:\n\n{ips['problem_description']}"
        f"\n\n\n## **Generated descriptions**:\n\n{answer.content}"
    )

    update(ips['name'], ips['commit_id'])
    ans = embedding_retriever(ips)

    # Pair every returned file (keyed by its position in the answer) with the
    # ground-truth file of the instance.
    temp_dict = {instance_id: {}}
    for level, a in enumerate(ans):
        temp_dict[instance_id][level] = {
            "identified_file": a['file'],
            "erroneous_file": row["erroneous_file"].strip(),
        }
    retrivals.append(temp_dict)
    # Written after every instance, so an interrupted run keeps the results
    # gathered so far.
    serialize_dict_to_json(retrivals, "embedding_based_retrievals.json")

django
DiGraph with 23714 nodes and 30175 edges
Node 'django/django/apps/__init__.py' does not exist.
Node 'django/django/conf/locale/__init__.py' does not exist.
Node 'django/django/conf/locale/ar/formats.py' does not exist.
Node 'django/django/conf/locale/ar_DZ/formats.py' does not exist.
Node 'django/django/conf/locale/az/formats.py' does not exist.
Node 'django/django/conf/locale/bg/formats.py' does not exist.
Node 'django/django/conf/locale/bn/formats.py' does not exist.
Node 'django/django/conf/locale/bs/formats.py' does not exist.
Node 'django/django/conf/locale/ca/formats.py' does not exist.
Node 'django/django/conf/locale/ckb/formats.py' does not exist.
Node 'django/django/conf/locale/cs/formats.py' does not exist.
Node 'django/django/conf/locale/cy/formats.py' does not exist.
Node 'django/django/conf/locale/da/formats.py' does not exist.
Node 'django/django/conf/locale/de/formats.py' does not exist.
Node 'django/django/conf/locale/de_CH/formats.py' does not exist.
Node 'djang


KeyboardInterrupt

