# In [None]:
# rq1 & rq2 query
from openai import OpenAI
import json
import os

# API configuration
# Both values below are literal placeholders; replace them with the real API
# key and endpoint URL before running. llm_output() hands them to the OpenAI
# client as `api_key` and `base_url`.
basic_api_key = "api_key"
basic_api_base = "api_url"

def llm_output(system_message,human_message):
    """Send one system/user message pair to the chat model and return the reply.

    A new client is built from ``basic_api_key`` / ``basic_api_base`` on every
    call. The request asks for a JSON-object response, is not streamed, and
    the content of the first returned choice is handed back unchanged.
    """
    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": human_message},
    ]
    completion = OpenAI(api_key=basic_api_key, base_url=basic_api_base).chat.completions.create(
        model="deepseek-chat", # model
        messages=chat_messages,
        response_format={'type': 'json_object'},
        stream=False
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# rq1 prompt
def query_rq1(code,des,cwe):
    """Query the model for code blocks matching a description and a CWE type.

    The source code, the vulnerability description and the CWE are filled
    into the user prompt, which is sent together with the RQ1 system prompt
    through ``llm_output``. The reply of ``llm_output`` is returned as is.
    """
    user_template = '''
<Source code>: %s
<Description>: %s
<CWE>: %s
'''
    system_prompt = '''
### Code Security Analysis Engine Protocol:
You are a high-precision code vulnerability analysis engine, strictly follow the following requirements:

### Task:
You will be provided with: a complete source code, a description of the vulnerability, and the corresponding CWE(Common Weakness Enumeration) type.
Analyze the source code and identify ALL code blocks matching the description and CWE type. Output them in JSON format.

### EXAMPLE JSON OUTPUT:
[{"Start_line": <int>,
"End_line": <int>,
"Code": "<exact_lines>",
"Function": "<enclosing_function, or "Not in function">",
"Reason": "<technical_explanation>",
"Probability": <0-100, confidence_score>},
{...}]

### Output Specifications:
1. Output STRICT JSON array ONLY, parsable by Python json.loads()
2. Return [] if NO vulnerabilities found
3. MUST correlate findings with BOTH textual description AND CWE technical specifications
4. Extract code blocks with SURGICAL PRECISION - strictly necessary lines only
'''
    return llm_output(system_prompt, user_template % (code, des, cwe))

# rq2 prompt
# 2.1 affected component
def query_afcomp(code,concept_content):
    """Query the model for vulnerable code blocks matching an affected component.

    The source code and the affected-component text are filled into the user
    prompt, which is sent together with the system prompt through
    ``llm_output``. The reply of ``llm_output`` is returned as is.
    """
    user_template = '''
<Source code>: %s
<Affected component>: %s
'''
    system_prompt = '''
### Code Security Analysis Engine Protocol:
You are a high-precision code vulnerability analysis engine, strictly follow the following requirements:

### Task:
You will be provided with: a complete source code, a component(specific module/feature) affected by a vulnerability.
Analyze the source code and identify ALL vulnerable code blocks matching the affected component. Output them in JSON format.

### EXAMPLE JSON OUTPUT:
[{"Start_line": <int>,
"End_line": <int>,
"Code": "<exact_lines>",
"Function": "<enclosing_function, or "Not in function">",
"Reason": "<technical_explanation>",
"Probability": <0-100, confidence_score>},
{...}]

### Output Specifications:
1. Output STRICT JSON array ONLY, parsable by Python json.loads()
2. Return [] if NO vulnerabilities found
3. Extract code blocks with SURGICAL PRECISION - strictly necessary lines only
'''
    return llm_output(system_prompt, user_template % (code, concept_content))

# 2.2 attack vector
def query_atvec(code,concept_content):
    """Query the model for code blocks exploitable through an attack vector.

    The source code and the attack-vector text are filled into the user
    prompt, which is sent together with the system prompt through
    ``llm_output``. The reply of ``llm_output`` is returned as is.
    """
    user_template = '''
<Source code>: %s
<Attack vector>: %s
'''
    system_prompt = '''
### Code Security Analysis Engine Protocol:
You are a high-precision code vulnerability analysis engine, strictly follow the following requirements:

### Task:
You will be provided with: a complete source code, an attack vector of a vulnerability.
Analyze the source code and identify ALL vulnerable code blocks that may be exploited through the attack vector. Output them in JSON format.

### EXAMPLE JSON OUTPUT:
[{"Start_line": <int>,
"End_line": <int>,
"Code": "<exact_lines>",
"Function": "<enclosing_function, or "Not in function">",
"Reason": "<technical_explanation>",
"Probability": <0-100, confidence_score>},
{...}]

### Output Specifications:
1. Output STRICT JSON array ONLY, parsable by Python json.loads()
2. Return [] if NO vulnerabilities found
3. Extract code blocks with SURGICAL PRECISION - strictly necessary lines only
'''
    return llm_output(system_prompt, user_template % (code, concept_content))

# 2.3 impact
def query_impact(code,concept_content):
    """Query the model for vulnerable code blocks that may cause a given impact.

    The source code and the impact text are filled into the user prompt,
    which is sent together with the system prompt through ``llm_output``.
    The reply of ``llm_output`` is returned as is.
    """
    user_template = '''
<Source code>: %s
<Impact>: %s
'''
    system_prompt = '''
### Code Security Analysis Engine Protocol:
You are a high-precision code vulnerability analysis engine, strictly follow the following requirements:

### Task:
You will be provided with: a complete source code, an impact of a vulnerability.
Analyze the source code and identify ALL vulnerable code blocks that may cause the impact. Output them in JSON format.

### EXAMPLE JSON OUTPUT:
[{"Start_line": <int>,
"End_line": <int>,
"Code": "<exact_lines>",
"Function": "<enclosing_function, or "Not in function">",
"Reason": "<technical_explanation>",
"Probability": <0-100, confidence_score>},
{...}]

### Output Specifications:
1. Output STRICT JSON array ONLY, parsable by Python json.loads()
2. Return [] if NO vulnerabilities found
3. Extract code blocks with SURGICAL PRECISION - strictly necessary lines only
'''
    return llm_output(system_prompt, user_template % (code, concept_content))

# 2.4 root cause
def query_root(code,concept_content):
    """Query the model for vulnerable code blocks related to a root cause.

    The source code and the root-cause text are filled into the user prompt,
    which is sent together with the system prompt through ``llm_output``.
    The reply of ``llm_output`` is returned as is.
    """
    user_template = '''
<Source code>: %s
<Root cause>: %s
'''
    system_prompt = '''
### Code Security Analysis Engine Protocol:
You are a high-precision code vulnerability analysis engine, strictly follow the following requirements:

### Task:
You will be provided with: a complete source code, the root cause of a vulnerability.
Analyze the source code and identify ALL vulnerable code blocks that may be attacked by the root cause. Output them in JSON format.

### EXAMPLE JSON OUTPUT:
[{"Start_line": <int>,
"End_line": <int>,
"Code": "<exact_lines>",
"Function": "<enclosing_function, or "Not in function">",
"Reason": "<technical_explanation>",
"Probability": <0-100, confidence_score>},
{...}]

### Output Specifications:
1. Output STRICT JSON array ONLY, parsable by Python json.loads()
2. Return [] if NO vulnerabilities found
3. Extract code blocks with SURGICAL PRECISION - strictly necessary lines only
'''
    return llm_output(system_prompt, user_template % (code, concept_content))

# 2.5 CWE type
def query_cwe(code,concept_content):
    """Query the model for vulnerable code blocks matching a CWE type.

    The source code and the CWE text are filled into the user prompt, which
    is sent together with the system prompt through ``llm_output``. The
    reply of ``llm_output`` is returned as is.
    """
    user_template = '''
<Source code>: %s
<CWE>: %s
'''
    system_prompt = '''
### Code Security Analysis Engine Protocol:
You are a high-precision code vulnerability analysis engine, strictly follow the following requirements:

### Task:
You will be provided with: a complete source code, a CWE (Common Weakness Enumeration) type of a vulnerability.
Analyze the source code and identify ALL vulnerable code blocks matching the CWE type. Output them in JSON format.

### EXAMPLE JSON OUTPUT:
[{"Start_line": <int>,
"End_line": <int>,
"Code": "<exact_lines>",
"Function": "<enclosing_function, or "Not in function">",
"Reason": "<technical_explanation>",
"Probability": <0-100, confidence_score>},
{...}]

### Output Specifications:
1. Output STRICT JSON array ONLY, parsable by Python json.loads()
2. Return [] if NO vulnerabilities found
3. Extract code blocks with SURGICAL PRECISION - strictly necessary lines only
'''
    return llm_output(system_prompt, user_template % (code, concept_content))

# Integrate queries to achieve maximum cache hits

# Switching between using java, python, or other languages
language = "python"

input_file = f"./data/{language}/{language}_vul_2.jsonl"

# Half-open window [start_line, end_line) of JSONL lines to process; the input
# file holds one CVE per line.
start_line = 0  # Starting from which line, usually set to 0
end_line = 223  # Ending to which line, total: python-223, java-199

# RQ2 concept tasks, in query order:
# (log label, output sub-directory, key in the extracted description concepts,
#  query function).
rq2_concept_tasks = [
    ("rq2_afcomp", "afcomp_output", "Affected Component", query_afcomp),
    ("rq2_atvec", "atvec_output", "Attack Vector", query_atvec),
    ("rq2_impact", "impact_output", "Impact", query_impact),
    ("rq2_root", "root_output", "Root Cause", query_root),
]


def _original_filename(path):
    """Return *path* without the filter-directory prefix and the ``.txt`` suffix.

    ``str.strip(chars)`` removes any of the given characters from both ends,
    so it cannot be used to drop a prefix or suffix: it may also eat leading
    or trailing characters that belong to the real file name. The prefix and
    suffix are therefore removed explicitly, and only when present.
    """
    prefix = f'./data/{language}/{language}_filter/'
    suffix = '.txt'
    name = path
    if name.startswith(prefix):
        name = name[len(prefix):]
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    return name.strip()


def _new_task(label, out_path, query, extra_args, name_key, header, is_null=False):
    """Build the per-CVE state of one query task.

    Args:
        label: Task name used in log messages (e.g. ``"rq1"``).
        out_path: JSON file the task result is written to.
        query: Query function, called as ``query(code, *extra_args)``.
        extra_args: Arguments passed to *query* after the source code.
        name_key: Key under which the source file name is stored per result.
        header: Leading fields of the output JSON object.
        is_null: True when the task has no usable input for this CVE.

    Returns:
        A dict holding the arguments plus ``skip`` (reason the task is not
        run, or None), ``failed`` (set once a query/parse error occurs) and
        ``results`` (per-file outputs collected so far).
    """
    # An existing output takes precedence over a null input, as in the logs.
    if os.path.exists(out_path):
        skip = "already query"
    elif is_null:
        skip = "is null"
    else:
        skip = None
    return {
        "label": label,
        "out_path": out_path,
        "query": query,
        "extra_args": extra_args,
        "name_key": name_key,
        "header": header,
        "skip": skip,
        "failed": False,
        "results": [],
    }


with open(input_file, 'rb') as f1:
    for num, line in enumerate(f1):
        if num < start_line:  # skip below the starting point
            continue
        if num >= end_line:
            break

        data_info = json.loads(line)
        filenum = data_info["num"]
        cve = data_info["cve"]
        filename_list = data_info["filename_list"]
        des = data_info["des"]
        cwe_list = data_info["cwe_list"]
        cwe_name_lst = data_info["cwe_name_lst"]
        cwe = f"{cwe_list[0]},{cwe_name_lst[0]}"

        des_filename = f"./data/{language}/{language}_des_extract_fix/{filenum}.txt"
        with open(des_filename, 'r', encoding='utf-8') as f2:
            des_concepts = json.load(f2)

        tag = f"{num},{filenum}"
        print(f"Start query {tag}")

        # Task order is the per-file query order: rq1, the four RQ2 concept
        # tasks, then rq2 cwe.
        tasks = [_new_task(
            "rq1",
            f"./data/{language}/RQ1/{language}_output/{filenum}.json",
            query_rq1, (des, cwe), "file_name",
            {"cve": cve, "cwe": cwe},
        )]
        for label, out_dir, concept, query in rq2_concept_tasks:
            # A missing concept key is treated like an explicit null.
            concept_content = des_concepts.get(concept)
            tasks.append(_new_task(
                label,
                f"./data/{language}/RQ2/{out_dir}/{filenum}.json",
                query, (concept_content,), "filename",
                {"cve": cve, "concept": concept, "concept_content": concept_content},
                is_null=concept_content is None,
            ))
        tasks.append(_new_task(
            "rq2_cwe",
            f"./data/{language}/RQ2/cwe_output/{filenum}.json",
            query_cwe, (cwe,), "filename",
            {"cve": cve, "cwe": cwe},
            is_null=cwe_list in (["NVD-CWE-Other"], ["NVD-CWE-noinfo"]),
        ))

        # Skip decisions do not depend on the source file, so log them once.
        for task in tasks:
            if task["skip"] is not None:
                print(f"{tag} {task['label']} {task['skip']}")

        pending = [task for task in tasks if task["skip"] is None]
        if pending:  # nothing to query: do not even read the source files
            for i, path in enumerate(filename_list):
                filename_original = _original_filename(path)
                with open(path, 'r', encoding='utf-8') as f3:
                    code = f3.read()

                for task in pending:
                    # A failed task is never saved, so further queries for it
                    # would only waste API calls.
                    if task["failed"]:
                        continue
                    try:
                        answer = task["query"](code, *task["extra_args"])
                        candidate = json.loads(answer)
                    except Exception as err:
                        # Covers API errors as well as unparsable replies;
                        # KeyboardInterrupt/SystemExit still propagate.
                        print(f"{tag}_{i} {task['label']} can not parse: {err!r}")
                        task["failed"] = True
                    else:
                        task["results"].append(
                            {task["name_key"]: filename_original, "output_content": candidate}
                        )

        # Write one JSON file per task that ran without any failure.
        for task in tasks:
            if task["skip"] is not None or task["failed"]:
                print(f"{tag} {task['label']} is not save")
                continue
            payload = dict(task["header"])
            payload["file_content"] = task["results"]
            out_dir = os.path.dirname(task["out_path"])
            os.makedirs(out_dir, exist_ok=True)
            with open(task["out_path"], "w") as fout:
                json.dump(payload, fout)
print("end")