In [1]:
def asbool(x):
    if isinstance(x, bool):
        return x
    elif isinstance(x, str):
        return x.lower() in ["true", "1", "yes"]
    elif isinstance(x, int):
        return x == 1
    else:
        return False

asbool(True), asbool("True"), asbool("1"), asbool(1), asbool("yes"), asbool(False), asbool("false"), asbool("0"), asbool(0), asbool("no")

(True, True, True, True, True, False, False, False, False, False)

## Agent

In [2]:
#get all jsonl files in data/output

import os
import json
import pandas as pd

jsonl_files = [f for f in os.listdir("../data/output") if f.endswith(".jsonl")]

results = []

for file in jsonl_files:
    print(file)
    split = file.split("_")
    subset = "_".join(split[0:2])
    model = "_".join(split[2:]).replace(".jsonl", "")
    print(f"Model: {model}, Subset: {subset}")
    data = []
    with open(os.path.join("../data/output", file), 'r') as f:
        for line in f:
            data.append(json.loads(line))
    df = pd.DataFrame(data)
    cve_df = df.apply(lambda x: pd.Series({
        "cve_id": x["input"]["cve"],
        "answer": asbool(x["output"]["answer"]),
        "label": asbool(x["input"]["label"]),
        #"rank": x["metadata"]["rank"],
        "confidence": x["output"]["confidence"],
        #"datetime": extract_commit_date_utc(x["input"]["diff"]),
    }), axis=1)
    cve_df["correct"] = cve_df["answer"].eq(cve_df["label"])
    
    
    correct_cves = cve_df.groupby("cve_id")["correct"].all().sum()
    total_cves = len(cve_df["cve_id"].unique())

    print(f"{correct_cves} CVEs are correctly classified out of {total_cves} total.") #1288 total CVEs
    print(f"Accuracy: {correct_cves / total_cves:.2%}")
    
    from sklearn.metrics import classification_report
    y_true = cve_df["label"]
    y_pred = cve_df["answer"]

    # Classification report (precision, recall, f1)
    report = classification_report(y_true, y_pred, output_dict=True)
    
    from sklearn.metrics import confusion_matrix

    # Confusion matrix
    cf_matrix = confusion_matrix(y_true, y_pred)
    results.append({
        "subset": subset,
        "model": model,
        "correct_cves": correct_cves,
        "total_cves": total_cves,
        "cve_accuracy": correct_cves / total_cves,
        "accuracy": report['accuracy'],
        "precision": report['True']['precision'],
        "recall": report['True']['recall'],
        "f1": report['True']['f1-score'],
    })



PatchFinder_top10_Llama-3.3-70B-Instruct.jsonl
Model: Llama-3.3-70B-Instruct, Subset: PatchFinder_top10
358 CVEs are correctly classified out of 1250 total.
Accuracy: 28.64%
<function random_array at 0x169364680>
<function random at 0x169364720>
<function rand at 0x1693649a0>
<ufunc 'betainc'>
<ufunc 'betaincc'>
<ufunc 'chdtr'>
<ufunc 'chdtrc'>
<ufunc 'erf'>
<ufunc 'erfc'>
<ufunc 'entr'>
<ufunc 'expit'>
<ufunc 'i0'>
<ufunc 'i0e'>
<ufunc 'i1'>
<ufunc 'i1e'>
<ufunc 'log_ndtr'>
<ufunc 'logit'>
<ufunc 'gammaln'>
<ufunc 'gammainc'>
<ufunc 'gammaincc'>
<ufunc 'ndtr'>
<ufunc 'ndtri'>
<ufunc 'rel_entr'>
<ufunc 'stdtr'>
<ufunc 'stdtrit'>
<ufunc 'xlogy'>
<function logsumexp at 0x168fb1ee0>
<function softmax at 0x168fb2200>
<function log_softmax at 0x168fb22a0>
<function bandwidth at 0x168fb31a0>
<function issymmetric at 0x168fb32e0>
<function ishermitian at 0x168fb3240>
<function eig at 0x168fb3e20>
<function eigh at 0x168fb3ec0>
<function eig_banded at 0x147be8180>
<function eigvals at 0x147be8

In [3]:
import pandas as pd

results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,subset,model,correct_cves,total_cves,cve_accuracy,accuracy,precision,recall,f1
0,PatchFinder_top10,Llama-3.3-70B-Instruct,358,1250,0.2864,0.805962,0.301165,0.941176,0.456315
1,random_10,Llama-3.3-70B-Instruct,682,1252,0.544728,0.934498,0.740161,0.856281,0.793998
2,PatchFinder_top10,Qwen3-235B-A22B-Instruct-2507,553,1250,0.4424,0.86665,0.390458,0.951628,0.553721
3,random_10,Qwen3-235B-A22B-Instruct-2507,852,1252,0.680511,0.952958,0.8186,0.875888,0.846276


In [4]:
results_df.to_csv("../data/results_agent.csv", index=False)

## LLM4VFD

In [5]:
jsonl_files

['PatchFinder_top10_Llama-3.3-70B-Instruct.jsonl',
 'random_10_Llama-3.3-70B-Instruct.jsonl',
 'PatchFinder_top10_Qwen3-235B-A22B-Instruct-2507.jsonl',
 'random_10_Qwen3-235B-A22B-Instruct-2507.jsonl']

In [6]:
#get all jsonl files in data/output

import os
import json
import pandas as pd

jsonl_files = [f for f in os.listdir("../data/baselines/LLM4VFD/output") if f.endswith(".jsonl")]

results = []

for file in jsonl_files:
    print(file)
    split = file.split("_")
    subset = "_".join(split[0:2])
    model = "_".join(split[2:]).replace(".jsonl", "")
    print(f"Model: {model}, Subset: {subset}")
    data = []
    with open(os.path.join("../data/baselines/LLM4VFD/output", file), 'r') as f:
        for line in f:
            data.append(json.loads(line))
    df = pd.DataFrame(data)
    cve_df = df.apply(lambda x: pd.Series({
        "cve_id": x["input"]["cve"],
        "answer": asbool(x["output"]["vulnerability_fix"]),
        "label": asbool(x["input"]["label"]),
    }), axis=1)
    cve_df["correct"] = cve_df["answer"].eq(cve_df["label"])
    
    
    correct_cves = cve_df.groupby("cve_id")["correct"].all().sum()
    total_cves = len(cve_df["cve_id"].unique())

    print(f"{correct_cves} CVEs are correctly classified out of {total_cves} total.") #1288 total CVEs
    print(f"Accuracy: {correct_cves / total_cves:.2%}")
    
    from sklearn.metrics import classification_report
    y_true = cve_df["label"]
    y_pred = cve_df["answer"]

    # Classification report (precision, recall, f1)
    report = classification_report(y_true, y_pred, output_dict=True)
    
    from sklearn.metrics import confusion_matrix

    # Confusion matrix
    cf_matrix = confusion_matrix(y_true, y_pred)
    results.append({
        "subset": subset,
        "model": model,
        "correct_cves": correct_cves,
        "total_cves": total_cves,
        "cve_accuracy": correct_cves / total_cves,
        "accuracy": report['accuracy'],
        "precision": report['True']['precision'],
        "recall": report['True']['recall'],
        "f1": report['True']['f1-score'],
    })



PatchFinder_top10_Llama-3.3-70B-Instruct.jsonl
Model: Llama-3.3-70B-Instruct, Subset: PatchFinder_top10
464 CVEs are correctly classified out of 1249 total.
Accuracy: 37.15%
random_10_Llama-3.3-70B-Instruct.jsonl
Model: Llama-3.3-70B-Instruct, Subset: random_10
668 CVEs are correctly classified out of 1247 total.
Accuracy: 53.57%
PatchFinder_top10_Qwen3-235B-A22B-Instruct-2507.jsonl
Model: Qwen3-235B-A22B-Instruct-2507, Subset: PatchFinder_top10
465 CVEs are correctly classified out of 1249 total.
Accuracy: 37.23%
random_10_Qwen3-235B-A22B-Instruct-2507.jsonl
Model: Qwen3-235B-A22B-Instruct-2507, Subset: random_10
703 CVEs are correctly classified out of 1248 total.
Accuracy: 56.33%


In [7]:
import pandas as pd

results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,subset,model,correct_cves,total_cves,cve_accuracy,accuracy,precision,recall,f1
0,PatchFinder_top10,Llama-3.3-70B-Instruct,464,1249,0.371497,0.801443,0.288396,0.871628,0.433395
1,random_10,Llama-3.3-70B-Instruct,668,1247,0.535686,0.926778,0.762993,0.729656,0.745952
2,PatchFinder_top10,Qwen3-235B-A22B-Instruct-2507,465,1249,0.372298,0.798122,0.29083,0.926716,0.442721
3,random_10,Qwen3-235B-A22B-Instruct-2507,703,1248,0.563301,0.927257,0.735146,0.79391,0.763399


In [8]:
results_df.to_csv("../data/results_LLM4VFD.csv", index=False)

## PatchFinder

In [9]:

import os
import json
import pandas as pd


csv_files = [f for f in os.listdir("../data/baselines/PatchFinder") if f.startswith("predictions_candidates_") and f.endswith(".csv")]

results = []

for file in csv_files:
    split = file.split("_")
    model = "codereviewer"
    subset = "_".join(split[-2:]).replace(".csv", "")
    print(f"Model: {model}, Subset: {subset}")
    
    df = pd.read_csv(os.path.join("../data/baselines/PatchFinder", file))
    # calc correct if pred >= 0.5 and label is True
    cve_df = df.apply(lambda x: pd.Series({
        "cve_id": x["cve"],
        "answer": asbool(x["prediction"] >= 0.5),
        "label": asbool(x["label"]),
    }), axis=1)
    cve_df["correct"] = cve_df["answer"].eq(cve_df["label"])
    
    
    correct_cves = cve_df.groupby("cve_id")["correct"].all().sum()
    total_cves = len(cve_df["cve_id"].unique())

    print(f"{correct_cves} CVEs are correctly classified out of {total_cves} total.") #1288 total CVEs
    print(f"Accuracy: {correct_cves / total_cves:.2%}")
    
    from sklearn.metrics import classification_report
    y_true = cve_df["label"]
    y_pred = cve_df["answer"]

    # Classification report (precision, recall, f1)
    report = classification_report(y_true, y_pred, output_dict=True)
    
    from sklearn.metrics import confusion_matrix

    # Confusion matrix
    cf_matrix = confusion_matrix(y_true, y_pred)
    results.append({
        "subset": subset,
        "model": model,
        "correct_cves": correct_cves,
        "total_cves": total_cves,
        "cve_accuracy": correct_cves / total_cves,
        "accuracy": report['accuracy'],
        "precision": report['True']['precision'],
        "recall": report['True']['recall'],
        "f1": report['True']['f1-score'],
    })

Model: codereviewer, Subset: PatchFinder_top10
655 CVEs are correctly classified out of 1252 total.
Accuracy: 52.32%
Model: codereviewer, Subset: random_10
244 CVEs are correctly classified out of 1252 total.
Accuracy: 19.49%


In [10]:
import pandas as pd

results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,subset,model,correct_cves,total_cves,cve_accuracy,accuracy,precision,recall,f1
0,PatchFinder_top10,codereviewer,655,1252,0.523163,0.896959,0.398195,0.367253,0.382098
1,random_10,codereviewer,244,1252,0.194888,0.879785,0.859719,0.231018,0.364177


In [11]:
results_df.to_csv("../data/results_PatchFinder.csv", index=False)