In [26]:
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import time
import re

In [27]:
# Set your API key
client = OpenAI(api_key= "API KEY")

In [28]:
# Load the full dataset
df = pd.read_csv("/Users/arushijain/PycharmProjects/AIPoweredCodeAssistant/datasets/security/test_preprocessed.csv")

In [29]:
# Store outputs
llm_outputs = []

In [30]:
# Prompt template
def create_prompt(code_snippet):
    return f"""
You are a security auditor. Analyze the following C code and determine whether it contains any security vulnerabilities (such as buffer overflow, memory leaks, use-after-free, or unsafe pointer manipulation). If yes, explain what they are. If not, state it's safe.

Code:
{code_snippet}

Answer with one of the following:
- 'VULNERABLE' and the reason
- 'SAFE' and why
"""

In [31]:
# LLM query function
def get_llm_response(prompt):
    try:
        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=[{"role": "user", "content": prompt}],
            temperature = 0
        )
        completion = response.choices[0].message.content
        return completion
    except Exception as e:
        return f"ERROR: {e}"

In [32]:
# Iterate through the entire dataset
for _, row in tqdm(df.iterrows(), total=len(df)):
    code = row['func']
    target = row['target']
    prompt = create_prompt(code)
    response = get_llm_response(prompt)

    # Add short delay to avoid rate limiting
    time.sleep(1.5)

    # Parse prediction from response using regex
    match = re.search(r"\b(SAFE|VULNERABLE)\b", response.upper())
    pred = match.group(1) if match else "UNKNOWN"

    llm_outputs.append({
        "id": row['id'],
        "true_label": "VULNERABLE" if target else "SAFE",
        "llm_pred": pred,
        "response": response
    })

100%|██████████| 2732/2732 [6:55:22<00:00,  9.12s/it]      


In [34]:
# Save results
results_df = pd.DataFrame(llm_outputs)
results_df.to_csv("llm_vulnerability_results_full.csv", index=False)

In [35]:
# Print accuracy
from sklearn.metrics import confusion_matrix, classification_report

valid_results = results_df[results_df['llm_pred'].isin(["SAFE", "VULNERABLE"])]
accuracy = (valid_results['true_label'] == valid_results['llm_pred']).mean()
print(f"\nLLM Classification Accuracy on Full Test Set: {accuracy:.2f}")
print(confusion_matrix(results_df['true_label'], results_df['llm_pred']))
print(classification_report(results_df['true_label'], results_df['llm_pred']))


LLM Classification Accuracy on Full Test Set: 0.54
[[750   0 727]
 [  0   0   0]
 [534   1 720]]
              precision    recall  f1-score   support

        SAFE       0.58      0.51      0.54      1477
     UNKNOWN       0.00      0.00      0.00         0
  VULNERABLE       0.50      0.57      0.53      1255

    accuracy                           0.54      2732
   macro avg       0.36      0.36      0.36      2732
weighted avg       0.54      0.54      0.54      2732



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
