In [1]:
!pip install requests pandas scikit-learn tiktoken python-dotenv




In [2]:
import os
import pandas as pd
import time
import tiktoken
from sklearn.metrics import classification_report
from huggingface_hub import InferenceClient

# Read the Hugging Face token from the environment. A literal token must never
# be written into the notebook: it leaks on commit and silently overrides the
# value exported by the user.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError(
        "HF_TOKEN is not set. Export it (e.g. `export HF_TOKEN=hf_...`) "
        "before starting the kernel."
    )

# Model requested through the Cerebras provider.
MODEL_ID = "meta-llama/Llama-3.3-70B-Instruct"

# Build the client once, bound to MODEL_ID so calls that omit `model=`
# still target the same model.
client = InferenceClient(
    provider="cerebras",
    model=MODEL_ID,
    api_key=HF_TOKEN,
)

print("InferenceClient ready for model:", MODEL_ID)


InferenceClient ready for model: meta-llama/Llama-3.3-70B-Instruct


In [3]:
# ─── Cell 3 ───
# Load the preprocessed test CSV.
# The location can be overridden with the TEST_CSV_PATH environment variable so
# the notebook is not tied to a single machine; when the variable is unset the
# original path is used.
TEST_CSV_PATH = os.environ.get(
    "TEST_CSV_PATH",
    "/Users/arushijain/PycharmProjects/AIPoweredCodeAssistant/datasets/security/test_preprocessed.csv",
)
test_df = pd.read_csv(TEST_CSV_PATH)

# Fail with a clear message instead of an opaque KeyError on the sample lookup below.
if test_df.empty:
    raise ValueError(f"{TEST_CSV_PATH} contains no rows; nothing to evaluate.")

# Quick sanity checks
print(f"✅ Loaded test_df: {test_df.shape[0]} rows × {test_df.shape[1]} columns")
print("Columns:", test_df.columns.tolist())

# Show the first func snippet (truncated) to verify it loaded correctly
sample_code = test_df.loc[0, 'func']
print("\nSample ‘func’ snippet (first 200 chars):\n", sample_code[:200].replace('\n', ' '))


✅ Loaded test_df: 2732 rows × 17 columns
Columns: ['id', 'func', 'target', 'project', 'commit_id', 'tokens', 'input_ids', 'attention_mask', 'func_length', 'num_loops', 'has_eval', 'has_system', 'num_if', 'num_return', 'uses_pointer', 'uses_buffer', 'is_short_func']

Sample ‘func’ snippet (first 200 chars):
 int ff_get_wav_header(AVFormatContext *s, AVIOContext *pb,                        AVCodecContext *codec, int size, int big_endian)  {      int id;      uint64_t bitrate;        if (size < 14) {       


In [4]:
import json

# Debug walkthrough: send the first two test rows to the model one at a time
# and print every intermediate artefact (prompt, raw response, extracted text).
for idx in range(2):
    code = test_df.loc[idx, 'func']
    print(f"\n=== DEBUG SAMPLE {idx} ===\n")

    # Single-line preview of the snippet (newlines flattened to spaces)
    preview = code[:200].replace('\n', ' ')
    print(f"Code snippet (first 200 chars): {preview} ...\n")

    # Chat payload: a system instruction followed by the user question
    messages = [
        dict(role="system", content="You are a cybersecurity expert. Only reply with Yes or No."),
        dict(role="user", content=f"Is the following C code vulnerable? Reply with Yes or No only:\n{code}"),
    ]

    print("Messages payload:")
    print(f"{json.dumps(messages, indent=2)} \n")

    # Query the model through the shared InferenceClient
    resp = client.chat.completions.create(
        model=MODEL_ID, messages=messages, max_tokens=10, temperature=0.0
    )

    # Dump the full response object for inspection
    print(f"Raw response object: {resp} \n")

    # Pull out only the text of the first choice
    generated = resp.choices[0].message.content
    print(f"Generated text: {generated!r} \n")



=== DEBUG SAMPLE 0 ===

Code snippet (first 200 chars): int ff_get_wav_header(AVFormatContext *s, AVIOContext *pb,                        AVCodecContext *codec, int size, int big_endian)  {      int id;      uint64_t bitrate;        if (size < 14) {        ...

Messages payload:
[
  {
    "role": "system",
    "content": "You are a cybersecurity expert. Only reply with Yes or No."
  },
  {
    "role": "user",
  }
] 

Raw response object: ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='Yes', tool_call_id=None, tool_calls=None), logprobs=None)], created=1753301473, id='chatcmpl-b0aece59-05f6-4b68-8949-00beb508ee43', model='llama-3.3-70b', system_fingerprint='fp_d9197bd9bfb96c41ef64', usage=ChatCompletionOutputUsage(completion_tokens=2, prompt_tokens=1140, total_tokens=1142, prompt_tokens_details={'cached_tokens': 0}), object='chat.completion', time_info={'queue_time': 0.00105336, 'pr

In [5]:
def analyze_code_vulnerability_llama(code_snippet):
    """Ask the LLM whether a C snippet is vulnerable and map its reply to a label.

    The request is sent through the module-level ``client`` with a short
    completion budget and temperature 0.

    Args:
        code_snippet: Source code text embedded into the user prompt.

    Returns:
        1 if the first word of the reply is "yes", 0 if it is "no", and -1 when
        the call fails or the reply is empty or not a clear yes/no answer.
    """
    import re  # local import keeps this cell self-contained

    messages = [
        {"role": "system", "content": "You are a cybersecurity expert. Only reply with Yes or No."},
        {"role": "user",   "content": f"Is the following C code vulnerable? Reply with Yes or No only:\n{code_snippet}"}
    ]
    try:
        resp = client.chat.completions.create(
            messages=messages,
            max_tokens=10,
            temperature=0.0,
        )
        content = resp.choices[0].message.content
    except Exception as e:
        # Best-effort: one failed request must not abort a whole evaluation run.
        print("Error in Llama call:", e)
        return -1

    # The API may return no text at all; treat that as "no prediction".
    if not isinstance(content, str):
        return -1

    # Match the first *whole* word only. Leading punctuation/markup such as
    # "**Yes**" is skipped, and words that merely begin with "no" ("not",
    # "none", "nothing") are no longer misread as a "No" verdict.
    verdict = re.match(r"[\W_]*(yes|no)(?![a-z0-9])", content.strip().lower())
    if verdict is None:
        return -1
    return 1 if verdict.group(1) == "yes" else 0


In [6]:
# Number of leading test rows sent to the model in this trial run
max_entries = 5
test_subset = test_df.iloc[:max_entries].copy()

# Collect one prediction per row, in row order
results = []
for idx, code in test_subset["func"].items():
    print(f"\n[{idx}] Analyzing snippet…")
    pred = analyze_code_vulnerability_llama(code)
    print(f"[{idx}] Prediction: {pred}")
    results.append(pred)
    # Pause between requests to avoid provider throttling
    time.sleep(1)

# Attach the predictions as a new column, aligned with the subset's rows
test_subset["llm_pred"] = results



[0] Analyzing snippet…
[0] Prediction: 1

[1] Analyzing snippet…
[1] Prediction: 1

[2] Analyzing snippet…
[2] Prediction: 1

[3] Analyzing snippet…
[3] Prediction: 1

[4] Analyzing snippet…
[4] Prediction: 1


In [7]:
# Filter out any error predictions (negative values mark calls that produced no label)
eval_df = test_subset[test_subset["llm_pred"] >= 0]

if eval_df.empty:
    print("No successful predictions to evaluate.")
else:
    print("LLM Test Set Performance:\n")
    # `labels` is passed explicitly so the two target_names always line up with
    # two classes; without it the report raises ValueError when the evaluated
    # rows happen to contain a single class. `zero_division=0` keeps the 0.00
    # scores for never-predicted classes but without the undefined-metric warnings.
    # NOTE(review): assumes `target` is encoded 0 = safe, 1 = vulnerable, like llm_pred — confirm.
    print(classification_report(
        eval_df["target"],
        eval_df["llm_pred"],
        labels=[0, 1],
        target_names=["Safe", "Vulnerable"],
        zero_division=0,
    ))
    print("\nPrediction counts:\n", eval_df["llm_pred"].value_counts())


LLM Test Set Performance:

              precision    recall  f1-score   support

        Safe       0.00      0.00      0.00         3
  Vulnerable       0.40      1.00      0.57         2

    accuracy                           0.40         5
   macro avg       0.20      0.50      0.29         5
weighted avg       0.16      0.40      0.23         5


Prediction counts:
 llm_pred
1    5
Name: count, dtype: int64


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Keep only the rows the model labelled as vulnerable (llm_pred == 1)
vuln_df = test_subset.loc[test_subset["llm_pred"] == 1]

if not vuln_df.empty:
    # Use the first flagged snippet as the example
    example_code = vuln_df["func"].iloc[0]
    messages = [
        dict(role="system", content="You are an ethical hacker. Provide only the exploit steps."),
        dict(role="user", content=f"Provide a simulated exploit for the following vulnerable C code:\n{example_code}"),
    ]
    # Longer completion budget and a small temperature for the free-text answer
    resp = client.chat.completions.create(
        model=MODEL_ID, messages=messages, max_tokens=300, temperature=0.3
    )
    exploit = resp.choices[0].message.content.strip()
    print("Exploit simulation:\n", exploit)
else:
    print("No vulnerable code predicted. Skipping exploit generation.")


Exploit simulation:
 ### Exploit Steps

The given C code appears to be a part of a media processing library, specifically designed to parse WAV file headers. To simulate an exploit, we'll focus on potential vulnerabilities such as buffer overflows, integer overflows, or out-of-bounds reads.

1. **Integer Overflow Vulnerability**:
   - **Location**: The calculation of `bitrate` (`bitrate = avio_rl32(pb) * 8LL;`).
   - **Exploit Step**: Craft a WAV file where the 32-bit value read by `avio_rl32(pb)` is close to the maximum value that can be represented by a 32-bit integer (e.g., `0xFFFFFFFF`). When multiplied by 8, this could overflow and wrap around, potentially causing unexpected behavior or allowing for further exploitation if the resulting `bitrate` value is used in a way that can be controlled by an attacker.

2. **Buffer Overflow Vulnerability**:
   - **Location**: The handling of `cbSize` in the WAVEFORMATEX parsing logic (`int cbSize = avio_rl16(pb);`).
   - **Exploit Step**: Cre