<a href="https://colab.research.google.com/github/areesha-13/RE-Aided-VA/blob/main/RE_Aided_VA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Environment Setup & Drive Mounting

In [None]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is attached; the dataset
# written and read later in this notebook lives under this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Dataset Loading

In [None]:
from datasets import load_dataset

# Hugging Face Hub identifier of the dataset used throughout the notebook.
JULIET_DATASET_ID = "LorenzH/juliet_test_suite_c_1_3"

# `ds` is a DatasetDict; later cells read its "train" split and the
# 'filename', 'good' and 'bad' columns.
ds = load_dataset(JULIET_DATASET_ID)
print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

jts_c_1_3_train.csv:   0%|          | 0.00/237M [00:00<?, ?B/s]

jts_c_1_3_test.csv:   0%|          | 0.00/59.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80706 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20177 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'filename', 'class', 'good', 'bad'],
        num_rows: 80706
    })
    test: Dataset({
        features: ['index', 'filename', 'class', 'good', 'bad'],
        num_rows: 20177
    })
})


Vulnerability Scope Definition

In [None]:
# Filename fragments that select the vulnerability classes under study.
# The trailing underscore is part of the pattern, so only the exact CWE id
# followed by "_" matches when the fragment is searched for in a filename.
TARGET_CWE_PATTERNS = [
    # Buffer Overflow
    "CWE121_",
    # Integer Overflow
    "CWE190_",
    # Use-After-Free
    "CWE416_",
    # Null Pointer Dereference
    "CWE476_",
    # Format String
    "CWE134_",
]

Dataset Filtering

In [None]:
# Keep only the training samples whose filename contains one of the
# in-scope CWE patterns; everything else is discarded.
filtered = []
for sample in ds["train"]:
    sample_name = sample["filename"]
    if any(pattern in sample_name for pattern in TARGET_CWE_PATTERNS):
        filtered.append(sample)
len(filtered)

16279

Balanced Sampling

In [None]:
import random

SAMPLES_PER_CWE = 40  # fast + defensible
SAMPLING_SEED = 42    # fixed so Restart & Run All draws the same samples

# A dedicated, seeded generator makes the draw reproducible without touching
# the global `random` state that other cells may rely on.
sampling_rng = random.Random(SAMPLING_SEED)

# Draw the same number of samples for every target CWE so that no single
# vulnerability class dominates the evaluation set.
balanced = []
for p in TARGET_CWE_PATTERNS:
    samples = [s for s in filtered if p in s["filename"]]
    if len(samples) < SAMPLES_PER_CWE:
        # random.sample would raise a generic ValueError here; say which CWE
        # is short so the problem is obvious.
        raise ValueError(
            f"Only {len(samples)} samples match {p!r}; "
            f"need at least {SAMPLES_PER_CWE}."
        )
    balanced.extend(sampling_rng.sample(samples, SAMPLES_PER_CWE))
len(balanced)

200

Structured Dataset Serialization

In [None]:
import json

out_path = "/content/drive/MyDrive/juliet_eval.jsonl"

# Each balanced sample yields two JSONL records, in this order:
# its 'bad' code labelled "vulnerable", then its 'good' code labelled "safe".
variants = (("bad", "vulnerable"), ("good", "safe"))

with open(out_path, "w") as f:
    for s in balanced:
        # The CWE id is the text before the first "_" in the second
        # "/"-separated component of the filename.
        cwe_id = s["filename"].split("/")[1].split("_")[0]
        for column, label in variants:
            record = {
                "cwe": cwe_id,
                "code": s[column],
                "label": label
            }
            f.write(json.dumps(record) + "\n")

print("Saved:", out_path)


Saved: /content/drive/MyDrive/juliet_eval.jsonl


Baseline Embedding + Classifier

In [None]:
!pip install -q transformers torch scikit-learn

import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Load data
import random

MAX_SAMPLES = 300     # limit for speed + presentation
SUBSAMPLE_SEED = 42   # fixed so every run evaluates the same subset

data = []
with open("/content/drive/MyDrive/juliet_eval.jsonl", encoding="utf-8") as f:
    for line in f:
        if line.strip():  # tolerate blank / trailing lines
            data.append(json.loads(line))

# The JSONL file is written one CWE after another, so a plain head slice
# (data[:300]) keeps only the leading CWEs and silently drops the last one(s).
# Shuffling with a fixed seed before truncating keeps every CWE represented
# while staying reproducible.
random.Random(SUBSAMPLE_SEED).shuffle(data)
data = data[:MAX_SAMPLES]

# Source code strings and binary labels (1 = vulnerable, 0 = safe).
X = [d["code"] for d in data]
y = [1 if d["label"] == "vulnerable" else 0 for d in data]

# Model
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Run on GPU when available; eval() switches off dropout so the extracted
# embeddings are deterministic.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Embedding extraction (CLS token)
# Each snippet is encoded on its own; the vector at position 0 of the final
# hidden layer is used as the snippet embedding.
embeddings = []

for code in tqdm(X):
    inputs = tokenizer(
        code,
        truncation=True,       # snippets longer than max_length are cut off
        padding="max_length",
        max_length=256,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(cls_embedding[0])

X_emb = np.array(embeddings)

# Train / test split
# stratify=y keeps the safe/vulnerable ratio identical in both partitions.
# Without it the test set drifts away from 50/50, which makes the score
# incomparable with other runs that do stratify.
# NOTE(review): the 'good' and 'bad' variants of one Juliet sample can still
# land on opposite sides of the split — consider a grouped split if that
# matters for the claim being made.
X_train, X_test, y_train, y_test = train_test_split(
    X_emb, y, test_size=0.3, random_state=42, stratify=y
)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Evaluate
preds = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print("\nClassification Report:\n")
print(classification_report(y_test, preds, target_names=["safe", "vulnerable"]))


100%|██████████| 300/300 [06:13<00:00,  1.25s/it]

Accuracy: 0.7

Classification Report:

              precision    recall  f1-score   support

        safe       0.74      0.59      0.66        44
  vulnerable       0.67      0.80      0.73        46

    accuracy                           0.70        90
   macro avg       0.71      0.70      0.70        90
weighted avg       0.71      0.70      0.70        90






Improved Embedding Strategy

In [None]:
!pip install -q transformers torch scikit-learn numpy tqdm

import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Load data
import random

MAX_SAMPLES = 300     # stable + fast
SUBSAMPLE_SEED = 42   # fixed so every run evaluates the same subset

data = []
with open("/content/drive/MyDrive/juliet_eval.jsonl", encoding="utf-8") as f:
    for line in f:
        if line.strip():  # tolerate blank / trailing lines
            data.append(json.loads(line))

# The JSONL file is written one CWE after another, so a plain head slice
# (data[:300]) keeps only the leading CWEs and silently drops the last one(s).
# Shuffling with a fixed seed before truncating keeps every CWE represented
# while staying reproducible.
random.Random(SUBSAMPLE_SEED).shuffle(data)
data = data[:MAX_SAMPLES]

# Source code strings and binary labels (1 = vulnerable, 0 = safe).
X = [d["code"] for d in data]
y = np.array([1 if d["label"] == "vulnerable" else 0 for d in data])

# Model
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Run on GPU when available; eval() switches off dropout so the extracted
# embeddings are deterministic.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Mean pooling function
def mean_pooling(output, attention_mask):
    """Average the token embeddings of each sequence, ignoring padding.

    Args:
        output: Model output exposing ``last_hidden_state``; expected to be
            shaped (batch, seq_len, hidden) as returned by Hugging Face
            encoders.
        attention_mask: Tensor shaped (batch, seq_len) with 1 for real tokens
            and 0 for padding.

    Returns:
        Tensor shaped (batch, hidden) holding the mask-weighted mean of the
        token embeddings. A row whose mask is all zeros yields a zero vector
        instead of NaN.
    """
    token_embeddings = output.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(1)
    # Clamp the token count so an all-padding row cannot divide by zero;
    # the result is unchanged whenever at least one real token is present.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return summed / token_counts

# Extract embeddings
# Each snippet is encoded on its own and reduced to one vector by
# mask-aware mean pooling over all of its tokens.
embeddings = []
for code in tqdm(X):
    inputs = tokenizer(
        code,
        truncation=True,       # snippets longer than max_length are cut off
        padding="max_length",
        max_length=384,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
        emb = mean_pooling(outputs, inputs["attention_mask"])
        embeddings.append(emb.cpu().numpy()[0])

X_emb = np.array(embeddings)

# Train / test split
# stratify=y keeps the safe/vulnerable ratio identical in both partitions.
# NOTE(review): the 'good' and 'bad' variants of one Juliet sample can still
# land on opposite sides of the split — consider a grouped split if that
# matters for the claim being made.
X_train, X_test, y_train, y_test = train_test_split(
    X_emb, y, test_size=0.3, random_state=42, stratify=y
)

# Class-weighted classifier
# n_jobs is intentionally not passed: it only parallelises across classes in
# one-vs-rest mode, so it does nothing for this binary problem.
clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced"
)

clf.fit(X_train, y_train)

# Evaluate
preds = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print("\nClassification Report:\n")
print(classification_report(y_test, preds, target_names=["safe", "vulnerable"]))


100%|██████████| 300/300 [08:58<00:00,  1.79s/it]


Accuracy: 0.8222222222222222

Classification Report:

              precision    recall  f1-score   support

        safe       0.85      0.78      0.81        45
  vulnerable       0.80      0.87      0.83        45

    accuracy                           0.82        90
   macro avg       0.82      0.82      0.82        90
weighted avg       0.82      0.82      0.82        90

