In [1]:
!pip install transformers scikit-learn torch --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import zipfile
import os
import re
import random
from pathlib import Path
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
zip_path = "/content/Rust_CVE_Samples.zip"
extract_dir = "/content/rust_cve_dataset"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extracted folders:", os.listdir(extract_dir))


Extracted folders: ['Rust_CVE_Samples', '__MACOSX']


In [4]:
def load_dataset(base_path):
    X = []
    y = []
    for label_dir, label in [("Negative", 0), ("Positive", 1)]:
        folder = Path(base_path)/label_dir
        for file_path in folder.glob("*.rs"):
            code = file_path.read_text(encoding="utf-8", errors="ignore")
            X.append(code)
            y.append(label)
    return X, y

X_raw, y = load_dataset("/content/rust_cve_dataset/Rust_CVE_Samples")
print(f"Loaded {len(X_raw)} code samples (positive + negative)")


Loaded 162 code samples (positive + negative)


In [12]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.eval()

def embed_code_batch(snippets, batch_size=8):
    embeddings = []
    for i in range(0, len(snippets), batch_size):
        batch = snippets[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

X_emb = embed_code_batch(X_raw)
print("Embeddings shape:", X_emb.shape)


Embeddings shape: (162, 768)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_emb, y, test_size=0.2, random_state=42, stratify=y)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.27      0.24      0.25        17
           1       0.28      0.31      0.29        16

    accuracy                           0.27        33
   macro avg       0.27      0.27      0.27        33
weighted avg       0.27      0.27      0.27        33

Confusion Matrix:
 [[ 4 13]
 [11  5]]


In [5]:
def insert_unsafe_block(code):
    # Wrap random function body in unsafe block
    functions = re.findall(r"(pub\s+unsafe\s+fn\s+\w+\(.*?\)\s*{)", code)
    if not functions:
        return code
    fn_to_mutate = random.choice(functions)
    mutated_code = code.replace(fn_to_mutate, fn_to_mutate + "\n    unsafe {", 1)
    mutated_code += "\n    }"  # add closing bracket (rough fix)
    return mutated_code

def remove_unwrap(code):
    return re.sub(r"\.unwrap\(\)", "", code)

def replace_indexing(code):
    return code.replace("[i]", ".get(i).unwrap_or(&default)")

def mutate_code(code, mutation_count=1):
    mutations = [insert_unsafe_block, remove_unwrap, replace_indexing]
    for _ in range(mutation_count):
        mutation = random.choice(mutations)
        code = mutation(code)
    return code


In [6]:
def augment_dataset(X_raw, y, num_augments_per_sample=1):
    X_aug = []
    y_aug = []
    for code, label in zip(X_raw, y):
        for _ in range(num_augments_per_sample):
            mutated = mutate_code(code)
            X_aug.append(mutated)
            y_aug.append(label)  # assume label stays the same
    return X_raw + X_aug, y + y_aug
X_raw_aug, y_aug = augment_dataset(X_raw, y, num_augments_per_sample=1)

In [7]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.eval()

def embed_code_batch(snippets, batch_size=8):
    embeddings = []
    for i in range(0, len(snippets), batch_size):
        batch = snippets[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

X_emb_aug = embed_code_batch(X_raw_aug)

print("Embeddings shape:", X_emb_aug.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Embeddings shape: (324, 768)


In [8]:
def extract_manual_features(code):
    return [
        code.count("unsafe"),
        code.count(".unwrap()"),
        code.count("from_raw_parts"),
        code.count("as *mut"),
        code.count("Vec::from_raw_parts"),
        len(code.splitlines())
    ]

manual_feats_aug = np.array([extract_manual_features(code) for code in X_raw_aug])
X_combined = np.hstack((X_emb_aug, manual_feats_aug))

print("Combined shape:", X_combined.shape)  # should be (324, 774)

Combined shape: (324, 774)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_aug, test_size=0.2, random_state=42, stratify=y_aug)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.85      0.85        33
           1       0.84      0.84      0.84        32

    accuracy                           0.85        65
   macro avg       0.85      0.85      0.85        65
weighted avg       0.85      0.85      0.85        65

Confusion Matrix:
 [[28  5]
 [ 5 27]]


In [11]:
from sklearn.metrics import roc_auc_score

# For ROC AUC, we need predicted probabilities
y_proba = clf.predict_proba(X_test)[:, 1]  # Probability of the positive class

roc_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC Score:", roc_auc)


ROC AUC Score: 0.9043560606060607


In [14]:
def augment_dataset(X_raw, y, num_augments_per_sample=1):
    X_aug = []
    y_aug = []
    for code, label in zip(X_raw, y):
        for _ in range(num_augments_per_sample):
            mutated = mutate_code(code,2)
            X_aug.append(mutated)
            y_aug.append(label)  # assume label stays the same
    return X_raw + X_aug, y + y_aug
X_raw_aug, y_aug = augment_dataset(X_raw, y, num_augments_per_sample=1)

In [15]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.eval()

def embed_code_batch(snippets, batch_size=8):
    embeddings = []
    for i in range(0, len(snippets), batch_size):
        batch = snippets[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    return np.array(embeddings)

X_emb_aug = embed_code_batch(X_raw_aug)
print("Embeddings shape:", X_emb_aug.shape)


Embeddings shape: (324, 768)


In [16]:
def extract_manual_features(code):
    return [
        code.count("unsafe"),
        code.count(".unwrap()"),
        code.count("from_raw_parts"),
        code.count("as *mut"),
        code.count("Vec::from_raw_parts"),
        len(code.splitlines())
    ]

manual_feats_aug = np.array([extract_manual_features(code) for code in X_raw_aug])
X_combined = np.hstack((X_emb_aug, manual_feats_aug))

print("Combined shape:", X_combined.shape)  # should be (324, 774)

Combined shape: (324, 774)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_aug, test_size=0.2, random_state=42, stratify=y_aug)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.79      0.81        33
           1       0.79      0.84      0.82        32

    accuracy                           0.82        65
   macro avg       0.82      0.82      0.82        65
weighted avg       0.82      0.82      0.82        65

Confusion Matrix:
 [[26  7]
 [ 5 27]]


In [18]:

# For ROC AUC, we need predicted probabilities
y_proba = clf.predict_proba(X_test)[:, 1]  # Probability of the positive class

roc_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC Score:", roc_auc)


ROC AUC Score: 0.8877840909090908
