In [2]:
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU detected:", torch.cuda.get_device_name(0))
else:
    print("⚠️ No GPU detected.")


CUDA available: True
GPU detected: NVIDIA GeForce RTX 4090 Laptop GPU


# Imports

In [3]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from torch.nn.functional import softmax
import torch.nn as nn


  from .autonotebook import tqdm as notebook_tqdm


# Training

In [4]:
df = pd.read_csv("cgt-main/consolidated.csv", sep=";")

# check solidity exist, remove "nan" value
def has_source(fp_sol):
    sol_path = os.path.join(r"D:\new440\cgt-main\source", f"{fp_sol}.sol")
    return os.path.isfile(sol_path)

df = df[df["fp_sol"].apply(has_source)]

# transfer property_holds to label： "t" -> 1， "f" -> 0
df = df[df['property_holds'].isin(['t', 'f'])]
df['label'] = df['property_holds'].map({'t': 1, 'f': 0})

# read Solidity Source code, sol as txt
def read_source(fp_sol):
    sol_path = os.path.join(r"D:\new440\cgt-main\source", f"{fp_sol}.sol")
    try:
        with open(sol_path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        print(f"file {sol_path} missing, continue")
        return ""

df['code'] = df['fp_sol'].apply(read_source)

# filter "nan" out，select code and label
data = df.dropna(subset=['code', 'label'])
data = data[data['code'] != ""]
solidity_data = []
for _, row in tqdm(data.iterrows(), total=len(data), desc="Loading Solidity files"):
    fp_sol = row["fp_sol"]
    sol_path = os.path.join("cgt-main", "source", f"{fp_sol}.sol")
    try:
        with open(sol_path, "r", encoding="utf-8") as f:
            code = f.read()
        solidity_data.append({
            "contract_name": row["contractname"],
            "code": code,
            "bug_type": row["property"],
            "swc_id": row["swc"],
            "dasp_id": row["dasp"]
        })
    except FileNotFoundError:
        pass
        
# JSON file saved
df_solidity = pd.DataFrame(solidity_data)
df_solidity.to_json("aaaaa.json", indent=4)

# train-validate separate
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['code'].tolist(), data['label'].tolist(), test_size=0.2, random_state=42
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions)
    }

# define PyTorch Dataset class
class SolidityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        # auto cut code lenth
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Define weighted loss inside a custom Trainer
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Manually set class weights here
        weight = torch.tensor([1.0, 2.5]).to(logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


# load CodeBERT tokenizer and model form（microsoft/codebert-base）
model_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# !!!!!!!!!!!!!!!!!!!!!!! need further explore
train_dataset = SolidityDataset(train_texts, train_labels, tokenizer)
val_dataset = SolidityDataset(val_texts, val_labels, tokenizer)

# Trainer define and tune
training_args = TrainingArguments(
    output_dir="./codebert-finetuned",  
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=15,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# TRAINING
#trainer.train()
trainer.train(resume_from_checkpoint="./codebert-finetuned/checkpoint-4000")

predictions = trainer.predict(val_dataset)
logits = predictions.predictions
labels = predictions.label_ids

# Use custom threshold on softmax probs
probs = softmax(torch.tensor(logits), dim=1).numpy()
custom_preds = (probs[:, 1] > 0.4).astype(int)  # try 0.4 or lower

print(classification_report(labels, custom_preds, zero_division=0))

# Show result
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

Loading Solidity files: 100%|██████████| 19456/19456 [00:01<00:00, 12257.01it/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
4500,0.6568,0.643886,0.733299
5000,0.6358,0.651655,0.725591
5500,0.6298,0.656355,0.706835


              precision    recall  f1-score   support

           0       0.82      0.46      0.59      2825
           1       0.34      0.73      0.46      1067

    accuracy                           0.53      3892
   macro avg       0.58      0.59      0.52      3892
weighted avg       0.69      0.53      0.55      3892



Evaluation results: {'eval_loss': 0.6438862681388855, 'eval_accuracy': 0.7332990750256937, 'eval_runtime': 52.3619, 'eval_samples_per_second': 74.329, 'eval_steps_per_second': 9.301, 'epoch': 3.0}


# Testing

In [6]:
predictions = trainer.predict(val_dataset)
preds = predictions.predictions.argmax(axis=-1)
true_labels = predictions.label_ids

# Generate classification report
report = classification_report(true_labels, preds)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.89      0.83      2825
           1       0.52      0.31      0.39      1067

    accuracy                           0.73      3892
   macro avg       0.65      0.60      0.61      3892
weighted avg       0.71      0.73      0.71      3892



In [7]:
trainer.save_model("best-codebert-model")

In [None]:
import os
import glob
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm

def main():
    MODEL_DIR     = "best-codebert-model"               
    CONTRACTS_DIR = "smartbugs-wild-master/contracts"  
    OUTPUT_CSV    = "smartbugs_predictions.csv"        
    THRESHOLD     = 0.4                                 
    SAVE_INTERVAL = 2000                                 

    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
    model     = RobertaForSequenceClassification.from_pretrained(MODEL_DIR)
    model.to(device).eval()
    file_paths = glob.glob(os.path.join(CONTRACTS_DIR, "*.sol"))
    results = []


    for idx, path in enumerate(tqdm(file_paths, desc="Predicting contracts")):
        with open(path, "r", encoding="utf-8") as f:
            code = f.read()
        encoding = tokenizer(
            code,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        ).to(device)


        with torch.no_grad():
            outputs = model(**encoding)
        logits = outputs.logits
        probs  = softmax(logits, dim=-1).cpu().numpy()[0]

        pred_argmax = int(probs.argmax())
        pred_thresh = int(probs[1] > THRESHOLD)

        results.append({
            "file":          os.path.basename(path),
            "prob_no_bug":   float(probs[0]),
            "prob_bug":      float(probs[1]),
            "pred_argmax":   pred_argmax,
            "pred_thresh":   pred_thresh
        })

        if (idx + 1) % SAVE_INTERVAL == 0:
            pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
            print(f"Saved {idx+1} predictions so far to {OUTPUT_CSV}")

    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print(f"Saved all {len(results)} predictions to {OUTPUT_CSV}")

if __name__ == "__main__":
    main()


Predicting contracts:   4%|▍         | 2008/47398 [00:44<14:13, 53.21it/s]

Saved 2000 predictions so far to smartbugs_predictions.csv


Predicting contracts:   8%|▊         | 4005/47398 [01:21<12:57, 55.82it/s]

Saved 4000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  13%|█▎        | 6008/47398 [01:59<11:49, 58.37it/s]

Saved 6000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  17%|█▋        | 8001/47398 [02:40<11:42, 56.10it/s]

Saved 8000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  21%|██        | 10011/47398 [03:22<10:40, 58.39it/s]

Saved 10000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  25%|██▌       | 12010/47398 [04:03<09:55, 59.46it/s]

Saved 12000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  30%|██▉       | 14002/47398 [05:37<26:10, 21.27it/s]  

Saved 14000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  34%|███▍      | 16004/47398 [07:04<21:44, 24.07it/s]  

Saved 16000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  38%|███▊      | 18004/47398 [08:33<25:06, 19.51it/s]  

Saved 18000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  42%|████▏     | 20003/47398 [09:58<18:52, 24.19it/s]  

Saved 20000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  46%|████▋     | 22005/47398 [11:27<17:33, 24.11it/s]

Saved 22000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  51%|█████     | 24003/47398 [12:59<21:30, 18.13it/s]  

Saved 24000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  55%|█████▍    | 26004/47398 [14:31<15:12, 23.44it/s]

Saved 26000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  59%|█████▉    | 28004/47398 [15:59<17:02, 18.96it/s]

Saved 28000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  63%|██████▎   | 30001/47398 [17:27<14:19, 20.23it/s]

Saved 30000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  68%|██████▊   | 32002/47398 [18:53<13:19, 19.25it/s]

Saved 32000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  72%|███████▏  | 34002/47398 [20:21<11:25, 19.53it/s]

Saved 34000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  76%|███████▌  | 36005/47398 [21:51<08:10, 23.24it/s]

Saved 36000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  80%|████████  | 38003/47398 [23:20<07:34, 20.66it/s]

Saved 38000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  84%|████████▍ | 40002/47398 [24:50<06:46, 18.18it/s]

Saved 40000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  89%|████████▊ | 42004/47398 [26:17<03:32, 25.41it/s]

Saved 42000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  93%|█████████▎| 44002/47398 [27:47<03:03, 18.47it/s]

Saved 44000 predictions so far to smartbugs_predictions.csv


Predicting contracts:  97%|█████████▋| 46004/47398 [29:10<01:13, 18.92it/s]

Saved 46000 predictions so far to smartbugs_predictions.csv


Predicting contracts: 100%|██████████| 47398/47398 [30:09<00:00, 26.19it/s]


Saved all 47398 predictions to smartbugs_predictions.csv


In [3]:
import pandas as pd

df = pd.read_csv("smartbugs_predictions.csv")

total = len(df)

num_bug = int(df["pred_thresh"].sum())

print(f"Total contrast: {total}")
print(f"Prediced valnerable: {num_bug}")
print(f"Percentage: {num_bug/total:.2%}")


Total contrast: 47398
Prediced valnerable: 33679
Percentage: 71.06%
