In [21]:
import torch

print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU detected:", torch.cuda.get_device_name(0))
else:
    print("⚠️ No GPU detected.")


CUDA available: True
GPU detected: NVIDIA GeForce RTX 4090 Laptop GPU


# Imports

In [22]:
import os
import pandas as pd
import torch
import gc
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from torch.nn.functional import softmax
import numpy as np


# Data Prep

In [23]:
df = pd.read_csv("cgt-main/consolidated.csv", sep=";")

# check solidity exist, remove "nan" value
def has_source(fp_sol):
    sol_path = os.path.join(r"D:\new440\cgt-main\source", f"{fp_sol}.sol")
    return os.path.isfile(sol_path)

df = df[df["fp_sol"].apply(has_source)]

# transfer property_holds to label： "t" -> 1， "f" -> 0
df = df[df['property_holds'].isin(['t', 'f'])]
df['label'] = df['property_holds'].map({'t': 1, 'f': 0})

# read Solidity Source code, sol as txt
def read_source(fp_sol):
    sol_path = os.path.join(r"D:\new440\cgt-main\source", f"{fp_sol}.sol")
    try:
        with open(sol_path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        print(f"file {sol_path} missing, continue")
        return ""

df['code'] = df['fp_sol'].apply(read_source)

# filter "nan" out，select code and label
data = df.dropna(subset=['code', 'label'])
data = data[data['code'] != ""]
solidity_data = []
for _, row in tqdm(data.iterrows(), total=len(data), desc="Loading Solidity files"):
    fp_sol = row["fp_sol"]
    sol_path = os.path.join("cgt-main", "source", f"{fp_sol}.sol")
    try:
        with open(sol_path, "r", encoding="utf-8") as f:
            code = f.read()
        solidity_data.append({
            "contract_name": row["contractname"],
            "code": code,
            "bug_type": row["property"],
            "swc_id": row["swc"],
            "dasp_id": row["dasp"]
        })
    except FileNotFoundError:
        pass
        
# JSON file saved
df_solidity = pd.DataFrame(solidity_data)
df_solidity.to_json("aaaaa.json", indent=4)

# train-validate separate
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['code'].tolist(), data['label'].tolist(), test_size=0.2, random_state=42
)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions)
    }


# define PyTorch Dataset class
class SolidityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        # auto cut code lenth
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
    
# Define weighted loss inside a custom Trainer
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Manually set class weights here
        weight = torch.tensor([1.0, 2.5]).to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


Loading Solidity files: 100%|██████████| 19456/19456 [00:01<00:00, 9916.47it/s] 


# CodeBERT

In [28]:
# load CodeBERT tokenizer and model form（microsoft/codebert-base）
model_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# !!!!!!!!!!!!!!!!!!!!!!! need further explore
train_dataset = SolidityDataset(train_texts, train_labels, tokenizer)
val_dataset = SolidityDataset(val_texts, val_labels, tokenizer)

# Trainer define and tune
training_args = TrainingArguments(
    output_dir="./codebert-finetuned",  
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=2,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# TRAINING
trainer.train()

predictions = trainer.predict(val_dataset)

# Show result
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
1000,0.6954,0.664312,0.652621
2000,0.6689,0.644885,0.615365
3000,0.6645,0.623435,0.699383
4000,0.6173,0.730574,0.729959
5000,0.6348,0.681392,0.741007
6000,0.6648,0.625305,0.743063
7000,0.6614,0.928683,0.744604
8000,0.6482,0.626905,0.720195
9000,0.626,0.64771,0.721737


Evaluation results: {'eval_loss': 0.9286831021308899, 'eval_accuracy': 0.7446043165467626, 'eval_runtime': 40.8803, 'eval_samples_per_second': 95.205, 'eval_steps_per_second': 23.801, 'epoch': 2.3130300693909023}


In [29]:
predictions = trainer.predict(val_dataset)
preds = predictions.predictions.argmax(axis=-1)
true_labels = predictions.label_ids

# Generate classification report
report = classification_report(true_labels, preds)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.92      0.84      2825
           1       0.57      0.29      0.39      1067

    accuracy                           0.74      3892
   macro avg       0.67      0.60      0.61      3892
weighted avg       0.72      0.74      0.71      3892



# GraphCodeBert

In [12]:

# Load tokenizer and model
model_name_gcb = "microsoft/graphcodebert-base"
tokenizer_gcb = RobertaTokenizer.from_pretrained(model_name_gcb)
model_gcb = RobertaForSequenceClassification.from_pretrained(model_name_gcb, num_labels=2)

# Dataset
train_dataset_gcb = SolidityDataset(train_texts, train_labels, tokenizer_gcb)
val_dataset_gcb = SolidityDataset(val_texts, val_labels, tokenizer_gcb)

# TrainingArguments
training_args_gcb = TrainingArguments(
    output_dir="./graphcodebert-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs_gcb',
    logging_steps=100,
    save_total_limit=2,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True
)


# Trainer
trainer_gcb = WeightedTrainer(
    model=model_gcb,
    args=training_args_gcb,
    train_dataset=train_dataset_gcb,
    eval_dataset=val_dataset_gcb,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_gcb.to(device)
trainer_gcb.train()
predictions_gcb = trainer_gcb.predict(val_dataset_gcb)
eval_result_gcb = trainer_gcb.evaluate()
print("GraphCodeBERT Eval:", eval_result_gcb)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
500,0.6571,0.646285,0.561665
1000,0.6369,0.642249,0.700925
1500,0.6422,0.622176,0.670092
2000,0.6432,0.623613,0.717112
2500,0.5959,0.655784,0.70298


GraphCodeBERT Eval: {'eval_loss': 0.6236129403114319, 'eval_accuracy': 0.7171120246659815, 'eval_runtime': 37.7259, 'eval_samples_per_second': 103.165, 'eval_steps_per_second': 6.468, 'epoch': 3.0}


# CodeT5+

In [31]:
# Clear CUDA cache
torch.cuda.empty_cache()

# Collect unused Python objects
gc.collect()

def compute_metrics(eval_pred):
    if hasattr(eval_pred, "predictions"):
        logits = eval_pred.predictions
        labels = eval_pred.label_ids
    else:
        logits, labels = eval_pred
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    if isinstance(logits, torch.Tensor):
        logits = logits.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()

    preds = np.argmax(logits, axis=-1)
    return {"eval_accuracy": accuracy_score(labels, preds)}

# Load tokenizer and model
# model_name_c5p = "Salesforce/codet5p-770m"
model_name_c5p = "Salesforce/codet5-base"
tokenizer_c5p = AutoTokenizer.from_pretrained(model_name_c5p)
model_c5p = AutoModelForSequenceClassification.from_pretrained(model_name_c5p, num_labels=2)

# Dataset
train_dataset_c5p = SolidityDataset(train_texts, train_labels, tokenizer_c5p)
val_dataset_c5p = SolidityDataset(val_texts, val_labels, tokenizer_c5p)

from torch.utils.data import Subset
val_dataset_small = Subset(val_dataset_c5p, range(500))

# TrainingArguments
training_args_c5p = TrainingArguments(
    output_dir="./codet5p-finetuned",
    num_train_epochs=3,
    fp16=True,
    fp16_full_eval=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=1000,
    eval_accumulation_steps=8, 
    save_steps=1000,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs_c5p',
    logging_steps=100,
    save_total_limit=2,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True
)

# Trainer
trainer_c5p = WeightedTrainer(
    model=model_c5p,
    args=training_args_c5p,
    train_dataset=train_dataset_c5p,
    eval_dataset=val_dataset_small, # faster to eval
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_c5p.to(device)
trainer_c5p.train()

predictions_c5p = trainer_c5p.predict(val_dataset_c5p)
eval_result_c5p = trainer_c5p.evaluate()
print("CodeT5+ Eval:", eval_result_c5p)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at Salesforce/codet5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
1000,0.7414,0.727766,0.348
2000,0.7123,0.708254,0.556
3000,0.6521,0.641556,0.718
4000,0.6203,0.665528,0.746
5000,0.6204,0.6458,0.736
6000,0.6812,0.6325,0.696


There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight', 'transformer.decoder.embed_tokens.weight'].


CodeT5+ Eval: {'eval_accuracy': 0.746, 'eval_loss': 0.6655176281929016, 'eval_runtime': 9.202, 'eval_samples_per_second': 54.336, 'eval_steps_per_second': 13.584, 'epoch': 1.5420200462606015}
