In [6]:
%pip install transformers torch tqdm nltk

Note: you may need to restart the kernel to use updated packages.


In [7]:
import os
import json
import zipfile
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import RobertaTokenizer, RobertaForMaskedLM, AdamW
from tqdm import tqdm
import numpy as np

In [23]:
class SolidityDataset(Dataset):
    def __init__(self, folder_path, tokenizer, max_length=256):
        self.folder_path = folder_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []

        print(f"Reading files from folder: {folder_path}")

        # Iterate through all .txt files in the directory
        file_count = 0
        valid_count = 0
        
        for root, _, files in os.walk(folder_path):
            for file_name in files:
                if file_name.endswith('.txt'):  # Only process .txt files
                    file_count += 1
                    file_path = os.path.join(root, file_name)
                    try:
                        # Load the JSON data from the file
                        with open(file_path, 'r') as file:
                            data = json.load(file)
                            code = data.get("Code")
                            if code:
                                self.samples.append(code)
                                valid_count += 1
                            else:
                                print(f"Warning: 'Code' field not found in {file_name}")
                    except json.JSONDecodeError:
                        print(f"Error decoding JSON in file: {file_name}")
                    except Exception as e:
                        print(f"Unexpected error with file {file_name}: {e}")

        print(f"Processed {file_count} files. Loaded {valid_count} valid samples.")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        code = self.samples[idx]
        encoded_input = self.tokenizer(
            code,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return encoded_input['input_ids'].squeeze(), encoded_input['attention_mask'].squeeze()


# Initialize dataset with logging
folder_path = "/kaggle/input/solidity-code"

dataset = SolidityDataset(folder_path=solidity_code_path, tokenizer=tokenizer)

# Check if the dataset contains any samples
if len(dataset) == 0:
    raise ValueError(f"No samples found in folder: {folder_path}. Please check your data.")

# Split dataset
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f"Dataset split into {len(train_dataset)} training samples and {len(val_dataset)} validation samples.")

Reading files from folder: /kaggle/input/solidity-code
Processed 21298 files. Loaded 21281 valid samples.
Dataset split into 17024 training samples and 4257 validation samples.


In [24]:
# Load pre-trained CodeBERT
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaForMaskedLM.from_pretrained("microsoft/codebert-base")

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [26]:
# Define Early Stopping
class EarlyStopping:
    def __init__(self, patience=3, mode="max"):
        self.patience = patience
        self.mode = mode
        self.best_score = None
        self.counter = 0
        self.should_stop = False

    def __call__(self, current_score):
        if self.best_score is None or (
            (self.mode == "max" and current_score > self.best_score) or
            (self.mode == "min" and current_score < self.best_score)
        ):
            self.best_score = current_score
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.should_stop = True

# Define fine-tuning function
def fine_tune_model(model, train_loader, val_loader, tokenizer, epochs=10, learning_rate=1e-5):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    early_stopping = EarlyStopping(patience=3, mode="max")
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct_predictions = 0
        total_masked_tokens = 0

        with tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}") as pbar:
            for input_ids, attention_mask in train_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

                # Mask tokens
                labels = input_ids.clone()
                probability_matrix = torch.full(labels.shape, 0.25)  # Increased masking probability to 25%
                masked_indices = torch.bernoulli(probability_matrix).bool()
                labels[~masked_indices] = -100

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                logits = outputs.logits

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

                predictions = torch.argmax(logits, dim=-1)
                mask_token_indices = (labels != -100)
                correct_predictions += (predictions[mask_token_indices] == labels[mask_token_indices]).sum().item()
                total_masked_tokens += mask_token_indices.sum().item()

                pbar.set_postfix(loss=total_loss / (pbar.n + 1), accuracy=correct_predictions / total_masked_tokens)
                pbar.update(1)

        # Validation step
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total_masked = 0

        with torch.no_grad():
            for input_ids, attention_mask in val_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

                labels = input_ids.clone()
                probability_matrix = torch.full(labels.shape, 0.25)
                masked_indices = torch.bernoulli(probability_matrix).bool()
                labels[~masked_indices] = -100

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                logits = outputs.logits

                val_loss += outputs.loss.item()
                predictions = torch.argmax(logits, dim=-1)
                mask_token_indices = (labels != -100)
                val_correct += (predictions[mask_token_indices] == labels[mask_token_indices]).sum().item()
                val_total_masked += mask_token_indices.sum().item()

        val_accuracy = val_correct / val_total_masked
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {total_loss/len(train_loader):.4f} | "
              f"Val Loss: {val_loss/len(val_loader):.4f} | Val Accuracy: {val_accuracy:.4f}")

        early_stopping(val_accuracy)
        if early_stopping.should_stop:
            print(f"Early stopping at epoch {epoch+1}")
            break

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
            
# Fine-tune the model
fine_tune_model(model, train_loader, val_loader, tokenizer)

Epoch 1/10: 100%|██████████| 2128/2128 [17:36<00:00,  2.01it/s, accuracy=0.92, loss=0.651] 


Epoch 1/10 - Train Loss: 0.6511 | Val Loss: 0.0235 | Val Accuracy: 0.9943


Epoch 2/10: 100%|██████████| 2128/2128 [17:32<00:00,  2.02it/s, accuracy=0.993, loss=0.0239]


Epoch 2/10 - Train Loss: 0.0239 | Val Loss: 0.0084 | Val Accuracy: 0.9955


Epoch 3/10: 100%|██████████| 2128/2128 [17:32<00:00,  2.02it/s, accuracy=0.995, loss=0.01]  


Epoch 3/10 - Train Loss: 0.0100 | Val Loss: 0.0063 | Val Accuracy: 0.9960


Epoch 4/10:  31%|███       | 662/2128 [05:27<12:05,  2.02it/s, accuracy=0.996, loss=0.0077] 


KeyboardInterrupt: 

In [27]:
# Save the fine-tuned model
save_dir = "/kaggle/working/finetuned_model/"
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model saved to {save_dir}")

Model saved to /kaggle/working/finetuned_model/


In [28]:
# Test load

from transformers import RobertaTokenizer, RobertaForMaskedLM

save_dir = "/kaggle/working/finetuned_model/"

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained(save_dir)
model = RobertaForMaskedLM.from_pretrained(save_dir)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model and tokenizer loaded successfully!")

def get_embeddings(model, code_snippet):
    model.eval()
    with torch.no_grad():
        encoded_input = tokenizer(
            code_snippet,
            max_length=256,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoded_input['input_ids'].to(device)
        attention_mask = encoded_input['attention_mask'].to(device)

        # Get the last hidden state as embeddings
        outputs = model.roberta(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state[:, 0, :]  # Use CLS token embedding
        return embeddings.cpu().numpy()

# Example usage:
code_snippet = "function add(uint256 a, uint256 b) public pure returns (uint256) { return a + b; }"
embedding = get_embeddings(model, code_snippet)
print(embedding)


Model and tokenizer loaded successfully!
[[ 4.05681223e-01 -3.65111262e-01 -9.25921023e-01  1.15476130e-02
   2.88898349e-01  2.14949012e-01  3.53764266e-01 -1.15859821e-01
  -2.95966744e-01 -5.41716993e-01  6.43636763e-01 -1.64671063e-01
  -4.27890003e-01 -2.05586225e-01  6.39046609e-01 -1.27622500e-01
  -7.40435898e-01 -5.17755091e-01  3.94157648e-01 -2.48501658e-01
  -8.95656109e-01  1.52037680e-01 -1.10072277e-01  7.81248808e-01
   5.62808871e-01 -9.96119320e-01  9.59416091e-01  1.03596938e+00
  -9.69998166e-02  3.71292442e-01  9.31655288e-01 -1.32916045e+00
  -4.18929189e-01 -8.03332180e-02 -2.88462728e-01  7.02840328e-01
   4.26392823e-01  4.13250476e-01  2.30653286e-01  8.30822825e-01
  -7.70308256e-01 -8.91805589e-02  2.42958814e-01 -2.20116958e-01
  -4.35930014e-01  1.24288857e-01 -7.07863644e-02 -1.17232827e-02
   3.90237004e-01 -4.76665109e-01  3.01774204e-01 -6.53944671e-01
   1.74259290e-01  4.23410684e-01  6.19129241e-01 -1.59401804e-01
  -1.23386487e-01  7.30553448e-01  