In [1]:
%pip install transformers torch tqdm



In [5]:
import os
import json
import zipfile
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, AdamW
from tqdm import tqdm

# Mount Google Drive at /content/drive. Later cells read the dataset archive
# from, and save the fine-tuned model to, paths under this mount point.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
class SolidityDataset(Dataset):
    """Dataset of source-code strings read from JSON records inside a zip archive.

    Every archive member whose name ends in ``.txt`` is parsed as JSON and the
    value stored under its ``"Code"`` key is kept as one sample.  Members that
    cannot be parsed, are not JSON objects, or have a missing/empty ``"Code"``
    value are reported on stdout and skipped, so a single bad file never
    aborts loading.

    Args:
        zip_path: Path to the zip archive holding the ``.txt`` JSON records.
        tokenizer: Callable used by ``__getitem__``.  It is invoked as
            ``tokenizer(code, max_length=..., padding="max_length",
            truncation=True, return_tensors="pt")`` and must return a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Sequence length handed to the tokenizer.
    """

    def __init__(self, zip_path, tokenizer, max_length=256):
        self.zip_path = zip_path
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for file_name in zip_ref.namelist():
                # Only .txt members are treated as JSON records.
                if not file_name.endswith('.txt'):
                    continue

                # Read straight from the archive; nothing is extracted to disk.
                try:
                    with zip_ref.open(file_name) as file:
                        data = json.load(file)
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file: {file_name}")
                    continue
                except Exception as e:
                    # Best-effort loading: report and move on to the next member.
                    print(f"Unexpected error with file {file_name}: {e}")
                    continue

                # Valid JSON that is not an object (e.g. a list) has no "Code"
                # key; treat it like a record with the field missing.
                code = data.get("Code") if isinstance(data, dict) else None
                if code:
                    self.samples.append(code)
                else:
                    print(f"Warning: 'Code' field not found in {file_name}")

    def __len__(self):
        """Return the number of code samples that were loaded."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return ``(input_ids, attention_mask)``.

        Both tensors are returned without the leading batch axis so that a
        ``DataLoader`` can stack them into ``(batch, max_length)`` batches.
        """
        encoded_input = self.tokenizer(
            self.samples[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        # Remove only the leading size-1 batch axis.  A bare .squeeze() would
        # also drop the sequence axis whenever max_length == 1, producing
        # 0-dim tensors that no longer batch correctly.
        return (
            encoded_input['input_ids'].squeeze(0),
            encoded_input['attention_mask'].squeeze(0),
        )


In [7]:
from transformers import RobertaTokenizer, RobertaModel

# Load the pre-trained GraphCodeBERT tokenizer and encoder from the
# "microsoft/graphcodebert-base" checkpoint. Both names (`tokenizer`, `model`)
# are notebook globals used by the later cells.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
model = RobertaModel.from_pretrained("microsoft/graphcodebert-base")

# Pick CUDA when available, otherwise CPU, and move the model there.
# `device` is also a notebook global that later cells rely on.
# Note: as the last expression of the cell, model.to(device) makes the
# notebook display the full module repr.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [12]:
def fine_tune_model(model, dataloader, epochs=3, learning_rate=1e-5, loss_fn=None):
    """Fine-tune ``model`` in place on batches of ``(input_ids, attention_mask)``.

    For every batch the model is run, the hidden state at sequence position 0
    of ``last_hidden_state`` is taken as the per-sample embedding, a scalar
    loss is computed from those embeddings and one AdamW step is applied.
    The mean batch loss is printed after each epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``last_hidden_state``.
        dataloader: Iterable yielding ``(input_ids, attention_mask)`` tensor
            pairs.
        epochs: Number of passes over ``dataloader``.
        learning_rate: Learning rate for the optimizer.
        loss_fn: Optional callable mapping the batch of position-0 embeddings
            to a scalar loss tensor.  When ``None`` the original placeholder
            objective (sum of squared embedding values) is used.

    Returns:
        None.  The model's parameters are updated in place.
    """
    # transformers.AdamW is deprecated in favour of the PyTorch optimizer.
    # eps and weight_decay are set explicitly to the defaults of the old
    # transformers implementation (torch's own defaults differ), so the
    # optimisation behaviour stays the same.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
    )

    # Send batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` variable being defined and in sync.
    device = next(model.parameters()).device

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for input_ids, attention_mask in tqdm(dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Forward pass; position 0 of each sequence is used as its embedding.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state[:, 0, :]

            if loss_fn is None:
                # Placeholder objective kept for backward compatibility.
                # WARNING: minimising the squared magnitude only pushes the
                # embeddings toward zero; pass a real objective via `loss_fn`
                # (e.g. a contrastive loss) for meaningful fine-tuning.
                loss = (embeddings ** 2).sum()
            else:
                loss = loss_fn(embeddings)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches while iterating rather than calling len(dataloader):
        # this avoids a ZeroDivisionError on an empty loader and also works
        # for loaders that do not define a length.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")

# Build the dataset from the archive on the mounted Drive (max_length is left
# at the class default) and wrap it in a DataLoader: 8 samples per batch,
# reshuffled every epoch.
dataset = SolidityDataset(zip_path="/content/drive/My Drive/Colab Notebooks/processed_repositories.zip", tokenizer=tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Fine-tune the global `model` in place using the function's default
# epochs and learning rate.
fine_tune_model(model, dataloader)




100%|██████████| 2661/2661 [16:38<00:00,  2.67it/s]


Epoch 1/3, Loss: 219.7194443781185


100%|██████████| 2661/2661 [16:42<00:00,  2.66it/s]


Epoch 2/3, Loss: 93.22134813067161


100%|██████████| 2661/2661 [16:41<00:00,  2.66it/s]

Epoch 3/3, Loss: 57.92295193197314





In [13]:
def get_embeddings(model, code_snippet, tokenizer=None, max_length=256):
    """Return the position-0 (CLS) embedding of ``code_snippet`` as a NumPy array.

    The snippet is tokenized (padded/truncated to ``max_length``), run through
    ``model`` with gradients disabled, and the hidden state at sequence
    position 0 of ``last_hidden_state`` is returned.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``last_hidden_state``.
        code_snippet: Text handed to the tokenizer.
        tokenizer: Tokenizer callable to use.  When ``None`` the notebook-level
            ``tokenizer`` variable is used, as in the original version.
        max_length: Padding/truncation length handed to the tokenizer.

    Returns:
        ``numpy.ndarray`` holding ``last_hidden_state[:, 0, :]`` on the CPU.

    Raises:
        NameError: If ``tokenizer`` is ``None`` and no notebook-level
            ``tokenizer`` has been defined.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if tokenizer is None:
        # Backward-compatible fallback to the notebook global.  The parameter
        # shadows that name, hence the explicit globals() lookup.
        try:
            tokenizer = globals()["tokenizer"]
        except KeyError:
            raise NameError(
                "name 'tokenizer' is not defined; pass tokenizer=... explicitly"
            ) from None

    # Place inputs on the model's own device instead of depending on a
    # notebook-level `device` variable.
    device = next(model.parameters()).device

    model.eval()
    with torch.no_grad():
        encoded_input = tokenizer(
            code_snippet,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoded_input['input_ids'].to(device)
        attention_mask = encoded_input['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state[:, 0, :]  # Use CLS token embedding
        return embeddings.cpu().numpy()

# Example usage: embed a single Solidity function with the fine-tuned global
# `model` and print the resulting NumPy array.
code_snippet = "function add(uint256 a, uint256 b) public pure returns (uint256) { return a + b; }"
embedding = get_embeddings(model, code_snippet)
print(embedding)


[[ 1.12693151e-02  1.67147908e-02 -1.79229043e-02  1.22786751e-02
   9.40082893e-02 -1.90160610e-02  6.31893426e-02  8.71188641e-02
   2.53150798e-03 -1.71079263e-02  7.69901043e-03  4.02079187e-02
  -2.27990355e-02 -9.26613621e-03  6.22524992e-02  2.03210041e-02
  -2.13840287e-02  6.57762364e-02  5.41016497e-02 -8.65189172e-03
  -7.92333297e-03  7.57061541e-02  6.75280541e-02  1.76334176e-02
   4.37132604e-02 -2.33456083e-02  1.92256700e-02  9.74326655e-02
  -2.94204266e-03  1.70235448e-02  1.61178894e-02  2.98657287e-02
   3.30194049e-02  1.04138954e-02  4.88821492e-02  4.52986732e-02
   7.20111057e-02 -1.87147558e-02 -1.24845710e-02  3.20849344e-02
   1.55683607e-02 -5.65463230e-02  3.14878896e-02  1.18020233e-02
   1.95525866e-02  7.00365752e-02  2.37055942e-02  4.39225957e-02
   5.40768579e-02  2.56278887e-02  2.94013385e-04  9.36264917e-02
   9.31167454e-02 -8.74448568e-03 -6.14797398e-02  2.96781901e-02
  -5.61094843e-02  2.00962648e-02  2.97649931e-02  6.56321784e-03
   4.65468

In [14]:
# Directory on the mounted Google Drive where the model is written.
save_dir = '/content/drive/My Drive/finetuned_model/'

# Save the current state of `model` and the tokenizer into the same
# directory via their save_pretrained methods.
# NOTE(review): the weights saved here come from training with the
# placeholder objective in fine_tune_model — confirm that is intended.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Model saved to {save_dir}")

Model saved to /content/drive/My Drive/finetuned_model/
