# Explore Data 

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [8]:
df = pd.read_parquet('../data/train/clone-detection-600k-5fold.parquet')
# first 2000 rows
# df = df.head(2000)

In [9]:
columns = ["code1", "code2", "similar"]
df = df[columns]
# drop columns with NaN
df = df.dropna()

In [10]:
# train test split, target column being the similar column
X = df.drop(columns=['similar'])
y = df['similar']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [11]:
# fine tune code-bert using the train set to act as an encoder
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset

import torch

# tokenize the code 
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
def tokenize_pair(code_1, code_2):
    return tokenizer(code_1, code_2, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

class CodePairsDataset(Dataset):
    def __init__(self, codes1, codes2, labels):
        self.codes1 = codes1
        self.codes2 = codes2
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = tokenizer(self.codes1[idx], self.codes2[idx], return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        return {**{k: v.squeeze(0) for k, v in encoding.items()}, 'labels': torch.tensor(self.labels[idx])}



In [12]:
train_dataset = CodePairsDataset(X_train['code1'].values, X_train['code2'].values, y_train.values)
val_dataset = CodePairsDataset(X_test['code1'].values, X_test['code2'].values, y_test.values)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [13]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW


# Load CodeBERT with a classification head
model = RobertaForSequenceClassification.from_pretrained('microsoft/codebert-base', num_labels=2)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = "cpu"
model.to(device)
# use apple m1

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [15]:
optimizer = AdamW(model.parameters(), lr=1e-5)




In [16]:
import torch
import math

print(torch.backends.mps.is_available()) #the MacOS is higher than 12.3+
print(torch.backends.mps.is_built()) #MPS is activated

True
True


In [17]:
num_epochs = 3
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")


In [None]:
model.save_pretrained('tuned-code-bert')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
model.eval()
predictions, true_labels = [], []

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, axis=-1).tolist())
        true_labels.extend(batch['labels'].tolist())

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
print(f"Validation Accuracy: {accuracy}")

In [None]:
# save the model
model.save_pretrained('../models/code-bert')

In [37]:
import torch
from torch.utils.data import DataLoader, Dataset

class CodePairsDataset(Dataset):
    def __init__(self, pairs, labels):
        self.pairs = pairs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        code_1, code_2 = self.pairs[idx]
        labels = self.labels[idx]
        encoding = tokenizer(code_1, code_2, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        return {**encoding, 'labels': torch.tensor(labels)}

# Assuming pairs and labels are your dataset loaded from a CSV or other source
dataset = CodePairsDataset(pairs=X_train[['code1', 'code2']].values, labels=y_train.values)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [39]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)
model.train()
num_epochs = 3

for epoch in range(num_epochs):
    for batch in data_loader:
        outputs = model(**{k: v.to(model.device) for k, v in batch.items()})
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"Loss: {loss.item()}")


{'input_ids': tensor([[[    0,   282,  5457,  ...,     1,     1,     1]],

        [[    0, 41975, 47427,  ...,     1,     1,     1]],

        [[    0,   487,  5214,  ...,     1,     1,     1]],

        ...,

        [[    0,   282,  5457,  ...,   642, 10975,     2]],

        [[    0, 10431, 39825,  ...,  1437,  1108,     2]],

        [[    0,    29,  5214,  ...,     1,     1,     1]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 0, 0, 0]]]), 'labels': tensor([0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0])}




ValueError: too many values to unpack (expected 2)

In [28]:

from transformers import BertModel, BertConfig
import torch.nn as nn

class CloneModel(nn.Module):
    def __init__(self):
        super(CloneModel, self).__init__()
        self.bert = BertModel.from_pretrained("microsoft/codebert-base")
        self.fc = nn.Linear(768, 1)
        
    def forward(self, input1, input2):
        output1 = self.bert(**input1).last_hidden_state[:, 0, :]
        output2 = self.bert(**input2).last_hidden_state[:, 0, :]
        output = torch.abs(output1 - output2)
        output = self.fc(output)
        return output
    
model = CloneModel()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


In [29]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)

for epoch in range(5):
    model.train()
    for code1, code2, target in train_loader:
        code1 = {key: value.to(device) for key, value in code1.items()}
        code2 = {key: value.to(device) for key, value in code2.items()}
        target = target.to(device)
        
        optimizer.zero_grad()
        output = model(code1, code2)
        loss = criterion(output, target.unsqueeze(1).float())
        loss.backward()
        optimizer.step()
        
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# save the model
torch.save(model.state_dict(), '../models/code-bert-encoder.pth')






TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'tokenizers.Encoding'>

In [18]:
from transformers import AutoTokenizer, AutoModel
import torch
example_code = """
num = 5
print(num)
"""
eample_code_2 = """
num = 10
print(num)
"""

code_list = [example_code, eample_code_2]

#  Byte Pair Encoding, BPE tokenizer of CodeBERT, get tokens
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
tokens = tokenizer.tokenize(code_list, padding=True, truncation=True, return_tensors="pt")
model = AutoModel.from_pretrained("microsoft/codebert-base")
embeddings = model(**tokens)



TypeError: RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): RobertaIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): RobertaOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (pooler): RobertaPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
) argument after ** must be a mapping, not list

TypeError: RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): RobertaIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): RobertaOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (pooler): RobertaPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
) argument after ** must be a mapping, not list