In [13]:
import dask 
import numpy as np
import dask.dataframe as dd
import pandas as pd
import time 
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, Dataset


In [14]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.special import softmax
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances, cosine_similarity

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [15]:
from sentence_transformers import SentenceTransformer

def encode_code(code, model):
    model = SentenceTransformer(model)
    code = code.to_list()
    return model.encode(code)


In [16]:



def get_cosine_similarity(embeddings_1, embeddings_2):
    similarity = []
    for i in range(len(embeddings_1)):
        sim = cosine_similarity(embeddings_1[i].reshape(1, -1), embeddings_2[i].reshape(1, -1))[0][0]
        if sim > 0.5:
            similarity.append(1)
        similarity.append(0)
    return similarity

def get_soft_cosine_similarity(embeddings_1, embeddings_2):
    similarity = []
    for i in range(len(embeddings_1)):
        sim = cosine_similarity(embeddings_1[i].reshape(1, -1), embeddings_2[i].reshape(1, -1))[0][0]
        sim = softmax(sim)
        if sim > 0.5:
            similarity.append(1)
        similarity.append(0)
    return similarity


def get_similarity(df, model):
    code_1 = df['code1']
    code_2 = df['code2']
    embeddings_1 = []
    embeddings_2 = []
    embeddings_1 = code_1.map_partitions(encode_code, model).compute(scheduler='processes')
    embeddings_2 = code_2.map_partitions(encode_code, model).compute(scheduler='processes')
    # save the embeddings locally
    np.save(f"../data/embed/{model.replace('/', '_')}_embeddings_1.npy", embeddings_1)
    np.save(f"../data/embed/{model.replace('/', '_')}_embeddings_2.npy", embeddings_2)
    
    # return get_cosine_similarity(embeddings_1, embeddings_2)
    return [embeddings_1, embeddings_2]

def validate(sim, y):
    accuracy = accuracy_score(y, sim)
    print(f"Accuracy: {accuracy}")
    f1 = f1_score(y, sim)
    print(f"F1: {f1}")
    precision = precision_score(y, sim)
    print(f"Precision: {precision}")
    recall = recall_score(y, sim)
    print(f"Recall: {recall}")
    return accuracy, f1, precision, recall

def peform_experiment(ddf, model):
    temp_df = ddf.copy()
    # scatter the data
    temp_df = temp_df.persist()
    sim = get_similarity(temp_df, model)
    # return validate(sim, y)
    return sim





ddf = dd.read_parquet("../data/val/clone-detection-600k-5fold.parquet") 
ddf_100 = ddf.copy()
models = [
    'sentence-transformers/all-MiniLM-L12-v1',
    "davanstrien/code-prompt-similarity-model",
    'annakotarba/sentence-similarity',
]
# copy the keys of the models dictionary to benchmark
benchmark_dict = {}

for model in models:
    print(f"Model: {model}")
    start = time.time()
    benchmark_dict[model] = peform_experiment(ddf_100, model)
    print(f"Time: {time.time()-start}")

# 

Model: sentence-transformers/all-MiniLM-L12-v1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
ddf = dd.read_parquet("../data/val/clone-detection-600k-5fold.parquet") 
ddf_100 = ddf.sample(frac=0.006)

In [5]:
import torch
from torch.utils.data import Dataset
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")

def tokenize_pair(code_1, code_2):
    return tokenizer(code_1, code_2, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

class CodePairsDataset(Dataset):
    def __init__(self, codes1, codes2, labels):
        self.codes1 = codes1
        self.codes2 = codes2
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = tokenizer(self.codes1[idx], self.codes2[idx], return_tensors='pt', padding='max_length', truncation=True, max_length=512)
        return {**{k: v.squeeze(0) for k, v in encoding.items()}, 'labels': torch.tensor(self.labels[idx])}


In [9]:
ddf_100 
y_test = ddf_100['similar']

In [10]:

model = RobertaForSequenceClassification.from_pretrained('../tuned-code-bert', num_labels=2)
val_dataset = CodePairsDataset(ddf_100 ['code1'].values, ddf_100 ['code2'].values, y_test.values)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
device = "cpu"
predictions, true_labels = [], []
for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, axis=-1).tolist())
        true_labels.extend(batch['labels'].tolist())




NameError: name 'accuracy_score' is not defined

In [12]:
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
print(f"Validation Accuracy: {accuracy}")
print(f"Validation Precision: {precision}")
print(f"Validation Recall: {recall}")
print(f"Validation F1: {f1}")


Validation Accuracy: 0.93188280380764
Validation Precision: 0.9539267015706806
Validation Recall: 0.906693207265489
Validation F1: 0.9297104222477357
