In [1]:
# Repository slug in "owner/name" form; it is interpolated into the path of the
# directory_info.json file that is loaded at the end of this cell.
repo = 'keon/algorithms'

import json

def funcs_to_lists(funcs, func_codes, docs):
    """Append source code and doc strings from a function table to the given lists.

    Args:
        funcs: mapping of function name -> info dict. The info dict may carry a
            "source_code" entry and a "doc" dict.
        func_codes: list extended in place with every non-None "source_code".
        docs: list extended in place with "<name> <doc text>", where the doc
            text is the first non-None value among the "full",
            "long_description" and "short_description" keys of "doc".

    Returns:
        None; both lists are mutated in place.
    """
    doc_keys = ("full", "long_description", "short_description")
    for name in funcs:
        info = funcs[name]

        source = info.get("source_code")
        if source is not None:
            func_codes.append(source)

        doc = info.get("doc")
        if doc is None:
            continue

        # Most detailed description wins; entries with no usable text are skipped.
        text = next((doc.get(k) for k in doc_keys if doc.get(k) is not None), None)
        if text is not None:
            docs.append(f"{name} {text}")

def file_to_lists(filename):
    """Load a directory-info JSON file and return the source code of its functions.

    The top-level JSON object maps directory names to lists of file records.
    Each file record may have a "functions" table and a "classes" table whose
    entries may in turn have a "methods" table. Every table is passed to
    funcs_to_lists. The top-level "readme_files" entry, if present, is ignored.

    Args:
        filename: path of the JSON file to read.

    Returns:
        list of source-code strings for all functions and methods found.
    """
    func_codes = []
    # Doc strings are collected by funcs_to_lists as well, but only the source
    # code list is returned to the caller.
    docs = []
    # Read as UTF-8 explicitly so that non-ASCII characters in the stored source
    # code do not depend on the platform's default locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        dic = json.load(f)
    # "readme_files" is not a directory -> file-records entry; drop it before iterating.
    dic.pop("readme_files", None)
    for files in dic.values():
        for file in files:
            functions = file.get("functions")
            if functions is not None:
                funcs_to_lists(functions, func_codes, docs)
            classes = file.get("classes")
            if classes is not None:
                for class_info in classes.values():
                    methods = class_info.get("methods")
                    if methods is not None:
                        funcs_to_lists(methods, func_codes, docs)
    return func_codes
    
# Per-repository container; "funcs" holds the source code of every function
# and method listed in the repository's directory_info.json.
repo_info = {}
function_list = file_to_lists(f"content/output/{repo}/directory_info.json")
repo_info.update(funcs=function_list)

In [2]:
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from transformers import AutoTokenizer, AutoModel

import torch
import torch.nn as nn
# Tokenizer loaded from the "microsoft/codebert-base" checkpoint; the cells
# visible below use it to tokenize both the code snippets and the query.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
# NOTE(review): `model` is not referenced by any cell visible here (they all
# call `model2`) — confirm it is still needed before keeping this download.
model = AutoModel.from_pretrained("clda/codebert-python")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Encoder used by the cells below to embed both the repository functions and
# the search query.
model2 = AutoModel.from_pretrained("clda/graphcodebert-python")

In [4]:
from tqdm import tqdm

print("Generating code embeddings for dataset ... ")
code_embeddings = []
# Inference only: without no_grad every stored embedding would keep its whole
# autograd graph alive for the lifetime of the list.
with torch.no_grad():
    for func in tqdm(repo_info["funcs"]):
        # truncation=True makes the 512-token cap explicit. The tokenizer
        # already truncated when only max_length was given, but it emitted a
        # warning about the implicit 'longest_first' fallback.
        input_ids = tokenizer(func, return_tensors='pt', max_length=512, truncation=True)['input_ids']
        # Element 1 of the model output is kept as the embedding of the function
        # (presumably the pooled sequence representation — confirm against the
        # model's output documentation).
        code_embeddings.append(model2(input_ids)[1])

print("Dataset code embeddings generated!")

Generating code embeddings for dataset ... 


  0%|                                                                                          | 0/1171 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [01:22<00:00, 14.12it/s]

Dataset code embeddings generated!





In [5]:
# Stack the per-function embeddings into one tensor. detach().clone() gives the
# same gradient-free copy that torch.tensor(tensor) produced, without the
# copy-construct warning.
code_embeddings = torch.stack([embedding.detach().clone() for embedding in code_embeddings])

# Remove only the per-call batch axis (axis 1). A bare squeeze() would also
# collapse the first axis when there is exactly one function, leaving a 1-D
# tensor that the 2-D similarity einsum in the next cell cannot consume.
code_vecs = code_embeddings.squeeze(1)

  code_embeddings = torch.stack([torch.tensor(embedding) for embedding in code_embeddings])


In [6]:
query = "Function to calcualte cosine similarity"
# Embed the query with the same tokenizer/model pair as the code snippets.
# no_grad: this is pure inference, no autograd graph is needed.
# truncation/max_length mirror the limits applied to the code inputs so an
# over-long query cannot exceed the 512-token cap used there.
with torch.no_grad():
    query_vec = model2(tokenizer(query, return_tensors='pt', max_length=512, truncation=True)['input_ids'])[1]

# Dot product between the query vector and every code vector, then a softmax
# over the candidates, so the printed "Similarity" values sum to 1 across all
# functions (they are relative scores, not cosine similarities).
scores = torch.einsum("ab,cb->ac", query_vec, code_vecs)
scores = torch.softmax(scores, -1)

# topk raises when k exceeds the number of candidates; cap it for small repos.
top_k = min(3, scores.shape[-1])
top_scores, top_indices = torch.topk(scores[0], k=top_k, largest=True)

print("Query:", query)
for score, idx in zip(top_scores, top_indices):
    # Despite its name, this holds the source text of the matched function.
    code_embedding = repo_info["funcs"][idx.item()]
    print("Code:", code_embedding)
    print("Similarity:", score.item())

Query: Function to calcualte cosine similarity
Code: def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between given two vectors
    :type vec1: list
    :type vec2: list
    """
    if len(vec1) != len(vec2):
        raise ValueError('The two vectors must be the same length. Got shape ' + str(len(vec1)) + ' and ' + str(len(vec2)))
    norm_a = _l2_distance(vec1)
    norm_b = _l2_distance(vec2)
    similarity = 0.0
    for (vec1_element, vec2_element) in zip(vec1, vec2):
        similarity += vec1_element * vec2_element
    similarity /= norm_a * norm_b
    return similarity
Similarity: 0.9495675563812256
Code: def test_cosine_similarity(self):
    vec_a = [1, 1, 1]
    vec_b = [-1, -1, -1]
    vec_c = [1, 2, -1]
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_a), 1)
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_b), -1)
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_c), 0.4714045208)
Similarity: 0.0504324734210968
Code: def scc(graph)