This notebook demonstrates the full process of `SemanticCodeSearch` using fine-tuned GraphCodeBERT model, which implement the code-to-code search.

### Download test repositories and run `inspect4py` on them

In [1]:
# Repository picked from https://github.com as an example
repo = 'keon/algorithms'

In [2]:
!inspect4py --version

inspect4py, version 0.0.6


!mkdir -p content/output
%cd content/

!mkdir -p {repo} && git clone {f"https://github.com/{repo}.git"} {repo}
!inspect4py -i {repo} -o output/{repo} -sc -rm

### Extract docstrings and functions from repositories.

In [3]:
import json

def funcs_to_lists(funcs, func_codes, docs):
    for func_name, func_info in funcs.items():
        if func_info.get("source_code") is not None:
            func_codes.append(func_info["source_code"])
        if func_info.get("doc") is None:
            continue
        for key in ["full", "long_description", "short_description"]:
            if func_info["doc"].get(key) is not None:
                docs.append(f"{func_name} {func_info['doc'].get(key)}")
                break

def file_to_lists(filename):
    func_codes = []
    docs = []
    with open(filename, "r") as f:
        dic = json.load(f)
    dic.pop("readme_files", None)
    for dir_name, files in dic.items():
        for file in files:
            if file.get("functions") is not None:
                funcs_to_lists(file["functions"], func_codes, docs)
            if file.get("classes") is not None:
                for class_name, class_info in file["classes"].items():
                    if class_info.get("methods") is not None:
                        funcs_to_lists(class_info["methods"], func_codes, docs)
    return func_codes, docs

In [4]:
repo_info = {}
function_list, docstring_list = file_to_lists(f"content/output/{repo}/directory_info.json")
# repo_info["docs"] = docstring_list
repo_info["funcs"] = function_list

### Download UniXCoder, fine-tuned model and install requirements

In [5]:
# !wget https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

### Generate embeddings for all repositories

In [6]:
import torch
from unixcoder import UniXcoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
code_search_model = UniXcoder("Lazyhope/unixcoder-nine-advtest")
clone_detection_model = UniXcoder("Lazyhope/unixcoder-clone-detection")
code_search_model.to(device)
clone_detection_model.to(device)

def get_code_embeddings(code, model):
    tokens_ids = model.tokenize([code], max_length=512, mode="<encoder-only>")
    source_ids = torch.tensor(tokens_ids).to(device)
    _, embeddings = code_search_model(source_ids)

    return embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from tqdm import tqdm

# Generate code embeddings for all funcs in repository
print(f" - Generating func embeddings for repo - ")
code_embeddings = []
for func in tqdm(repo_info["funcs"]):
    code_embeddings.append(get_code_embeddings(func, clone_detection_model))    

 - Generating func embeddings for repo - 


100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [01:02<00:00, 18.81it/s]


### Evaluations & Results

In [8]:
query = """
def test_topsort(self):
    res_recursive = top_sort_recursive(self.depGraph)
    self.assertTrue(res_recursive.index('g') < res_recursive.index('e'))
    
    res_iterative = top_sort(self.depGraph)
    self.assertTrue(res_iterative.index('g') < res_iterative.index('e'))

"""

In [22]:
from torch.nn import CosineSimilarity

input_embedding = get_code_embeddings(query, clone_detection_model)

cosine_sim = CosineSimilarity(dim=1, eps=1e-8)
# cosine_sim = CosineSimilarity(dim=1)
similarities = cosine_sim(input_embedding, torch.stack(code_embeddings))

In [27]:
# Convert similarities tensor to a list
similarities_list = similarities.tolist()

# Combine functions with cosine similarities
func_similarities = list(zip(repo_info["funcs"], similarities_list))

# Sort the func_similarities list based on cosine similarities
sorted_similarities = sorted(func_similarities, key=lambda x: x[1], reverse=True)

num_similar_funcs = 5  # Specify the number of similar functions to retrieve
most_similar_funcs = sorted_similarities[:num_similar_funcs]

# Extract the function names from the most_similar_funcs list
similar_func_names = [func for func, _ in most_similar_funcs]

# Output the function names
print('Similiar code snippets:')
for func_name in similar_func_names:
    print(f'\n------------------------------------------------------------------\n {func_name}')

Similiar code snippets:

------------------------------------------------------------------
 def test_topsort(self):
    res = top_sort_recursive(self.depGraph)
    self.assertTrue(res.index('g') < res.index('e'))
    res = top_sort(self.depGraph)
    self.assertTrue(res.index('g') < res.index('e'))

------------------------------------------------------------------
 def test_gnome_sort(self):
    self.assertTrue(is_sorted(gnome_sort([1, 3, 2, 5, 65, 23, 57, 1232])))

------------------------------------------------------------------
 def dfs_transposed(vertex, graph, order, visited):
    """
    Perform a depth first search traversal of the graph starting at the given vertex.
    Stores the order in which nodes were visited to the list, in transposed order.
    """
    visited[vertex] = True
    for adjacent in graph[vertex]:
        if not visited[adjacent]:
            dfs_transposed(adjacent, graph, order, visited)
    order.append(vertex)

-----------------------------------------