This notebook implement Code-To-Code Search Engine with fine-tuned `UniXcoder` model.

### Download test repositories and run `inspect4py` on them

In [1]:
# Repository picked from https://github.com as an example
repo = 'keon/algorithms'

In [2]:
!inspect4py --version

inspect4py, version 0.0.6


In [3]:
!mkdir -p content/output
%cd content/

!mkdir -p {repo} && git clone {f"https://github.com/{repo}.git"} {repo}
!inspect4py -i {repo} -o output/{repo} -sc -rm
%cd ..

/cs/home/cd271/Documents/Project/Examples/SemanticCodeSearch/search_engine/Code2Code/Bi-Encoders/content
fatal: destination path 'keon/algorithms' already exists and is not an empty directory.
Error when processing test_monomial.py:  <class 'AttributeError'>
Error when processing test_polynomial.py:  <class 'AttributeError'>
Error when processing stooge_sort.py:  <class 'AttributeError'>
Error when processing walls_and_gates.py:  <class 'AttributeError'>
Error when processing pacific_atlantic.py:  <class 'AttributeError'>
Error when processing count_islands.py:  <class 'AttributeError'>
Error when processing separate_chaining_hashtable.py:  <class 'AttributeError'>
Error when processing hashtable.py:  <class 'AttributeError'>
Error when processing stack.py:  <class 'AttributeError'>
Added in funct/method elias_generic , argument named unary, number of argument 0
Added in funct/method elias_generic , argument named elias_gamma, number of argument 0
Error when processing search_in_sorted

### Extract docstrings and functions from repositories.

In [4]:
import json

def funcs_to_lists(funcs, func_codes, docs):
    for func_name, func_info in funcs.items():
        if func_info.get("source_code") is not None:
            func_codes.append(func_info["source_code"])
        if func_info.get("doc") is None:
            continue
        for key in ["full", "long_description", "short_description"]:
            if func_info["doc"].get(key) is not None:
                docs.append(f"{func_name} {func_info['doc'].get(key)}")
                break

def file_to_lists(filename):
    func_codes = []
    docs = []
    with open(filename, "r") as f:
        dic = json.load(f)
    dic.pop("readme_files", None)
    for dir_name, files in dic.items():
        for file in files:
            if file.get("functions") is not None:
                funcs_to_lists(file["functions"], func_codes, docs)
            if file.get("classes") is not None:
                for class_name, class_info in file["classes"].items():
                    if class_info.get("methods") is not None:
                        funcs_to_lists(class_info["methods"], func_codes, docs)
    return func_codes, docs

In [5]:
repo_info = {}
function_list, docstring_list = file_to_lists(f"content/output/{repo}/directory_info.json")
repo_info["funcs"] = function_list

### Download UniXcoder, use fine-tuned UniXcoder model for code NL-NL embeddings

In [6]:
# !wget https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

### Generate embeddings for all repositories

In [7]:
import torch
from unixcoder import UniXcoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
code_search_model = UniXcoder("Lazyhope/unixcoder-nine-advtest")
clone_detection_model = UniXcoder("Lazyhope/unixcoder-clone-detection")
code_search_model.to(device)
clone_detection_model.to(device)

def get_code_embeddings(code, model):
    tokens_ids = model.tokenize([code], max_length=512, mode="<encoder-only>")
    source_ids = torch.tensor(tokens_ids).to(device)
    _, embeddings = model(source_ids)

    return embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from tqdm import tqdm

# Generate code embeddings for all funcs in repository
print(f" - Generating func embeddings for repo - ")
code_embeddings = []
for func in tqdm(repo_info["funcs"]):
    code_embeddings.append(get_code_embeddings(func, clone_detection_model))    

 - Generating func embeddings for repo - 


100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [01:02<00:00, 18.83it/s]


### Test & Results

In [9]:
query = """
def bfs(graph, start):
    visited = set()
    queue = deque([start])
    visited.add(start)
    result = []
    while queue:
        node = queue.popleft()
        result.append(node)

        for neighbor in graph.get(node, []):
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)
    return result
"""

In [10]:
from torch.nn import CosineSimilarity

input_embedding = get_code_embeddings(query, clone_detection_model)

cosine_sim = CosineSimilarity(dim=1, eps=1e-8)
similarities = cosine_sim(input_embedding, torch.stack(code_embeddings))

In [11]:
# Convert similarities tensor to a list
similarities_list = similarities.tolist()

# Combine functions with cosine similarities
func_similarities = list(zip(repo_info["funcs"], similarities_list))

# Sort the func_similarities list based on cosine similarities
sorted_similarities = sorted(func_similarities, key=lambda x: x[1], reverse=True)

num_similar_funcs = 5  # Specify the number of similar functions to retrieve
most_similar_funcs = sorted_similarities[:num_similar_funcs]

# Extract the function names from the most_similar_funcs list
similar_func_names = [func for func, _ in most_similar_funcs]

# Output the function names
print('Similiar code snippets:')
for func_name in similar_func_names:
    print(f'\n------------------------------------------------------------------\n {func_name}')

Similiar code snippets:

------------------------------------------------------------------
 def dfs_traverse(graph, start):
    """
    Traversal by depth first search.
    """
    (visited, stack) = (set(), [start])
    while stack:
        node = stack.pop()
        if node not in visited:
            visited.add(node)
            for next_node in graph[node]:
                if next_node not in visited:
                    stack.append(next_node)
    return visited

------------------------------------------------------------------
 def bfs_traverse(graph, start):
    """
    Traversal by breadth first search.
    """
    (visited, queue) = (set(), [start])
    while queue:
        node = queue.pop(0)
        if node not in visited:
            visited.add(node)
            for next_node in graph[node]:
                if next_node not in visited:
                    queue.append(next_node)
    return visited

------------------------------------------------------------------
 def 