This notebook demonstrates the full process of `Sematic Code Search`, which contains both `code2code` and `text2code` search paradigms.

### Prepare Python environment for Code Search Engine

In [None]:
!pip install -r ../requirements.txt

# Part 1. Prepare database for search engine

### Download test repository example and run `inspect4py` on it

In [1]:
# Repository picked from https://github.com as an example
repo = 'keon/algorithms'

In [2]:
!mkdir -p content/output
%cd content/

!mkdir -p {repo} && git clone {f"https://github.com/{repo}.git"} {repo}
!inspect4py -i {repo} -o output/{repo} -sc -rm
%cd ..

/cs/home/cd271/Documents/Project/Examples/SemanticCodeSearch/CodeSearchEngine/content
fatal: destination path 'keon/algorithms' already exists and is not an empty directory.
Error when processing stooge_sort.py:  <class 'AttributeError'>
Error when processing randomized_set.py:  <class 'AttributeError'>
Error when processing search_in_sorted_matrix.py:  <class 'AttributeError'>
Error when processing sum_sub_squares.py:  <class 'AttributeError'>
Error when processing longest_increasing.py:  <class 'AttributeError'>
Error when processing stack.py:  <class 'AttributeError'>
Error when processing polynomial.py:  <class 'AttributeError'>
Error when processing generate_parenthesis.py:  <class 'AttributeError'>
Error when processing array_sum_combinations.py:  <class 'AttributeError'>
Error when processing find_words.py:  <class 'AttributeError'>
Error when processing add_operators.py:  <class 'AttributeError'>
Error when processing generate_abbreviations.py:  <class 'AttributeError'>
Error w

In [3]:
from data import file_to_lists

repo_info = {}
function_list = file_to_lists(f"content/output/{repo}/directory_info.json")
repo_info["funcs"] = function_list

# Part 2. Code-to-code Search Engine

In [4]:
from model import get_code_embeddings, get_cos_similarity, retrieve_topN
from model import code_search_model
from tqdm import tqdm
import torch

class Code2CodeSearchEngine:
    def __init__(self, repo_info):
        self.repo_info = repo_info
        self.code_embeddings = self.compute_code_embeddings()

    def compute_code_embeddings(self):
        code_embeddings = []
        print("Generating code embeddings for dataset ... ")
        for func in tqdm(self.repo_info["funcs"]):
            code_embeddings.append(get_code_embeddings(func, code_search_model))
        print("Dataset code embeddings generated!")
        return code_embeddings

    def search(self, query, n):
        input_embedding = get_code_embeddings(query, code_search_model)
        similarities = get_cos_similarity(input_embedding, torch.stack(self.code_embeddings))
        similar_func_names = retrieve_topN(self.repo_info, similarities, n)

        print(f'The most similar {n} code snippets:')
        for func_name in similar_func_names:
            print(f'\n------------------------------------------------------------------\n {func_name}')

# Instantiate the CodeSearchEngine and compute code_embeddings
se_pl = Code2CodeSearchEngine(repo_info)

  from .autonotebook import tqdm as notebook_tqdm


Generating code embeddings for dataset ... 


100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [01:00<00:00, 19.25it/s]

Dataset code embeddings generated!





In [5]:
from IPython.core.magic import (register_line_magic, register_cell_magic)

@register_cell_magic
def search_by_code(line, cell):
    n = int(input("How many similar code snippets you want to retrieve: "))
    se_pl.search(cell, n)

In [6]:
%%search_by_code
"""
def dfs(graph, start_node, visited):
    if start_node not in visited:
        # Mark the current node as visited.
        visited.append(start_node)
        print("Visited:", start_node)
        # Explore all the adjacent nodes.
        for neighbor in graph[start_node]:
            dfs(graph, neighbor, visited)
"""

How many similar code snippets you want to retrieve:  5


The most similar 5 code snippets:

------------------------------------------------------------------
 def dfs_traverse(graph, start):
    """
    Traversal by depth first search.
    """
    (visited, stack) = (set(), [start])
    while stack:
        node = stack.pop()
        if node not in visited:
            visited.add(node)
            for next_node in graph[node]:
                if next_node not in visited:
                    stack.append(next_node)
    return visited

------------------------------------------------------------------
 def dfs_traverse_recursive(graph, start, visited=None):
    """
    Traversal by recursive depth first search.
    """
    if visited is None:
        visited = set()
    visited.add(start)
    for next_node in graph[start]:
        if next_node not in visited:
            dfs_traverse_recursive(graph, next_node, visited)
    return visited

------------------------------------------------------------------
 def bfs_traverse(graph, start):
   

# Part 3. Text-to-code Search Engine

In [7]:
class Text2CodeSearchEngine:
    def __init__(self, repo_info):
        self.repo_info = repo_info
        self.code_embeddings = self.compute_code_embeddings()

    def compute_code_embeddings(self):
        code_embeddings = []
        print("Generating code embeddings for dataset ... ")
        for func in tqdm(self.repo_info["funcs"]):
            code_embeddings.append(get_code_embeddings(func, code_search_model))
        print("Dataset code embeddings generated!")
        
        code_embeddings = torch.stack([torch.tensor(embedding) for embedding in code_embeddings])
        return torch.squeeze(code_embeddings)

    def search(self, query, n):
        input_embedding = get_code_embeddings(query, code_search_model)
        similarities = get_cos_similarity(input_embedding, self.code_embeddings)
        similar_func_names = retrieve_topN(self.repo_info, similarities, n)

        print(f'The most similar {n} code snippets:')
        for func_name in similar_func_names:
            print(f'\n------------------------------------------------------------------\n {func_name}')

# Instantiate the CodeSearchEngine and compute code_embeddings
se_nl = Text2CodeSearchEngine(repo_info)

Generating code embeddings for dataset ... 


100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [00:59<00:00, 19.64it/s]
  code_embeddings = torch.stack([torch.tensor(embedding) for embedding in code_embeddings])


Dataset code embeddings generated!


In [10]:
@register_cell_magic
def search_by_text(line, cell):
    n = int(input("How many similar code snippets you want to retrieve: "))
    se_nl.search(cell, n)

In [11]:
%%search_by_text
Function to calcualte cosine similarity

How many similar code snippets you want to retrieve:  5


The most similar 5 code snippets:

------------------------------------------------------------------
 def test_cosine_similarity(self):
    vec_a = [1, 1, 1]
    vec_b = [-1, -1, -1]
    vec_c = [1, 2, -1]
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_a), 1)
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_b), -1)
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_c), 0.4714045208)

------------------------------------------------------------------
 def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between given two vectors
    :type vec1: list
    :type vec2: list
    """
    if len(vec1) != len(vec2):
        raise ValueError('The two vectors must be the same length. Got shape ' + str(len(vec1)) + ' and ' + str(len(vec2)))
    norm_a = _l2_distance(vec1)
    norm_b = _l2_distance(vec2)
    similarity = 0.0
    for (vec1_element, vec2_element) in zip(vec1, vec2):
        similarity += vec1_element * vec2_element
    similarity /= norm_a 