This notebook demonstrates the full process of `Semantic Code Search`, which covers both the `code2code` and `text2code` search paradigms.

### Prepare Python environment for Code Search Engine

In [None]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel; `!pip` runs whichever pip happens to be first on the shell PATH.
%pip install -r requirements.txt

# Part 1. Prepare database for search engine

### Download test repository example and run `inspect4py` on it

In [1]:
# Example repository to index, written as "<owner>/<name>" on https://github.com
repo = "keon/algorithms"

In [2]:
# Work inside content/ so the checkout and the inspect4py output live side by side.
!mkdir -p content/output
%cd content/

# Clone only when no checkout exists yet: `git clone` refuses a non-empty target
# directory, which would make this cell fail when it is re-run.
!test -d {repo}/.git || (mkdir -p {repo} && git clone {f"https://github.com/{repo}.git"} {repo})
# Run inspect4py on the checkout; its output is written under output/{repo}.
!inspect4py -i {repo} -o output/{repo} -sc -rm
# Return to the notebook directory so later relative paths resolve as before.
%cd ..

/cs/home/cd271/Documents/Project/Examples/SemanticCodeSearch/notebook/content
Cloning into 'keon/algorithms'...
remote: Enumerating objects: 5162, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 5162 (delta 11), reused 16 (delta 3), pack-reused 5136[K
Receiving objects: 100% (5162/5162), 1.42 MiB | 7.52 MiB/s, done.
Resolving deltas: 100% (3230/3230), done.
Updating files: 100% (477/477), done.
Creating jsDir:output/keon/algorithms/algorithms/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/streaming/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/maths/json_files
Error when processing polynomial.py:  <class 'AttributeError'>
Creating jsDir:output/keon/algorithms/algorithms/algorithms/greedy/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/queues/json_files
Error when proces

In [3]:
import sys
sys.path.append("..")
from search_engine import data_prepare

# Read the inspect4py directory summary for `repo` and keep the resulting
# function list under the "funcs" key, the structure handed to the search engines.
function_list = data_prepare.file_to_lists(f"content/output/{repo}/directory_info.json")
repo_info = {"funcs": function_list}

# Part 2. Code-To-Code Search Engine Interface

In [5]:
from search_engine import model

# Instantiate the Code2CodeSearchEngine on the prepared repository data
# (`repo_info`). Construction also computes the code embeddings for the whole
# dataset, so this cell takes a while to run (see the progress bar below).
se_pl = model.Code2CodeSearchEngine(repo_info)

Generating code embeddings for dataset ... 


100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [01:11<00:00, 16.45it/s]

Dataset code embeddings generated!





In [6]:
from IPython.core.magic import (register_line_magic, register_cell_magic)

@register_cell_magic
def search_by_code(line, cell):
    """Cell magic: search the indexed repository for code similar to the cell body.

    Usage::

        %%search_by_code [n]
        <code snippet used as the query>

    Args:
        line: Text after the magic name on its first line. When non-empty it is
            parsed as the number of snippets to retrieve, so the cell can run
            without a prompt (e.g. "Run All" or headless execution).
        cell: The cell body, passed unchanged to ``se_pl.search`` as the query.

    Raises:
        ValueError: If the count (from the magic line or the prompt) is not an
            integer.
    """
    count_text = line.strip()
    if not count_text:
        # No count on the magic line: fall back to the interactive prompt.
        count_text = input("How many similar code snippets you want to retrieve: ")
    se_pl.search(cell, int(count_text))

In [7]:
%%search_by_code
"""
def dfs(graph, start_node, visited):
    if start_node not in visited:
        # Mark the current node as visited.
        visited.append(start_node)
        print("Visited:", start_node)
        # Explore all the adjacent nodes.
        for neighbor in graph[start_node]:
            dfs(graph, neighbor, visited)
"""

How many similar code snippets you want to retrieve:  5


The most similar 5 code snippets:

------------------------------------------------------------------
 def dfs_traverse(graph, start):
    """
    Traversal by depth first search.
    """
    (visited, stack) = (set(), [start])
    while stack:
        node = stack.pop()
        if node not in visited:
            visited.add(node)
            for next_node in graph[node]:
                if next_node not in visited:
                    stack.append(next_node)
    return visited

------------------------------------------------------------------
 def dfs_traverse_recursive(graph, start, visited=None):
    """
    Traversal by recursive depth first search.
    """
    if visited is None:
        visited = set()
    visited.add(start)
    for next_node in graph[start]:
        if next_node not in visited:
            dfs_traverse_recursive(graph, next_node, visited)
    return visited

------------------------------------------------------------------
 def bfs_traverse(graph, start):
   

# Part 3. Text-To-Code Search Engine Interface

In [8]:
# Instantiate the Text2CodeSearchEngine on the same prepared repository data
# (`repo_info`). Construction also computes the code embeddings for the whole
# dataset, so this cell takes a while to run (see the progress bar below).
se_nl = model.Text2CodeSearchEngine(repo_info)

Generating code embeddings for dataset ... 


100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [01:10<00:00, 16.50it/s]
  code_embeddings = torch.stack([torch.tensor(embedding) for embedding in code_embeddings])


Dataset code embeddings generated!


In [9]:
@register_cell_magic
def search_by_text(line, cell):
    """Cell magic: search the indexed repository for code matching a text query.

    Usage::

        %%search_by_text [n]
        <natural-language description used as the query>

    Args:
        line: Text after the magic name on its first line. When non-empty it is
            parsed as the number of snippets to retrieve, so the cell can run
            without a prompt (e.g. "Run All" or headless execution).
        cell: The cell body, passed unchanged to ``se_nl.search`` as the query.

    Raises:
        ValueError: If the count (from the magic line or the prompt) is not an
            integer.
    """
    count_text = line.strip()
    if not count_text:
        # No count on the magic line: fall back to the interactive prompt.
        count_text = input("How many similar code snippets you want to retrieve: ")
    se_nl.search(cell, int(count_text))

In [10]:
%%search_by_text
Function to calcualte cosine similarity

How many similar code snippets you want to retrieve:  5


The most similar 5 code snippets:

------------------------------------------------------------------
 def test_cosine_similarity(self):
    vec_a = [1, 1, 1]
    vec_b = [-1, -1, -1]
    vec_c = [1, 2, -1]
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_a), 1)
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_b), -1)
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_c), 0.4714045208)

------------------------------------------------------------------
 def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between given two vectors
    :type vec1: list
    :type vec2: list
    """
    if len(vec1) != len(vec2):
        raise ValueError('The two vectors must be the same length. Got shape ' + str(len(vec1)) + ' and ' + str(len(vec2)))
    norm_a = _l2_distance(vec1)
    norm_b = _l2_distance(vec2)
    similarity = 0.0
    for (vec1_element, vec2_element) in zip(vec1, vec2):
        similarity += vec1_element * vec2_element
    similarity /= norm_a 