## This notebook demonstrates the full `SemanticCodeSearch` pipeline, which uses a fine-tuned `UnixCoder` model to implement a code-to-code search engine.

### Prepare dataset for search engine

In [1]:
from data import file_to_lists

# Repository whose extracted functions form the search corpus.
repo = 'keon/algorithms'

# Load the repository's entries from its directory_info.json and keep them
# under the "funcs" key, which the later cells read.
# NOTE(review): presumably each entry is one function's source text -- confirm
# against data.file_to_lists.
function_list = file_to_lists(f"content/output/{repo}/directory_info.json")
repo_info = {"funcs": function_list}

### Generate embeddings for all functions in the repository

In [2]:
from tqdm import tqdm
from model import get_code_embeddings, get_cos_similarity, retrieve_topN

# Build one embedding per entry of repo_info["funcs"], in the same order;
# tqdm wraps the iterable so progress is shown while the model runs.
print("Generating code embeddings for dataset ... ")
code_embeddings = [
    get_code_embeddings(func) for func in tqdm(repo_info["funcs"])
]

print("Dataset code embeddings generated!")

  from .autonotebook import tqdm as notebook_tqdm


Generating code embeddings for dataset ... 


100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [01:05<00:00, 17.83it/s]

Dataset code embeddings generated!





### Query Example:
```
query = """
def test_topsort(self):
    res_recursive = top_sort_recursive(self.depGraph)
    self.assertTrue(res_recursive.index('g') < res_recursive.index('e'))
    
    res_iterative = top_sort(self.depGraph)
    self.assertTrue(res_iterative.index('g') < res_iterative.index('e'))

"""
```

In [3]:
# Read the code snippet to search for from the user.
# NOTE(review): input() returns a single line of text; in the recorded run the
# multi-line example snippet was entered as one flattened line.
query = input("Please input the code snippet you want to search: \n")

# Embed the query with the same helper that was used for the dataset functions.
print("\nGenerating embeddings for query code ...")
input_embedding = get_code_embeddings(query)
print("Input code snippet embedding generated!")

Please input the code snippet you want to search: 
 def test_topsort(self):     res_recursive = top_sort_recursive(self.depGraph)     self.assertTrue(res_recursive.index('g') < res_recursive.index('e'))          res_iterative = top_sort(self.depGraph)     self.assertTrue(res_iterative.index('g') < res_iterative.index('e'))



Generating embeddings for query code ...
Input code snippet embedding generated!


In [4]:
# Compare the query embedding against the list of dataset embeddings.
# NOTE(review): presumably yields one cosine-similarity score per dataset
# function, in the same order as code_embeddings -- confirm in
# model.get_cos_similarity.
print("Calulating the similarity...")
similarities = get_cos_similarity(input_embedding, code_embeddings)

Calulating the similarity...


In [6]:
# Ask how many results to show; int() raises ValueError on non-numeric input.
n = int(input("How many similar code snippets you want to retrieve: "))

# Look up the n best matches for the query from the similarity scores.
# NOTE(review): despite the variable name, the recorded output shows each entry
# printed as full function source rather than a bare function name.
similar_func_names = retrieve_topN(repo_info, similarities, n)

# The header must be an f-string: without the prefix it printed a literal "{n}"
# instead of the requested count.
print(f'The most similiar {n} code snippets:')
for func_name in similar_func_names:
    print(f'\n------------------------------------------------------------------\n {func_name}')

How many similar code snippets you want to retrieve:  5


The most similiar {n} code snippets:

------------------------------------------------------------------
 def test_topsort(self):
    res = top_sort_recursive(self.depGraph)
    self.assertTrue(res.index('g') < res.index('e'))
    res = top_sort(self.depGraph)
    self.assertTrue(res.index('g') < res.index('e'))

------------------------------------------------------------------
 def dfs(res, root, cur):
    if root.left is None and root.right is None:
        res.append(cur)
    if root.left:
        dfs(res, root.left, cur + '->' + str(root.left.val))
    if root.right:
        dfs(res, root.right, cur + '->' + str(root.right.val))

------------------------------------------------------------------
 def string_reverse(s):
    return s[::-1]

------------------------------------------------------------------
 def rotate_right(self):
    """
        Right rotation
        """
    new_root = self.node.left.node
    new_left_sub = new_root.right.node
    old_root = self.node
    self.nod

In [None]:
# from IPython.core.magic import (register_line_magic, register_cell_magic, register_line_cell_magic)
# @register_cell_magic
# def search(line, cell):
#     return se.search(cell)