This notebook demonstrates the full `SemanticCodeSearch` process using a fine-tuned GraphCodeBERT model, which implements code-to-code search.

### Download test repositories and run `inspect4py` on them

In [1]:
# Repository picked from https://github.com as an example, written as
# "<owner>/<name>". The value is interpolated into the clone URL and into the
# local checkout and output paths used by the following cells.
repo = 'keon/algorithms'

In [2]:
# Create the working directory and move into it; every relative path used
# afterwards (the clone target and output/...) is resolved from content/.
# NOTE(review): %cd changes the kernel's working directory, so re-running this
# cell without restarting the kernel creates and enters a nested content/.
!mkdir -p content/output
%cd content/

# Clone the repository into <owner>/<name> and run inspect4py on the checkout,
# writing its results under output/<owner>/<name>.
# Presumably -sc adds each function's source code and -rm collects README
# files (the extraction cell reads "source_code" and drops "readme_files");
# confirm with the inspect4py command-line help.
!mkdir -p {repo} && git clone {f"https://github.com/{repo}.git"} {repo}
!inspect4py -i {repo} -o output/{repo} -sc -rm

/cs/home/cd271/Documents/Project/Examples/RepoAnalysis/CodeSearch/Code2code/content
Cloning into 'keon/algorithms'...
remote: Enumerating objects: 5162, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 5162 (delta 11), reused 16 (delta 3), pack-reused 5135[K
Receiving objects: 100% (5162/5162), 1.42 MiB | 10.46 MiB/s, done.
Resolving deltas: 100% (3227/3227), done.
Updating files: 100% (477/477), done.
Creating jsDir:output/keon/algorithms/algorithms/json_files
Creating jsDir:output/keon/algorithms/algorithms/docs/source/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/streaming/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/tree/json_files
Error when processing invert_tree.py:  <class 'AttributeError'>
Error when processing deepest_left.py:  <class 'AttributeError'>
Error when processing longest

### Extract docstrings and functions from repositories.

In [3]:
import json

def funcs_to_lists(funcs, func_codes, docs):
    """Collect source code and docstrings from a mapping of function records.

    Args:
        funcs: Mapping of function name -> record dict. A record may carry a
            "source_code" entry and a "doc" dict.
        func_codes: List extended in place with every non-None "source_code".
        docs: List extended in place with "<name> <docstring>" for every
            record whose "doc" dict has a usable entry.

    Returns:
        None. Both output lists are mutated in place. They are not kept
        index-aligned: a record can contribute to one list and not the other.
    """
    # Most complete docstring variant first; the first non-None one wins.
    preferred_keys = ("full", "long_description", "short_description")

    for name in funcs:
        record = funcs[name]

        source = record.get("source_code")
        if source is not None:
            func_codes.append(source)

        doc = record.get("doc")
        if doc is None:
            continue

        text = next(
            (doc[key] for key in preferred_keys if doc.get(key) is not None),
            None,
        )
        if text is not None:
            docs.append(f"{name} {text}")

def file_to_lists(filename):
    """Read a JSON analysis file and gather function sources and docstrings.

    The file is expected to map directory names to lists of per-file records,
    where a record may hold a "functions" mapping and a "classes" mapping
    whose entries may hold a "methods" mapping. Module-level functions and
    class methods are both collected.

    Args:
        filename: Path of the JSON file (the notebook passes the
            ``directory_info.json`` written under the inspect4py output dir).

    Returns:
        A ``(func_codes, docs)`` tuple of lists: the source code strings and
        the "<name> <docstring>" strings found in the file.
    """
    func_codes = []
    docs = []
    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # default encoding when reading it.
    with open(filename, "r", encoding="utf-8") as f:
        dic = json.load(f)
    dic.pop("readme_files", None)
    for dir_name, files in dic.items():
        # Only lists of file records are relevant. Other top-level metadata
        # (dicts, strings, ...) used to crash on `.get`; skip it instead.
        if not isinstance(files, list):
            continue
        for file in files:
            if not isinstance(file, dict):
                continue
            if file.get("functions") is not None:
                funcs_to_lists(file["functions"], func_codes, docs)
            if file.get("classes") is not None:
                for class_name, class_info in file["classes"].items():
                    if class_info.get("methods") is not None:
                        funcs_to_lists(class_info["methods"], func_codes, docs)
    return func_codes, docs

In [5]:
# Parse the inspect4py output written for the cloned repository and keep the
# extracted docstrings ("docs") and function sources ("funcs") in one dict.
repo_info = {}
function_list, docstring_list = file_to_lists(f"output/{repo}/directory_info.json")
repo_info["docs"] = docstring_list
repo_info["funcs"] = function_list

### Download the fine-tuned GraphCodeBERT model and use the pipeline to calculate code similarity

In [6]:
from transformers import pipeline

# Build the clone-detection pipeline from the given model id.
# NOTE(review): trust_remote_code=True allows transformers to execute Python
# code shipped with the model repository; keep it only for sources you trust.
pipe = pipeline(model="Lazyhope/python-clone-detection", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Code snippet used as the search query: every function extracted from the
# repository is compared against this text by the similarity cell.
query = """
def test_topsort(self):
    res_recursive = top_sort_recursive(self.depGraph)
    self.assertTrue(res_recursive.index('g') < res_recursive.index('e'))
    
    res_iterative = top_sort(self.depGraph)
    self.assertTrue(res_iterative.index('g') < res_iterative.index('e'))

"""

In [8]:
def find_top_n_index(lst, n):
    """Return the indices of the ``n`` largest values in ``lst``.

    Indices are ordered from the largest value to the smallest; equal values
    keep their original left-to-right order. The input list is not modified.

    Args:
        lst: Sequence of mutually comparable values (e.g. similarity scores).
        n: Number of indices wanted. Values larger than ``len(lst)`` yield
            every index once; zero or negative values yield an empty list.

    Returns:
        A list of at most ``n`` distinct indices into ``lst``.
    """
    # sorted() is stable even with reverse=True, so ties resolve to the lowest
    # index first, matching a repeated "first occurrence of the max" scan.
    ranked = sorted(range(len(lst)), key=lst.__getitem__, reverse=True)
    # Clamp n so a negative value cannot turn into an "all but last" slice.
    return ranked[:max(n, 0)]

In [15]:
from tqdm import tqdm
# Score every extracted function against the query, one pipeline call per
# (query, candidate) pair.
similarities = []
for func in tqdm(repo_info["funcs"], desc="Calculating similarities"):
    # The pipeline result is converted to a dict and read at key True;
    # presumably that is the score for "the two snippets are clones" —
    # TODO confirm against the model's pipeline implementation.
    compare_dict = dict(pipe((query, func)))
    similarities.append(compare_dict[True])

# Rank on a copy: find_top_n_index overwrites the entries it selects, and the
# original scores are still needed for the report below.
sim = similarities.copy()
index = find_top_n_index(sim,5)
# Print the five best matches with their similarity score and source code.
for i in index:
    print(f'Similarity: {similarities[i]}, \n{repo_info["funcs"][i]} \n------------------------------------------------------------------\n')

Calculating similarities: 100%|███████████████████████████████████████████████████████████████████| 1171/1171 [01:36<00:00, 12.15it/s]

Similarity: 0.9999922513961792, 
def test_topsort(self):
    res = top_sort_recursive(self.depGraph)
    self.assertTrue(res.index('g') < res.index('e'))
    res = top_sort(self.depGraph)
    self.assertTrue(res.index('g') < res.index('e')) 
------------------------------------------------------------------

Similarity: 0.9981995820999146, 
def setUp(self):
    self.depGraph = {'a': ['b'], 'b': ['c'], 'c': ['e'], 'e': ['g'], 'd': [], 'f': ['e', 'd'], 'g': []} 
------------------------------------------------------------------

Similarity: 0.9960083961486816, 
def test_is_sorted(self):
    head = Node(-2)
    head.next = Node(2)
    head.next.next = Node(2)
    head.next.next.next = Node(4)
    head.next.next.next.next = Node(9)
    self.assertTrue(is_sorted(head))
    head = Node(1)
    head.next = Node(2)
    head.next.next = Node(8)
    head.next.next.next = Node(4)
    head.next.next.next.next = Node(6)
    self.assertFalse(is_sorted(head)) 
-----------------------------------------


