This notebook implement Text-To-Code Search Engine with fine-tuned `UniXcoder` model.

In [None]:
# !wget https://raw.githubusercontent.com/microsoft/CodeBERT/master/UniXcoder/unixcoder.py

In [None]:
!inspect4py --version

inspect4py, version 0.0.6


In [None]:
!mkdir -p content/output
%cd content/

!mkdir -p {repo} && git clone {f"https://github.com/{repo}.git"} {repo}
!inspect4py -i {repo} -o output/{repo} -sc -rm
%cd ..

/cs/home/cd271/Documents/Project/Examples/RepoAnalysis/SemanticCodeSearch/Text2code/content
Cloning into 'keon/algorithms'...
remote: Enumerating objects: 5162, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 5162 (delta 11), reused 16 (delta 3), pack-reused 5136[K
Receiving objects: 100% (5162/5162), 1.42 MiB | 10.99 MiB/s, done.
Resolving deltas: 100% (3231/3231), done.
Updating files: 100% (477/477), done.
Creating jsDir:output/keon/algorithms/algorithms/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/streaming/json_files
Creating jsDir:output/keon/algorithms/algorithms/algorithms/map/json_files
Error when processing separate_chaining_hashtable.py:  <class 'AttributeError'>
Error when processing hashtable.py:  <class 'AttributeError'>
Creating jsDir:output/keon/algorithms/algorithms/algorithms/stack/json_files
Erro

In [1]:
import torch
from unixcoder import UniXcoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base")
model.to(device)
model2 = UniXcoder("Lazyhope/unixcoder-nine-advtest")
model2.to(device)

  from .autonotebook import tqdm as notebook_tqdm


UniXcoder(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [2]:
repo = 'keon/algorithms'

import json

def funcs_to_lists(funcs, func_codes, docs):
    for func_name, func_info in funcs.items():
        if func_info.get("source_code") is not None:
            func_codes.append(func_info["source_code"])
        if func_info.get("doc") is None:
            continue
        for key in ["full", "long_description", "short_description"]:
            if func_info["doc"].get(key) is not None:
                docs.append(f"{func_name} {func_info['doc'].get(key)}")
                break

def file_to_lists(filename):
    func_codes = []
    docs = []
    with open(filename, "r") as f:
        dic = json.load(f)
    dic.pop("readme_files", None)
    for dir_name, files in dic.items():
        for file in files:
            if file.get("functions") is not None:
                funcs_to_lists(file["functions"], func_codes, docs)
            if file.get("classes") is not None:
                for class_name, class_info in file["classes"].items():
                    if class_info.get("methods") is not None:
                        funcs_to_lists(class_info["methods"], func_codes, docs)
    return func_codes
    
repo_info = {}
function_list = file_to_lists(f"content/output/{repo}/directory_info.json")
repo_info["funcs"] = function_list

In [6]:
from tqdm import tqdm

def get_code_embeddings(code, model):
    tokens_ids = model.tokenize([code], max_length=512, mode="<encoder-only>")
    source_ids = torch.tensor(tokens_ids).to(device)
    _, embeddings = model(source_ids)

    return torch.nn.functional.normalize(embeddings, p=2, dim=1)
    
print("Generating code embeddings for dataset ... ")
code_embeddings = []
for func in tqdm(repo_info["funcs"]):
    code_embeddings.append(get_code_embeddings(func, model2))
    
print("Dataset code embeddings generated!")

Generating code embeddings for dataset ... 


100%|███████████████████████████████████████████████████████████████████████████████| 1171/1171 [01:10<00:00, 16.71it/s]

Dataset code embeddings generated!





In [7]:
code_embeddings = torch.stack([torch.tensor(embedding) for embedding in code_embeddings])

code_vecs = torch.squeeze(code_embeddings)

  code_embeddings = torch.stack([torch.tensor(embedding) for embedding in code_embeddings])


In [8]:
query = "Function to calcualte cosine similarity"
query_vec = get_code_embeddings(query, model2)

from torch.nn import CosineSimilarity
cosine_sim = CosineSimilarity(dim=1, eps=1e-8)
similarities = cosine_sim(query_vec, code_vecs)

top_scores, top_indices = torch.topk(similarities, k=3, largest=True)

print("Query:", query)
for score, idx in zip(top_scores, top_indices):
    code_embedding = repo_info["funcs"][idx]
    print("Code:", code_embedding)
    print("Similarity:", score.item())

Query: Function to calcualte cosine similarity
Code: def test_cosine_similarity(self):
    vec_a = [1, 1, 1]
    vec_b = [-1, -1, -1]
    vec_c = [1, 2, -1]
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_a), 1)
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_b), -1)
    self.assertAlmostEqual(cosine_similarity(vec_a, vec_c), 0.4714045208)
Similarity: 0.6907704472541809
Code: def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between given two vectors
    :type vec1: list
    :type vec2: list
    """
    if len(vec1) != len(vec2):
        raise ValueError('The two vectors must be the same length. Got shape ' + str(len(vec1)) + ' and ' + str(len(vec2)))
    norm_a = _l2_distance(vec1)
    norm_b = _l2_distance(vec2)
    similarity = 0.0
    for (vec1_element, vec2_element) in zip(vec1, vec2):
        similarity += vec1_element * vec2_element
    similarity /= norm_a * norm_b
    return similarity
Similarity: 0.6898452639579773
Code: def lcs(word1,