## Imports and Configuration

In [2]:
%pip install sentence-transformers usearch numpy pandas
from sentence_transformers import SentenceTransformer
import usearch.index
import numpy as np, pandas as pd

# Configuration: name the model once so the load call below and any later
# reporting refer to the same checkpoint.
MODEL_NAME = 'all-MiniLM-L6-v2'

sentences = ["This is an example sentence", "Each sentence is converted"]

# Load the encoder and smoke-test it on two sentences.
model = SentenceTransformer(f'sentence-transformers/{MODEL_NAME}')
embeddings = model.encode(sentences)
print(embeddings)

# Take the vector width from the model's actual output instead of hard-coding
# it, so every index built with ndim=DIMENSION matches the embeddings.
DIMENSION = int(embeddings.shape[1])

Note: you may need to restart the kernel to use updated packages.

[[ 6.76568821e-02  6.34959936e-02  4.87131141e-02  7.93049484e-02
   3.74481082e-02  2.65281019e-03  3.93750109e-02 -7.09848432e-03
   5.93614094e-02  3.15369926e-02  6.00981005e-02 -5.29051758e-02
   4.06067446e-02 -2.59308759e-02  2.98428461e-02  1.12691789e-03
   7.35148489e-02 -5.03818877e-02 -1.22386619e-01  2.37028655e-02
   2.97265742e-02  4.24768887e-02  2.56337579e-02  1.99517026e-03
  -5.69190569e-02 -2.71598063e-02 -3.29035297e-02  6.60248920e-02
   1.19007193e-01 -4.58791070e-02 -7.26214647e-02 -3.25840414e-02
   5.23413345e-02  4.50553559e-02  8.25301651e-03  3.67024057e-02
  -1.39415748e-02  6.53918460e-02 -2.64272299e-02  2.06413330e-04
  -1.36643304e-02 -3.62810455e-02 -1.95044074e-02 -2.89737955e-02
   3.94270308e-02 -8.84090438e-02  2.62422604e-03  1.36713414e-02
   4.83062416e-02 -3.11566312e-02 -1.17329173e-01 -5.11690117e-02
  -8.85287970e-02 -2.18962729e-02  1.42986486e-02  4.44168076e-02
  -1.3481

## Part 1: Prototype Search Engine

In [4]:
# Toy corpus: four code snippets, each followed by a short description comment.
documents = [
    "def calculate_factorial(n): # Computes the product of all positive integers less than or equal to n.",
    "class DatabaseConnector: # Manages the connection pool to a SQL database.",
    "def quicksort(arr): # Sorts an array using the divide and conquer method.",
    "def binary_search(arr, target): # Finds the position of a target value within a sorted array.",
]
# Index keys: consecutive integer ids starting at 1, one per document.
doc_ids = np.arange(1, len(documents) + 1, dtype=np.longlong)

document_embeddings = model.encode(documents, convert_to_numpy=True)
print(f"Generated embeddings shape: {document_embeddings.shape}")
print(f"Embedding dimension (D): {DIMENSION}")

# Cosine-metric vector index over the document embeddings.
index = usearch.index.Index(ndim=DIMENSION, metric='cos')
index.add(doc_ids, document_embeddings)
print(f"Index created with {index.size} vectors.")

Generated embeddings shape: (4, 384)
Embedding dimension (D): 384
Index created with 4 vectors.


In [5]:
def search_code(query, k=3):
    """Encode a query and return the top-k most similar prototype documents.

    Uses the notebook-level ``model`` to embed the query and the notebook-level
    ``index`` / ``doc_ids`` / ``documents`` to resolve matches.

    Args:
        query: Query text to embed.
        k: Maximum number of matches to request from the index.

    Returns:
        A list of dicts in the order returned by the index, each with keys
        ``rank`` (1-based), ``document_id`` (int), ``document`` (text, or
        "Document Not Found" when the id is unknown) and ``similarity_score``
        (float, computed as ``1 - distance``).
    """
    query_embedding = model.encode(query, convert_to_numpy=True).reshape(1, -1)
    matches = index.search(query_embedding, k)

    # Normalise ids to plain ints so the lookup does not depend on the numpy
    # integer dtype of doc_ids matching the dtype of the keys the index returns.
    id_to_doc = {int(key): text for key, text in zip(doc_ids, documents)}

    results = []
    for rank, (doc_id, distance) in enumerate(zip(matches.keys, matches.distances), start=1):
        doc_id = int(doc_id)
        results.append({
            "rank": rank,
            "document_id": doc_id,
            "document": id_to_doc.get(doc_id, "Document Not Found"),
            # Plain float keeps the result JSON-serialisable.
            "similarity_score": float(1 - distance),
        })
    return results

# Run one example query through the prototype and print the ranked matches.
test_query = "Find a function to sort elements quickly."
search_results = search_code(test_query)
print(f"--- Search Results for Query: '{test_query}' ---")
for result in search_results:
    # One header line with rank, score and id, followed by the matched snippet.
    print(f"Rank {result['rank']} (Score: {result['similarity_score']:.4f}, ID: {result['document_id']}):")
    print(f"    Code: {result['document']}")

--- Search Results for Query: 'Find a function to sort elements quickly.' ---
Rank 1 (Score: 0.5863, ID: 3):
    Code: def quicksort(arr): # Sorts an array using the divide and conquer method.
Rank 2 (Score: 0.3575, ID: 4):
    Code: def binary_search(arr, target): # Finds the position of a target value within a sorted array.
Rank 3 (Score: 0.1400, ID: 1):
    Code: def calculate_factorial(n): # Computes the product of all positive integers less than or equal to n.


## Part 2: Evaluation

In [7]:
import json, requests
import os

# Remote location of the evaluation file (CodeXGLUE NL-code-search-WebQuery test set).
RAW_FILE_URL = "https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/NL-code-search-WebQuery/data/test_webquery.json"
# Local path where the downloaded evaluation file is saved.
LOCAL_FILE_PATH = "cosqa_test_data.json"

def fetch_and_process_cosqa(url, local_path, force_download=False):
    """Load the CoSQA test file and turn it into a retrieval evaluation set.

    The raw file is downloaded to ``local_path`` only when no local copy exists
    or ``force_download`` is true, so re-running the notebook reuses the saved
    file instead of fetching it again. Every record that has an ``idx``, a
    non-empty ``doc`` (query text) and a non-empty ``code`` (document text)
    contributes one query, one document and one relevance judgement linking them.

    Args:
        url: Location of the raw JSON file.
        local_path: Path used to store and re-read the JSON file.
        force_download: When true, re-download even if ``local_path`` exists.

    Returns:
        ``(documents, doc_ids, queries, qrels)`` where ``documents`` is a list
        of code strings, ``doc_ids`` is a ``np.longlong`` array of ids
        ``1..len(documents)`` aligned with ``documents``, ``queries`` maps the
        record ``idx`` to its query text, and ``qrels`` maps the record ``idx``
        to the id of its document. Returns ``([], [], {}, {})`` when the
        download fails, the file cannot be parsed, or it holds no records.
    """
    empty_result = ([], [], {}, {})

    if force_download or not os.path.exists(local_path):
        print(f"Fetching data from: {url}")
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            with open(local_path, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Data successfully downloaded and saved to {local_path}")

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data. Check your connection: {e}")
            return empty_result
    else:
        print(f"Using cached data at: {local_path}")

    try:
        with open(local_path, 'r', encoding='utf-8') as f:
            data_list = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"Failed to read or parse local JSON file: {e}")
        return empty_result

    if not data_list:
        print("Data list for iteration is empty.")
        return empty_result

    # NOTE(review): every (doc, code) pair is treated as a relevant match. If the
    # records also carry a relevance label, negatively labelled pairs should
    # probably be excluded -- confirm against the dataset schema.
    corpus = {}
    queries = {}
    for item in data_list:
        q_id = item.get('idx')
        q_text = item.get('doc')
        code_text = item.get('code')

        # Only a missing idx is invalid; an idx of 0 is a legitimate record id.
        if q_id is None or not q_text or not code_text:
            continue

        queries[q_id] = q_text
        corpus[q_id] = code_text

    # Re-key documents with consecutive integer ids (1..N) for the vector index;
    # the document for record ``idx`` sits at that record's insertion position.
    documents = list(corpus.values())
    doc_ids = np.arange(1, len(documents) + 1, dtype=np.longlong)
    qrels = {q_id: new_id for new_id, q_id in enumerate(corpus, start=1)}

    print(f"\nSuccessfully loaded CoSQA data:")
    print(f"- Total code snippets (Documents) to index: {len(documents)}")
    print(f"- Total queries for evaluation: {len(queries)}")

    return documents, doc_ids, queries, qrels

# Build the evaluation set: documents to index, their ids, the queries, and
# the query -> relevant-document-id judgements.
COSQA_DOCUMENTS, COSQA_DOC_IDS, COSQA_QUERIES, COSQA_QRELS = fetch_and_process_cosqa(RAW_FILE_URL, LOCAL_FILE_PATH)

Fetching data from: https://raw.githubusercontent.com/microsoft/CodeXGLUE/main/Text-Code/NL-code-search-WebQuery/data/test_webquery.json
Data successfully downloaded and saved to cosqa_test_data.json

Successfully loaded CoSQA data:
- Total code snippets (Documents) to index: 1046
- Total queries for evaluation: 1046


In [8]:
print("Indexing the CoSQA Corpus...")

if len(COSQA_DOCUMENTS) == 0:
    print("No documents were loaded")
else:
    # Embed every code snippet with a single encode call.
    COSQA_DOCUMENT_EMBEDDINGS = model.encode(COSQA_DOCUMENTS, convert_to_numpy=True)
    print(f"Generated embeddings for CoSQA corpus. Shape: {COSQA_DOCUMENT_EMBEDDINGS.shape}")

    # Cosine-metric index keyed by COSQA_DOC_IDS.
    cosqa_index = usearch.index.Index(ndim=DIMENSION, metric='cos')
    cosqa_index.add(COSQA_DOC_IDS, COSQA_DOCUMENT_EMBEDDINGS)
    print(f"CoSQA Index created successfully with {cosqa_index.size} code snippets.")

Indexing the CoSQA Corpus...
Generated embeddings for CoSQA corpus. Shape: (1046, 384)
CoSQA Index created successfully with 1046 code snippets.


In [9]:
def calculate_mrr(relevance_list, k):
    """Return the reciprocal rank of the first relevant result within the top k.

    This is the per-query term; averaging it over all queries gives MRR@k.

    Args:
        relevance_list: Relevance flags in ranked order; ``1.0`` marks a
            relevant result.
        k: Rank cutoff. Results beyond position ``k`` are ignored.

    Returns:
        ``1 / rank`` (1-based) of the first relevant result among the first
        ``k`` entries, or ``0.0`` when none of them is relevant.
    """
    for rank, is_relevant in enumerate(relevance_list[:k], start=1):
        if is_relevant == 1.0:
            return 1.0 / rank
    return 0.0

def calculate_ndcg(relevance_list, k):
    """Return NDCG@k for a single query that has exactly one relevant document.

    With a single relevant document the ideal ranking places it first, so the
    ideal DCG is ``1 / log2(2) = 1`` and NDCG equals the discounted gain of the
    first relevant hit.

    Args:
        relevance_list: Relevance flags in ranked order; ``1.0`` marks the
            relevant result.
        k: Rank cutoff. Results beyond position ``k`` are ignored.

    Returns:
        ``1 / log2(rank + 1)`` for the 1-based rank of the first relevant
        result among the first ``k`` entries, or ``0.0`` when there is none.
    """
    for position, rel in enumerate(relevance_list[:k]):
        if rel == 1.0:
            # position is 0-based, hence log2(position + 2) for the discount.
            return float(1.0 / np.log2(position + 2))
    return 0.0

def evaluate_search_engine(index, queries, qrels, model, k=10):
    """Evaluate retrieval quality with Recall@k, MRR@k and NDCG@k.

    Each query is embedded with ``model``, searched against ``index`` for its
    top ``k`` keys, and scored against the single relevant document id given
    in ``qrels``. Queries without an entry in ``qrels`` are skipped and do not
    count towards the averages.

    Args:
        index: Vector index whose ``search(embedding, k)`` result exposes the
            matched ids as ``.keys``.
        queries: Mapping of query id to query text.
        qrels: Mapping of query id to the id of its relevant document.
        model: Encoder providing ``encode(text, convert_to_numpy=True)``.
        k: Number of results retrieved and scored per query.

    Returns:
        Dict with keys ``"Recall@{k}"``, ``"MRR@{k}"`` and ``"NDCG@{k}"`` holding
        the mean of each per-query score as a float (``0.0`` when no query was
        scored).
    """
    print(f"Starting evaluation of {len(queries)} queries at k={k}...")

    recall_k_list = []
    mrr_k_list = []
    ndcg_k_list = []

    for q_id, query_text in queries.items():
        # Look up the judgement first so unjudged queries cost no encode/search.
        correct_id = qrels.get(q_id)
        if correct_id is None:
            continue

        query_embedding = model.encode(query_text, convert_to_numpy=True).reshape(1, -1)
        matches = index.search(query_embedding, k)
        retrieved_ids = matches.keys.tolist()

        is_relevant = [1.0 if doc_id == correct_id else 0.0 for doc_id in retrieved_ids]

        recall_k_list.append(1.0 if correct_id in retrieved_ids else 0.0)
        mrr_k_list.append(calculate_mrr(is_relevant, k))
        ndcg_k_list.append(calculate_ndcg(is_relevant, k))

    # Plain floats keep the result JSON-serialisable.
    results = {
        f"Recall@{k}": float(np.mean(recall_k_list)) if recall_k_list else 0.0,
        f"MRR@{k}": float(np.mean(mrr_k_list)) if mrr_k_list else 0.0,
        f"NDCG@{k}": float(np.mean(ndcg_k_list)) if ndcg_k_list else 0.0,
    }
    return results

# Evaluate the pre-trained model only if the indexing cell produced an index.
if 'cosqa_index' not in locals():
    print("\nERROR: cosqa_index was not created")
else:
    BASELINE_METRICS = evaluate_search_engine(
        index=cosqa_index,
        queries=COSQA_QUERIES,
        qrels=COSQA_QRELS,
        model=model,
        k=10,
    )
    print("\n--- BASELINE (all-MiniLM-L6-v2) METRICS ---")
    print(json.dumps(BASELINE_METRICS, indent=4))
    print("-" * 40)

Starting evaluation of 1046 queries at k=10...

--- BASELINE (all-MiniLM-L6-v2) METRICS ---
{
    "Recall@10": 0.5898661567877629,
    "MRR@10": 0.295947145588637,
    "NDCG@10": 0.3654286274711419
}
----------------------------------------


## Part 3: Fine-Tuning

In [11]:
# Remote location of the training file used for fine-tuning.
TRAIN_FILE_URL = "https://raw.githubusercontent.com/Jun-jie-Huang/CoCLR/main/data/search/cosqa-retrieval-train-19604.json"
# Local path where the downloaded training file is saved.
LOCAL_TRAIN_PATH = "cosqa_train_data.json" 

def fetch_cosqa_training_data(url, local_path, force_download=False):
    """Load the training JSON, downloading it first when no local copy exists.

    The file is fetched from ``url`` and written to ``local_path`` only when
    that path does not exist yet or ``force_download`` is true; otherwise the
    saved copy is reused so re-running the notebook does not re-download it.

    Args:
        url: Location of the raw JSON training file.
        local_path: Path used to store and re-read the JSON file.
        force_download: When true, re-download even if ``local_path`` exists.

    Returns:
        The parsed JSON content of the file, or ``[]`` when the download fails
        or the file cannot be read or parsed.
    """
    if force_download or not os.path.exists(local_path):
        print(f"Starting Step 7: Fetching training data from: {url}")

        try:
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            with open(local_path, 'w', encoding='utf-8') as f:
                f.write(response.text)
            print(f"Training data successfully downloaded and saved to {local_path}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch data: {e}")
            return []
    else:
        print(f"Using cached training data at: {local_path}")

    try:
        with open(local_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"Failed to read or parse local JSON file: {e}")
        return []

# Load the training records (an empty list signals a download or parse failure).
COSQA_TRAIN_DATA = fetch_cosqa_training_data(TRAIN_FILE_URL, LOCAL_TRAIN_PATH)

print(f"Total training pairs loaded: {len(COSQA_TRAIN_DATA)}")

Starting Step 7: Fetching training data from: https://raw.githubusercontent.com/Jun-jie-Huang/CoCLR/main/data/search/cosqa-retrieval-train-19604.json
Training data successfully downloaded and saved to cosqa_train_data.json
Total training pairs loaded: 19604


In [12]:
%pip install accelerate>=0.26.0

Note: you may need to restart the kernel to use updated packages.


In [13]:
from sentence_transformers import losses, InputExample
from torch.utils.data import DataLoader
from datetime import datetime
from datasets import Dataset
import torch

if len(COSQA_TRAIN_DATA) == 0:
    print("Training data is empty, cannot fine-tune.")
else:
    # Build one (query, code) training pair per record that has both fields.
    train_examples = []
    for item in COSQA_TRAIN_DATA:
        query = item.get('doc')
        code = item.get('code')

        if not (query and code):
            continue
        train_examples.append(InputExample(texts=[query, code]))

    print(f"Created {len(train_examples)} training examples.")

    if len(train_examples) > 0:
        num_epochs = 1

        train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
        train_loss = losses.MultipleNegativesRankingLoss(model=model)

        # Warm up over the first 10% of the total training steps.
        warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

        # Timestamped output directory so repeated runs do not overwrite each other.
        model_save_path = f'output/finetuned_cosqa_{datetime.now().strftime("%Y%m%d_%H%M%S")}'
        os.makedirs(model_save_path, exist_ok=True)

        print(f"Starting fine-tuning for {num_epochs} epoch(s) on {len(train_examples)} examples...")

        # Trains the notebook-level `model` object itself.
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=num_epochs,
            warmup_steps=warmup_steps,
            output_path=model_save_path,
            save_best_model=True,
        )

        print("\nFine-tuning complete!")
        print(f"Model saved to: {model_save_path}")
    else:
        print("Zero training examples were created.")

Created 19604 training examples.
Starting fine-tuning for 1 epoch(s) on 19604 examples...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss
500,0.1485



Fine-tuning complete!
Model saved to: output/finetuned_cosqa_20251111_133933


In [24]:
print("\nRe-indexing Corpus with Fine-Tuned Model...")

# Re-embed the same corpus, now with the fine-tuned weights held by `model`.
FINE_TUNED_EMBEDDINGS = model.encode(
    COSQA_DOCUMENTS, 
    convert_to_numpy=True, 
    show_progress_bar=False
)
print(f"Generated embeddings with fine-tuned model. Shape: {FINE_TUNED_EMBEDDINGS.shape}")

# Fresh index: the baseline index holds embeddings from before fine-tuning.
FINETUNED_INDEX = usearch.index.Index(
    ndim=DIMENSION,
    metric='cos', 
)
FINETUNED_INDEX.add(COSQA_DOC_IDS, FINE_TUNED_EMBEDDINGS)
print(f"Fine-Tuned Index created successfully with {FINETUNED_INDEX.size} code snippets.")

FINE_TUNED_METRICS = evaluate_search_engine(
    index=FINETUNED_INDEX, 
    queries=COSQA_QUERIES, 
    qrels=COSQA_QRELS, 
    model=model, 
    k=10
)

print("\n--- FINAL RESULTS COMPARISON (Part 3 Complete) ---")

print("Baseline (all-MiniLM-L6-v2):")
print(json.dumps(BASELINE_METRICS, indent=4))

print("\nFine-Tuned Model:")
print(json.dumps(FINE_TUNED_METRICS, indent=4))

print("\n--- Improvement Summary ---")
# Iterate over the metric keys instead of hard-coding the "@10" names, so the
# summary stays valid if the evaluation cutoff is changed.
for metric_name, baseline_value in BASELINE_METRICS.items():
    improvement = FINE_TUNED_METRICS[metric_name] - baseline_value
    print(f"{metric_name} Improvement: {improvement:.4f}")
print("-" * 40)


Re-indexing Corpus with Fine-Tuned Model...
Generated embeddings with fine-tuned model. Shape: (1046, 384)
Fine-Tuned Index created successfully with 1046 code snippets.
Starting evaluation of 1046 queries at k=10...

--- FINAL RESULTS COMPARISON (Part 3 Complete) ---
Baseline (all-MiniLM-L6-v2):
{
    "Recall@10": 0.5898661567877629,
    "MRR@10": 0.295947145588637,
    "NDCG@10": 0.3654286274711419
}

Fine-Tuned Model:
{
    "Recall@10": 0.6577437858508605,
    "MRR@10": 0.33761229475856624,
    "NDCG@10": 0.4140278985967439
}

--- Improvement Summary ---
Recall@10 Improvement: 0.0679
MRR@10 Improvement: 0.0417
NDCG@10 Improvement: 0.0486
----------------------------------------
