# Advanced Retrieval 101: Navigate Your Codebase with Semantic Search and Qdrant


This notebook follows the Qdrant code-search tutorial: https://qdrant.tech/documentation/advanced-tutorials/code-search/

In [1]:
# Uncomment on the first run to download `structures.jsonl`, which the cells below read.
# !curl https://storage.googleapis.com/tutorial-attachments/code-search/structures.jsonl -O

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4805k  100 4805k    0     0   913k      0  0:00:05  0:00:05 --:--:-- 1137k


In [3]:
# !tail structures.jsonl

In [4]:
import json

# Read the JSON Lines dump: one JSON document per line, each describing a
# single parsed code structure. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale, and blank lines (e.g. a
# trailing newline at the end of the file) are skipped instead of crashing
# json.loads.
with open("structures.jsonl", "r", encoding="utf-8") as fp:
    structures = [json.loads(row) for row in fp if row.strip()]

## Code to natural language conversion

In [5]:
import re
from typing import Any, Dict

import inflection


def textify(chunk: Dict[str, Any]) -> str:
    """Turn one parsed code structure into a plain, space-separated sentence.

    The sentence is assembled from the structure's code type, name, optional
    docstring, signature and location (module, file name and, when present,
    the enclosing struct). Identifiers are converted from camel/snake case to
    a human-readable form, and every non-word character is dropped at the end.

    Args:
        chunk: Structure description with the keys ``name``, ``signature``,
            ``code_type``, ``docstring`` and ``context`` (itself holding
            ``module``, ``file_name`` and ``struct_name``).

    Returns:
        The word tokens of the description joined by single spaces.
    """

    def readable(identifier: str) -> str:
        # underscore(): camel case -> snake case; humanize(): snake case -> words
        return inflection.humanize(inflection.underscore(identifier))

    readable_name = readable(chunk["name"])
    readable_signature = readable(chunk["signature"])

    # The docstring clause is only present when a docstring was provided
    doc_clause = f"that does {chunk['docstring']} " if chunk["docstring"] else ""

    # Where the snippet lives: module and file, optionally prefixed by its struct
    location = chunk["context"]
    where = f"module {location['module']} file {location['file_name']}"
    if location["struct_name"]:
        where = f"defined in struct {readable(location['struct_name'])} {where}"

    sentence = "".join(
        [
            f"{chunk['code_type']} {readable_name} ",
            doc_clause,
            f"defined as {readable_signature} ",
            where,
        ]
    )

    # Keep only runs of word characters, i.e. strip all special characters
    return " ".join(re.findall(r"\w+", sentence))

## Natural language embeddings

In [6]:
# One natural-language description per parsed structure, in the same order
text_representations = [textify(structure) for structure in structures]

In [9]:
from sentence_transformers import SentenceTransformer

# Embed the natural-language descriptions with the all-MiniLM-L6-v2 model
nlp_model = SentenceTransformer("all-MiniLM-L6-v2")
nlp_embeddings = nlp_model.encode(text_representations, show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/148 [00:00<?, ?it/s]

## Code embeddings

In [11]:
# Load the code-specific embedding model.
# NOTE(review): trust_remote_code=True makes the loader download and run the
# model's own Python files (see the download warning in this cell's output);
# consider pinning a revision so the executed code cannot change unnoticed.
code_model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-code", trust_remote_code=True
)
code_model.max_seq_length = 8192  # raise the model's maximum sequence length

# Embed the raw source snippet stored in each structure's context
code_snippets = [item["context"]["snippet"] for item in structures]
code_embeddings = code_model.encode(code_snippets, batch_size=4, show_progress_bar=True)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.44k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.53k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-v2-qk-post-norm:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/96.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-v2-qk-post-norm:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/322M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/971k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Batches:   0%|          | 0/1181 [00:00<?, ?it/s]

## Create collection

In [13]:
from qdrant_client import QdrantClient, models

client = QdrantClient(":memory:")

# One named vector per embedding model; each vector's size is read from the
# second dimension of the matching embedding matrix, and both use cosine distance.
vectors_config = {
    vector_name: models.VectorParams(
        size=embeddings.shape[1],
        distance=models.Distance.COSINE,
    )
    for vector_name, embeddings in (
        ("text", nlp_embeddings),
        ("code", code_embeddings),
    )
}
client.create_collection("qdrant-sources", vectors_config=vectors_config)

True

In [14]:
import uuid

# Build one point per structure: both embeddings go in as named vectors and
# the structure itself is stored as the payload.
points = []
for text_embedding, code_embedding, structure in zip(
    nlp_embeddings, code_embeddings, structures
):
    point_vectors = {"text": text_embedding, "code": code_embedding}
    points.append(
        models.PointStruct(
            id=uuid.uuid4().hex,  # fresh random id for every point
            vector=point_vectors,
            payload=structure,
        )
    )

client.upload_points("qdrant-sources", points=points, batch_size=64)

## Querying the codebase

### Using text embedding

In [15]:
query = "How do I count points in a collection?"

# Embed the question with the text model and search the "text" named vector
text_query_vector = nlp_model.encode(query).tolist()
text_response = client.query_points(
    "qdrant-sources", query=text_query_vector, using="text", limit=5
)
hits = text_response.points

In [29]:
import pprint

# Show the payload of the first hit returned by the text-vector search
first_text_hit = hits[0]
pprint.pp(first_text_hit.payload)

{'name': 'count',
 'signature': 'async fn count (& self , collection_name : & str , request : '
              'CountRequestInternal , read_consistency : Option < '
              'ReadConsistency > , shard_selection : ShardSelectorInternal ,) '
              '-> Result < CountResult , StorageError >',
 'code_type': 'Function',
 'docstring': '= " Count points in the collection."',
 'line': 120,
 'line_from': 108,
 'line_to': 132,
 'context': {'module': 'toc',
             'file_path': 'lib/storage/src/content_manager/toc/point_ops.rs',
             'file_name': 'point_ops.rs',
             'struct_name': 'TableOfContent',
             'snippet': '    /// Count points in the collection.\n'
                        '    ///\n'
                        '    /// # Arguments\n'
                        '    ///\n'
                        '    /// * `collection_name` - in what collection do '
                        'we count\n'
                        '    /// * `request` - [`CountRequestInterna

### Using code embedding

In [30]:
# Same question, now embedded with the code model and matched against the
# "code" named vector
code_query_vector = code_model.encode(query).tolist()
code_response = client.query_points(
    "qdrant-sources", query=code_query_vector, using="code", limit=5
)
hits = code_response.points

In [31]:
# Show the payload of the first hit returned by the code-vector search
first_code_hit = hits[0]
pprint.pp(first_code_hit.payload)

{'name': 'count_indexed_points',
 'signature': 'fn count_indexed_points (& self) -> usize',
 'code_type': 'Function',
 'docstring': None,
 'line': 612,
 'line_from': 612,
 'line_to': 614,
 'context': {'module': 'field_index',
             'file_path': 'lib/segment/src/index/field_index/geo_index.rs',
             'file_name': 'geo_index.rs',
             'struct_name': 'GeoMapIndex',
             'snippet': '    fn count_indexed_points(&self) -> usize {\n'
                        '        self.points_count()\n'
                        '    }\n'}}


### Using text + code embedding

In [33]:
# Issue both searches in one round trip: the question is embedded by each
# model and sent against the named vector that model produced.
batch_requests = [
    models.QueryRequest(
        query=encoder.encode(query).tolist(),
        using=vector_name,
        with_payload=True,
        limit=5,
    )
    for vector_name, encoder in (("text", nlp_model), ("code", code_model))
]
responses = client.query_batch_points("qdrant-sources", requests=batch_requests)

# Keep just the list of scored points from every response
results = [batch.points for batch in responses]

In [38]:
# Show the payload of the first hit from the first entry of the batch results
first_request_hits = results[0]
pprint.pp(first_request_hits[0].payload)

{'name': 'count',
 'signature': 'async fn count (& self , collection_name : & str , request : '
              'CountRequestInternal , read_consistency : Option < '
              'ReadConsistency > , shard_selection : ShardSelectorInternal ,) '
              '-> Result < CountResult , StorageError >',
 'code_type': 'Function',
 'docstring': '= " Count points in the collection."',
 'line': 120,
 'line_from': 108,
 'line_to': 132,
 'context': {'module': 'toc',
             'file_path': 'lib/storage/src/content_manager/toc/point_ops.rs',
             'file_name': 'point_ops.rs',
             'struct_name': 'TableOfContent',
             'snippet': '    /// Count points in the collection.\n'
                        '    ///\n'
                        '    /// # Arguments\n'
                        '    ///\n'
                        '    /// * `collection_name` - in what collection do '
                        'we count\n'
                        '    /// * `request` - [`CountRequestInterna

### Grouping the results


In [41]:
# Search the "code" vector again, this time grouping hits by the payload field
# context.module: limit=5 and group_size=1 are passed through unchanged.
grouped_query_vector = code_model.encode(query).tolist()
results = client.query_points_groups(
    "qdrant-sources",
    query=grouped_query_vector,
    using="code",
    group_by="context.module",
    limit=5,
    group_size=1,
)

In [53]:
# Show the payload of the first hit inside the first returned group
first_group = results.groups[0]
pprint.pp(first_group.hits[0].payload)

{'name': 'count_indexed_points',
 'signature': 'fn count_indexed_points (& self) -> usize',
 'code_type': 'Function',
 'docstring': None,
 'line': 612,
 'line_from': 612,
 'line_to': 614,
 'context': {'module': 'field_index',
             'file_path': 'lib/segment/src/index/field_index/geo_index.rs',
             'file_name': 'geo_index.rs',
             'struct_name': 'GeoMapIndex',
             'snippet': '    fn count_indexed_points(&self) -> usize {\n'
                        '        self.points_count()\n'
                        '    }\n'}}
