# Vector DB Search 

This notebook runs a vector DB search over Arrow issues. This allows for semantic search—retrieving issues based on meaning rather than exact keyword matches. This differs from GitHub's built-in search, which is mostly lexical and relies on specific terms, labels, or filters. A vector DB can surface issues with similar intent or topic even if they use different wording, making it more useful for detecting duplicates, related bugs, or thematic clusters.

In [20]:
import pandas as pd
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
import re
import gzip
import json
import pandas as pd
from IPython.display import display, HTML

In [4]:
# Read the gzipped JSON export of issues, then round-trip it through a
# DataFrame so that `data` ends up as a list with one plain dict per row.
with gzip.open("../test_data/issues_min.json.gz", "rt", encoding="utf-8") as fh:
    raw_issues = json.load(fh)

df = pd.DataFrame(raw_issues)
data = df.to_dict(orient='records')

The data cleaning below is fairly rough: it filters on the issue text alone and should eventually pre-filter on the structured data fields instead, but it is good enough for this prototype ;)

In [5]:
# Drop records whose body is at most one character long ("empty" issues).
non_empty = [record for record in data if len(record['body']) > 1]

# Keep only records with an empty `pull_request` field, i.e. real issues
# rather than PRs.
# NOTE(review): assumes `pull_request` is always a sized value (never missing) - confirm against the export.
just_issues = [record for record in non_empty if len(record['pull_request']) <= 0]

# Total number of issues at this point, open and closed together
len(just_issues)

26349

Do you want to search only open issues, or closed ones too? Set `just_open` to `False` if you want to search all previous issues, open and closed.

In [6]:
# True keeps only issues whose state is "open"; False keeps open and closed issues.
just_open = True

In [7]:
# Optionally narrow the corpus to open issues. This rebinds `just_issues`, so
# after changing `just_open` the cell that first builds `just_issues` has to be
# re-run before this one for the change to take effect.
if just_open:
    just_issues = [issue for issue in just_issues if issue['state'] == "open"]

In [8]:
# Number of issues left after the optional open-only filter
len(just_issues)

4236

In [9]:
# Functions to clean up the data

# Remove code chunks from issue body
def remove_code_chunks(text):
    """Strip Markdown code from ``text`` and return what is left.

    Three substitutions run in order: fenced blocks delimited by triple
    backticks, inline spans between single backticks, and finally whole
    lines that start with four or more spaces or a tab.
    """
    substitutions = (
        # Fenced code block: ``` ... newline ... ``` (may span many lines)
        (r"```.*?\n.*?```", re.DOTALL),
        # Inline code: anything between a pair of backticks
        (r"`[^`]*`", 0),
        # Indented code line, together with its trailing newline if present
        (r"^(?: {4,}|\t).*\n?", re.MULTILINE),
    )
    for pattern, flags in substitutions:
        text = re.sub(pattern, "", text, flags=flags)
    return text

# Remove URLs from issue body
def remove_urls(text):
    """Return ``text`` with http(s)://, ftp:// and www.-prefixed links deleted.

    A link runs from its prefix up to the next whitespace character.
    """
    url_pattern = r'(https?://\S+|www\.\S+|ftp://\S+)'
    return re.sub(url_pattern, '', text)

# Remove boilerplate issue text
def remove_boilerplate(text):
    """Delete the fixed template headings from an issue body.

    Every occurrence of each phrase below is removed verbatim (exact,
    case-sensitive match); all other text is returned unchanged.
    """
    boilerplate = (
        "### Describe the enhancement requested",
        "### Describe the bug, including details regarding any error messages, version, and platform.",
        "### Component(s)",
        "### Describe the usage question you have. Please include as many useful details as  possible.",
        "**_Overview_**",
        "**_Impact_**",
        "**_Key Features_**",
    )

    cleaned = text
    for heading in boilerplate:
        cleaned = cleaned.replace(heading, '')
    return cleaned

In [10]:
# Clean every issue body in place: strip template headings, code and URLs,
# then drop lines that are empty or whitespace-only.
for issue in just_issues:
    # Bodies that are not strings are left exactly as they are.
    if not isinstance(issue.get('body', ''), str):
        continue
    cleaned = issue['body']
    for step in (remove_boilerplate, remove_code_chunks, remove_urls):
        cleaned = step(cleaned)
    issue['body'] = "\n".join(filter(str.strip, cleaned.splitlines()))

In [17]:
# Fragment issue bodies into sentences

import spacy
# Uncomment on first use to download the English model:
# spacy.cli.download('en_core_web_sm')
nlp = spacy.load("en_core_web_sm")

# Each fragment is one sentence plus a reference back to its parent issue.
# `issue_id` is the position of the issue inside `just_issues`, and
# `fragment_id` is "<issue position>_<sentence position>".
fragments = []

# nlp.pipe streams the bodies through the pipeline in batches and yields one
# Doc per input, in input order, which is much faster than calling nlp() once
# per issue.
bodies = (issue['body'] for issue in just_issues)
for issue_id, (issue, doc) in enumerate(zip(just_issues, nlp.pipe(bodies))):
    for idx, sent in enumerate(doc.sents):
        text = sent.text.strip()
        if not text:
            # Whitespace-only sentences carry no meaning; indexing them would
            # only add noise to the search results.
            continue
        fragments.append({
            "fragment_id": f"{issue_id}_{idx}",
            "text": text,
            "issue_id": issue_id,
            "issue": issue
        })

And now to upload it to a searchable vector DB...

In [19]:
# Set up the vector DB

encoder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu') # Model to create embeddings
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

# Create collection to store issues
qdrant.create_collection(
    collection_name="arrow_issues",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model
        distance=models.Distance.COSINE
    )
)

# Embed all fragments in one batched call instead of one encode() call per
# fragment. The result has one row per input text, in input order.
fragment_vectors = encoder.encode(
    [fragment["text"] for fragment in fragments],
    show_progress_bar=True,
)

# One point per sentence fragment. The payload carries the matched sentence
# plus enough of the parent issue (url, title, cleaned body) to render a hit.
qdrant.upload_points(
    collection_name="arrow_issues",
    points=[
        models.PointStruct(
            id=idx,
            vector=vector.tolist(),
            payload={
                "text": fragment["text"],
                "issue_id": fragment["issue_id"],
                "url": fragment["issue"]["url"],
                "title": fragment["issue"]["title"],
                "body": fragment["issue"]["body"]
            }
        ) for idx, (fragment, vector) in enumerate(zip(fragments, fragment_vectors))
    ]
)

Choose a term to search for

In [63]:
# Free-text query; it is embedded with the same encoder used for the fragments
my_term_to_search = "billions of rows"

In [64]:
# Search!
# Embed the query text, then ask Qdrant for the 10 best-matching fragments.
# NOTE(review): newer qdrant-client releases appear to deprecate `search` in
# favour of `query_points` - confirm against the installed version.
query_embedding = encoder.encode(my_term_to_search).tolist()
hits = qdrant.search(
    collection_name="arrow_issues",
    query_vector=query_embedding,
    limit=10,
)

In [65]:
import html

# Render each hit as a small HTML block: score, matched sentence, a link to
# the issue and the cleaned issue body.
html_snippets = []

for hit in hits:
    payload = hit.payload
    # Issue titles and bodies are user-written text, so escape them before
    # splicing them into markup. Otherwise stray "<", "&" or quotes would break
    # the rendering or inject markup into the notebook output. str() keeps the
    # original tolerance for non-string payload values.
    matched = html.escape(str(payload['text']))
    url = html.escape(str(payload['url']))
    title = html.escape(str(payload['title']))
    body = html.escape(str(payload['body']))
    # target="_blank" (with the underscore) opens the link in a new tab;
    # rel="noopener noreferrer" stops the opened page reaching back to this one.
    block = f"""
    <br>
    <b>Score:</b> {round(hit.score, 2)} <br>
    <b>Matched Sentence:</b> {matched}<br>
    <b>Issue URL:</b> <a target="_blank" rel="noopener noreferrer" href="{url}">{title}</a><br>
    <b>Full Body:</b> {body} <br>
    <br>
    """
    html_snippets.append(block)

html_output = "\n".join(html_snippets)
display(HTML(html_output))