In [1]:
import gzip
import html
import itertools
import json
import re

import pandas as pd
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from IPython.display import display, HTML

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Read the gzipped JSON dump into a DataFrame, then flatten the frame into a
# list of per-issue dicts ("records" orientation).
issues_path = "../test_data/issues_min.json.gz"
with gzip.open(issues_path, mode="rt", encoding="utf-8") as fh:
    df = pd.DataFrame(json.load(fh))

data = df.to_dict(orient='records')

len(data)

46076

In [5]:
# Keep only issues that are still open and whose "pull_request" entry is
# empty, i.e. drop closed issues and pull requests.
open_issues = []
for record in data:
    if record['state'] != "open":
        continue
    if len(record["pull_request"]) != 0:
        continue
    open_issues.append(record)

data = open_issues
len(data)

4242

In [7]:
# Load the sentence-embedding model and encode every issue title with it
model = SentenceTransformer('all-MiniLM-L6-v2')
titles = [issue['title'] for issue in data]
embeddings = model.encode(titles, convert_to_tensor=True)


In [8]:
# Build the search index over the title embeddings
emb_np = embeddings.cpu().numpy()
emb_np = emb_np.astype('float32')
# Rescale each row to unit length so that inner product equals cosine similarity
faiss.normalize_L2(emb_np)

embedding_dim = emb_np.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(emb_np)

In [9]:
# Find duplicates

In [23]:
# Number of hits requested from the index for each issue (the `k` passed to
# index.search). The issue itself is excluded from its own hits, so a value of
# 2 leaves at most one candidate duplicate per issue.
neighbours_per_issue = 2
# A hit is reported as a duplicate only when its score (inner product of the
# L2-normalised embeddings) is strictly greater than this value.
similarity_cutoff = 0.9

In [24]:
# Look up the nearest neighbours of every embedding in the index it was added
# to, then keep the pairs whose similarity exceeds the cutoff.
D, I = index.search(emb_np, neighbours_per_issue)

duplicates = []

for idx, (neighbors, sims) in enumerate(zip(I, D)):
    # Exclude the query itself by index rather than by position: when two
    # titles embed identically their scores tie, so the query is not guaranteed
    # to be the first hit. Also drop the -1 labels FAISS uses to pad rows for
    # which fewer than k results were found.
    candidates = [
        (j, sim) for j, sim in zip(neighbors, sims)
        if j >= 0 and j != idx
    ]
    # Same budget as a positional skip: at most (k - 1) non-self hits per issue.
    for j, sim in candidates[:max(neighbours_per_issue - 1, 0)]:
        if sim > similarity_cutoff:
            # Store plain Python numbers so later cells can index lists and
            # use sets without depending on NumPy scalar types.
            duplicates.append((idx, int(j), float(sim)))

# Most similar pairs first. A pair can appear in both directions, (a, b) and
# (b, a), because each issue is searched independently.
duplicates.sort(key=lambda pair: pair[2], reverse=True)

In [25]:
# Render each duplicate pair as an HTML snippet, using every issue at most once.
seen = set()
html_snippets = []

for idx1, idx2, sim in duplicates:
    # Normalise to plain ints so list indexing and set membership behave the
    # same whether the indices arrive as Python ints or NumPy integers.
    idx1, idx2 = int(idx1), int(idx2)

    # An issue is never a duplicate of itself.
    if idx1 == idx2:
        continue

    # Skip if either issue already used
    if idx1 in seen or idx2 in seen:
        continue

    issue1 = data[idx1]
    issue2 = data[idx2]

    # Titles and URLs come straight from the issue data, so escape them before
    # placing them in markup: a title containing "<" or "&" would otherwise
    # break the rendering or inject HTML into the notebook output.
    url1 = html.escape(str(issue1['url']), quote=True)
    url2 = html.escape(str(issue2['url']), quote=True)
    title1 = html.escape(str(issue1['title']))
    title2 = html.escape(str(issue2['title']))

    # target="_blank" opens each link in its own new tab; the bare name "blank"
    # would make every link reuse one shared window.
    block = f"""
<br>
<b>Issue:</b> <a target="_blank" rel="noopener noreferrer" href="{url1}">{title1}</a><br>
<b>Duplicate:</b> <a target="_blank" rel="noopener noreferrer" href="{url2}">{title2}</a><br>
<b>Score:</b> {sim:.3f}<br>
<br>
"""
    html_snippets.append(block)

    # Mark both as seen so we don't include them again
    seen.add(idx1)
    seen.add(idx2)

html_output = "\n".join(html_snippets)

In [26]:
# Render the collected snippets inline. `display` and `HTML` are already
# imported in the notebook's first cell, so they are not re-imported here.
display(HTML(html_output))

Conclusion: While I did use this to close some duplicate issues, and we could iterate on this to retrieve data from the API, there aren't a huge number of these kinds of issues and we might be better serve 