To use this notebook, install Poetry, run `poetry install`, and select the environment it creates as the active one in your editor. To start the MLflow UI, run `poetry run mlflow ui`.

In [40]:
from enum import Enum

class Models(Enum):
    """Embedding models that can be selected for this notebook.

    Each member's value is the model name string for that model.
    """

    mpnet = "sentence-transformers/all-mpnet-base-v2"
    MiniLM_L6 = "sentence-transformers/all-MiniLM-L6-v2"
    MiniLM_L12 = "sentence-transformers/all-MiniLM-L12-v2"
    distilroberta = "sentence-transformers/all-distilroberta-v1"
    gte = "Supabase/gte-small"

# Select the embedding model to use here. Change the member to switch models;
# `embedding_model.value` is the model name string used by the cells below.
embedding_model = Models.gte

In [45]:
from sentence_transformers import SentenceTransformer
import json
from safetensors import safe_open
from safetensors.numpy import save_file
import hashlib

class EmbeddingStore:
    """Embeddings of GitHub issues for similarity search, with an on-disk cache.

    Issues are read from ``./issues/<source>.json``. Embeddings are cached in
    ``./embeddings/<store hash>.safetensors``, where the store hash is derived
    from the model name plus the name and content of every source file, so a
    change to either produces a different cache file.
    """

    # Only immutable placeholders live on the class. The issue and embedding
    # dicts are created per instance in __init__: as class attributes they
    # would be a single dict shared (and accumulated into) by every store.
    _model = None
    _model_name = None
    _storeHash = None

    def __init__(self, sources, model_name):
        """Load the issues of every source and the embedding model.

        Args:
            sources: Iterable of source names; each is read from
                ``./issues/<source>.json``, which must hold a JSON array of
                entries with at least an ``html_url`` key.
            model_name: Model name passed to ``SentenceTransformer``.

        Raises:
            FileNotFoundError: If a source file does not exist.
        """
        self._issues = {}
        self._embeddings = {}
        source_hashes = [hashlib.sha3_256(model_name.encode()).hexdigest()]
        for source in sources:
            # Context manager so the file handle is closed deterministically.
            with open(f'./issues/{source}.json', 'r', encoding='utf-8') as issues_file:
                raw_json = issues_file.read()
            source_hashes.append(hashlib.sha3_256((source + raw_json).encode()).hexdigest())
            for entry in json.loads(raw_json):
                url = entry["html_url"]
                # Skip every entry whose URL contains "pull" (substring match).
                if "pull" in url:
                    continue
                self._issues[url] = entry
        self._storeHash = hashlib.sha3_256("".join(source_hashes).encode()).hexdigest()
        self._model = SentenceTransformer(model_name)
        self._model_name = model_name

    def gen_embeddings(self):
        """Populate the embeddings, from the cache file if it exists.

        When no cache file exists for the current store hash, every loaded
        issue is encoded as ``"<title> <body>"`` and the result is written to
        the cache file, creating ``./embeddings`` first if necessary.
        """
        import os

        cache_path = f"./embeddings/{self._storeHash}.safetensors"
        try:
            with safe_open(cache_path, framework="numpy", device="cpu") as f:
                print(f"Loading embeddings from {self._storeHash}.safetensors")
                for key in f.keys():
                    self._embeddings[key] = f.get_tensor(key)
                return
        except FileNotFoundError:
            # No cache for this model/sources combination yet: compute below.
            pass
        for url, issue in self._issues.items():
            # NOTE(review): a None body would be embedded as the literal text
            # "None" by this f-string; confirm whether that is acceptable.
            self._embeddings[url] = self._model.encode(f"{issue['title']} {issue['body']}")
        # Create the cache directory so the save cannot fail on a fresh
        # checkout after all embeddings have already been computed.
        os.makedirs("./embeddings", exist_ok=True)
        save_file(self._embeddings, cache_path)

    def query(self, question, top_k=5):
        """Return the ``top_k`` stored issues most similar to ``question``.

        Args:
            question: Text to encode and compare against the stored embeddings.
            top_k: Maximum number of results to return.

        Returns:
            A list of ``(url, similarity)`` tuples sorted by descending
            similarity, where ``similarity`` is the value returned by the
            model's ``similarity`` method. Only URLs containing "issue" are
            considered.
        """
        question_embedding = self._model.encode(question)
        results = [(url, self._model.similarity(embedding, question_embedding)) for url, embedding in self._embeddings.items() if "issue" in url]
        return sorted(results, key=lambda x: x[1], reverse=True)[:top_k]

In [42]:
# Names of the issue dumps to index; each is read from ./issues/<name>.json.
issue_sources = [
    "honojs-hono",
    "honojs-middleware",
    "neondatabase-serverless",
    "drizzle-team-drizzle-kit-mirror",
    "drizzle-team-drizzle-orm",
]
store = EmbeddingStore(issue_sources, embedding_model.value)
# Loads the cached embeddings when present, otherwise computes and saves them.
store.gen_embeddings()



Loading embeddings from bf9bdd63dedbabb48ccc9d2fcb1fd10e44c04cec4ff765c748ce92338919167f.safetensors


In [43]:
import pandas as pd

# Each case pairs a query string with the URL of the issue it should retrieve.
test_cases = [
    [
        "Error: `RangeError: Responses with a WebSocket must have status code 101.`",
        "https://github.com/honojs/hono/issues/2883",
    ],
    [
        "Error: TypeError: Cannot read properties of null (reading 'isEscaped')",
        "https://github.com/honojs/hono/issues/2194",
    ],
    [
        "Error: Cookies Max-Age SHOULD NOT be greater than 400 days (34560000 seconds) in duration.",
        "https://github.com/honojs/hono/issues/2762",
    ],
]

# One row per case: the question, the expected URL wrapped in a list, and the
# URLs returned by the store (scores dropped).
data = [
    [
        question,
        [expected_url],
        [hit_url for hit_url, _score in store.query(question)],
    ]
    for question, expected_url in test_cases
]
df = pd.DataFrame(data, columns=["question", "ground_truth", "top_5_results"])
df

Unnamed: 0,question,ground_truth,top_5_results
0,Error: `RangeError: Responses with a WebSocket...,[https://github.com/honojs/hono/issues/2883],"[https://github.com/honojs/hono/issues/2883, h..."
1,Error: TypeError: Cannot read properties of nu...,[https://github.com/honojs/hono/issues/2194],"[https://github.com/honojs/hono/issues/2194, h..."
2,Error: Cookies Max-Age SHOULD NOT be greater t...,[https://github.com/honojs/hono/issues/2762],"[https://github.com/honojs/hono/issues/2762, h..."


In [44]:
import mlflow

# Assumes an MLflow tracking server is listening locally on port 5000.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("issue-similarity")

# Name the run after the last path segment of the model name.
compact_name = embedding_model.value.rsplit("/", 1)[-1]

with mlflow.start_run(run_name=compact_name):
    # Evaluate the precomputed retrieval results stored in `df`.
    mlflow.evaluate(
        data=df,
        targets="ground_truth",
        predictions="top_5_results",
        model_type="retriever",
        evaluators="default",
    )

2024/06/12 21:36:13 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
