In [100]:
# pip install chromadb tiktoken sentence-transformers ipykernel ipywidgets
import json
import chromadb
from chromadb.utils import embedding_functions
import platform
from sentence_transformers import SentenceTransformer

# Checkpoint name, spelled once so the local model and the collection's
# embedding function below always load the same one.
MODEL_NAME = "all-mpnet-base-v2"

# Local model instance; later cells call model.encode / model.similarity on it.
model = SentenceTransformer(MODEL_NAME)

# Choose the on-disk Chroma directory based on the host platform string.
# NOTE(review): both paths are hard-coded, user-specific absolute paths; this
# cell only works on machines that have one of them.
path = (
    "/Users/ansel/chromadb"
    if "macOS" in platform.platform()
    else "/home/ansel/chromadb"
)

chroma_client = chromadb.PersistentClient(path=path)
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=MODEL_NAME
)

# Looks up the existing "roderick" collection; it is assumed to have been
# created and populated elsewhere (nothing in this notebook section adds to it).
collection = chroma_client.get_collection(
    name="roderick", embedding_function=sentence_transformer_ef
)

In [101]:
# Retrieve the 30 nearest stored chunks for the question, asking for each
# hit's document text and metadata.
raw_results = collection.query(
    include=["documents", "metadatas"],
    n_results=30,
    query_texts=["What does John Roderick think about donuts?"],
)

# Exactly one query text was submitted, so index 0 selects its hits.
hit_metadatas = raw_results["metadatas"][0]
hit_documents = raw_results["documents"][0]

# Pair every hit's metadata with its document text: [(metadata, document), ...]
results = [(meta, doc) for meta, doc in zip(hit_metadatas, hit_documents)]

In [102]:
# Every hit's metadata stores a JSON string under "wavfiles"; decoding it gives
# that chunk's list of line records. The document text of the hit is not needed.
metas = [
    json.loads(hit_meta["wavfiles"])
    for hit_meta, _document in results
]

# Flatten all chunks into one list, keeping only records spoken by "John".
john_lines = [
    record
    for records in metas
    for record in records
    if record["speaker"] == "John"
]

# Reference embedding that each line is later scored against.
donut = model.encode("donut")


def similarity(emb, string):
    """Score how close ``string`` is to an already-computed embedding.

    Args:
        emb: An embedding previously produced by the module-level ``model``.
        string: Text to embed with the same ``model`` and compare to ``emb``.

    Returns:
        The result of ``model.similarity`` for the two embeddings, converted
        to a plain Python ``float``.
    """
    text_vector = model.encode(string)
    score = model.similarity(emb, text_vector)
    # presumably a single-element result for one-vs-one input, which float() unwraps
    return float(score)


# Minimum similarity to the "donut" embedding for a line to be kept.
SIMILARITY_THRESHOLD = 0.4

# Keep only John's lines whose speech scores above the threshold.
# Each line is embedded individually inside similarity().
filtered = [
    line
    for line in john_lines
    if similarity(donut, line["speech"]) > SIMILARITY_THRESHOLD
]

# Shape each kept line into the record written to data.json: the audio path
# (wav name mapped to an mp3 under /audio/) plus the spoken text.
# NOTE(review): str.replace swaps every ".wav" occurrence in the name, not
# only a trailing extension — fine as long as names contain it once.
filteredToJson = [
    {
        "file": "/audio/" + line["wavfile"].replace(".wav", ".mp3"),
        "text": line["speech"],
    }
    for line in filtered
]

# Serialise before opening the file: opening with "w" truncates data.json, so
# a serialisation error must not be able to leave an empty file behind.
payload = json.dumps(filteredToJson)

# The context manager closes the handle even if the write raises.
# json.dumps emits ASCII-only text by default, so the encoding choice does not
# change the bytes written; it is pinned only to avoid platform defaults.
with open("data.json", "w", encoding="utf-8") as f:
    f.write(payload)