In [1]:
import setup

# Called before the `blog.models` import in the next cell.
# NOTE(review): presumably this bootstraps Django (settings + app registry) so the
# ORM can be used from the notebook — confirm in the project's `setup` module.
setup.init_django()

  from tqdm.autonotebook import tqdm, trange


In [2]:
from sentence_transformers import SentenceTransformer
from blog.models import BlogPost
from decouple import config
# Name of the sentence-transformers model to load, read through decouple's
# `config` with a built-in fallback value.
# NOTE(review): "EMEDDING" looks like a typo for "EMBEDDING". It is left untouched
# because the same spelling is the config lookup key and is referenced where the
# model is constructed; renaming needs a coordinated change.
EMEDDING_MODEL=config("EMEDDING_MODEL", default="multi-qa-distilbert-cos-v1")
# When True, the seeding cell deletes every BlogPost flagged can_delete=True and
# re-inserts the sample documents.
RECREATE_DATA = True

In [3]:
# Sample sentences used as BlogPost content for the similarity demo.
# The first two contain the same words in a different order; the third is unrelated.
docs = [
    "The dog jumped over the cat", 
    "The cat jumped over the dog",
    "It is very warm today",
    "The cat is yellow and the dog is red",
]


In [4]:
# Unsaved BlogPost instances, one per sample sentence, titled "Blog Post 1..N".
# They are flagged can_delete=True so the seed rows can be wiped on re-seed.
new_data = [
    BlogPost(title=f"Blog Post {i}", content=text, can_delete=True)
    for i, text in enumerate(docs, start=1)
]

# Bind `qs` unconditionally so the name exists whether or not RECREATE_DATA is set;
# previously it was only assigned inside the `if`, which caused a NameError on a
# fresh kernel with RECREATE_DATA = False.
# The queryset is lazy, so iterating it after the re-seed returns the new rows.
qs = BlogPost.objects.filter(can_delete=True)
if RECREATE_DATA:
    # Remove the previous seed rows, then insert the fresh ones in a single query.
    qs.delete()
    BlogPost.objects.bulk_create(new_data)

In [5]:
# Load the sentence-embedding model named by EMEDDING_MODEL. This instance is the
# default encoder used by get_embedding.
model = SentenceTransformer(EMEDDING_MODEL)



In [9]:
def get_embedding(text, model=model):
    """Return the embedding of `text` produced by `model`.

    Newlines are replaced with single spaces and surrounding whitespace is
    stripped before encoding.

    Args:
        text: The string to embed.
        model: Object exposing ``encode(str)``; defaults to the notebook-level
            ``model`` as bound when this function was defined.

    Returns:
        Whatever ``model.encode`` returns for the cleaned text.
    """
    single_line = " ".join(text.split("\n")).strip()
    return model.encode(single_line)

In [10]:
# Back-fill embeddings for the seeded (can_delete=True) posts that do not have one.
# The queryset is built here instead of relying on the ambient `qs` name: other
# cells rebind `qs`, so depending on it makes this cell sensitive to execution
# order and breaks Restart & Run All.
posts_to_embed = BlogPost.objects.filter(can_delete=True)
for obj in posts_to_embed:
    # Skip rows that already carry a vector so re-running the cell does not
    # re-encode them.
    if obj.embedding is None:
        obj.embedding = get_embedding(obj.get_embedding_text_raw(), model)
        obj.save()

In [15]:
# Free-text search query; it is encoded with get_embedding (default model) so it
# can be compared against the stored post embeddings.
query = "The dog jumped over the green cow"
# Alternative query that matches the first sample document verbatim:
# query = "The dog jumped over the cat"
query_embedding = get_embedding(query)

In [16]:
from pgvector.django import CosineDistance
from django.db.models import F

# Rank posts by cosine distance to the query vector (smaller distance = closer);
# `similarity` is reported as 1 - distance.
# Rows with a NULL embedding are excluded: their distance/similarity would come
# back as None and `obj.similarity * 100` below would raise a TypeError.
qs = (
    BlogPost.objects
    .filter(embedding__isnull=False)
    .annotate(
        distance=CosineDistance('embedding', query_embedding),
        similarity=1 - F("distance"),
    )
    .order_by("distance")
)
for obj in qs:
    # Title, raw cosine distance, and similarity expressed as a percentage.
    print(obj.title, obj.distance, obj.similarity * 100)

Blog Post 1 0.27111828327178955 72.88817167282104
Blog Post 2 0.2754528522491455 72.45471477508545
Blog Post 4 0.5430814811362141 45.691851886378586
Blog Post 3 0.9220983255388343 7.79016744611657
