In [None]:
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding
import numpy as np
import requests 

In [2]:
# Client for the Qdrant server addressed at http://localhost:6333; every
# collection operation in this notebook goes through this object.
client = QdrantClient("http://localhost:6333")

In [3]:
# Name of the Qdrant collection that is created, filled and queried below.
collection_name_q6 = "zoomcamp-hw02-q06"
# Vector size used when the collection is created; presumably the output size
# of the embedding model named below -- TODO confirm against the model card.
EMBEDDING_DIMENSIONALITY = 384
# Embedding model identifier passed to models.Document for indexing and querying.
model_handle = "BAAI/bge-small-en"
# Query text that is searched for at the end of the notebook.
q6 = 'I just discovered the course. Can I still join it?'

In [4]:
# Create the collection with specified vector parameters, but only when it is
# missing: the Qdrant server keeps its state between kernel restarts, so an
# unconditional create would fail on every re-run of this cell.
if not client.collection_exists(collection_name_q6):
    client.create_collection(
        collection_name=collection_name_q6,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
            distance=models.Distance.COSINE,  # Distance metric for similarity search
        ),
    )

True

In [5]:
# Download the course documents JSON and keep only the entries that belong to
# the 'machine-learning-zoomcamp' course.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Bound the wait on the network and fail with an HTTP error right away rather
# than attempting to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']
    if course_name != 'machine-learning-zoomcamp':
        continue

    for doc in course['documents']:
        # Tag each document with the course it came from before collecting it.
        doc['course'] = course_name
        documents.append(doc)

In [8]:
# Build one Qdrant point per document. The point id is the document's position
# in `documents`; the text to embed is the question followed by the text field.
points = []

# enumerate() supplies the running id, so the builtin `id` is no longer
# shadowed by a notebook-global integer.
for point_id, document in enumerate(documents):
    full_text = document['question'] + ' ' + document['text']
    # The vector is given as raw text plus a model name rather than as numbers;
    # presumably the client embeds it when the points are upserted -- confirm.
    vector = models.Document(text=full_text, model=model_handle)
    point = models.PointStruct(
        id=point_id,
        vector=vector,
        payload={
            "text": document['text'],
            "question": document['question']
        }
    )
    points.append(point)

In [9]:
# Write every prepared point into the collection; the update result returned
# by the client is the value displayed by this cell.
client.upsert(points=points, collection_name=collection_name_q6)

Fetching 5 files: 100%|██████████| 5/5 [00:04<00:00,  1.00it/s]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [15]:
def vector_search(question, limit=1):
    """Search the collection for the points closest to ``question``.

    The question text is wrapped in ``models.Document`` together with
    ``model_handle`` and sent to ``client.query_points`` against the
    ``collection_name_q6`` collection.

    Args:
        question: Query text to search for.
        limit: Maximum number of hits to request from the server. Defaults to 1.

    Returns:
        A list of strings of the form ``"ID: <id>, score: <score>"``, one per
        returned point, in the order the client returned them.
    """
    query_points = client.query_points(
        collection_name=collection_name_q6,
        query=models.Document(
            text=question,
            model=model_handle,
        ),
        limit=limit,
        with_payload=True,
    )

    return [
        f"ID: {point.id}, score: {point.score}"
        for point in query_points.points
    ]

In [16]:
# Run the search for the q6 question and print the formatted hit list.
print(vector_search(q6))

['ID: 14, score: 0.8893469']
