In [41]:
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv
import uuid
import json
import requests
from os import environ
import time

load_dotenv()

True

In [24]:
# init qdrant
qd_client = QdrantClient(url="http://localhost:6666")

# read json file
with open('../dist/lotr_characters.json', 'r') as file:
    characters = json.load(file)

print(f"Loaded {len(characters)} entries.")

Loaded 749 entries.


In [33]:
collection_name = 'lotr-characters'
embedding_dimension = 1024
jina_embedding_model = "jina-embeddings-v4"
jina_url = "https://api.jina.ai/v1/embeddings"
jina_api_key = environ.get('JINA_API_KEY')
indexing_task = "retrieval.passage"
querying_task = "retrieval.query"

In [51]:
def create_jina_embedding(input_text: str, task = indexing_task)-> list:
    """
    Create embedding using Jina API
    Returns a single embedding vector (list of floats)
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {jina_api_key}",
    }
    data = {
        "input": [input_text],
        "model": jina_embedding_model,
        "dimensions": embedding_dimension,
        "task": task,
        "late_chunking": True,
    }
    try:
        res = requests.post(url=jina_url, headers=headers, json=data, timeout=30)
        if res.status_code == 200:
            embedding = res.json()["data"][0]["embedding"]
            return embedding
        else:
            raise Exception(f"Jina API error: {res.status_code} - {res.text}")
    except requests.RequestException as e:
        raise Exception(f"Request failed: {str(e)}")

In [40]:
def create_jina_embedding_batch(input_texts: list)-> list:
    """
    Create embeddings for multiple texts in a single API call (more efficient)
    Returns list of embedding vectors
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {jina_api_key}",
    }
    data = {
        "input": input_texts,
        "model": jina_embedding_model,
        "dimensions": embedding_dimension,
        "task": indexing_task,
        "late_chunking": True,
    }
    try:
        res = requests.post(url=jina_url, headers=headers, json=data, timeout=60)
        if res.status_code == 200:
            # Extract all embeddings from the response
            embeddings = [d["embedding"] for d in res.json()["data"]]
            return embeddings
        else:
            raise Exception(f"Jina API error: {res.status_code} - {res.text}")
    except requests.RequestException as e:
        raise Exception(f"Request failed: {str(e)}")

In [14]:
def reinitiate_collection():
    is_collection_exist = qd_client.collection_exists(collection_name=collection_name)
    if is_collection_exist:
        qd_client.delete_collection(collection_name=collection_name)
        print(f"Deleted existing collection: {collection_name}")
    print(f"Collection {collection_name} didn't exist, creating new one")
    qd_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=embedding_dimension, # Dimensionality of the vectors
            distance=models.Distance.COSINE # Distance metric for similarity search
        )
    )
    print("Created the new collection")

In [42]:
def upsert_to_qdrant():
    """
    Updated function to use Jina API for embeddings instead of FastEmbed
    """
    if not qd_client.collection_exists(collection_name=collection_name):
        print(f"Collection {collection_name} does not exist.")
        return
    

    points = []
    print("Creating embeddings and preparing points...")
    # Process one by one (simpler but slower)
    for i, character in enumerate(characters):
        try:
            # Get embedding from Jina API
            embedding = create_jina_embedding(character['name'])
            point = models.PointStruct(
                id=uuid.uuid4().hex(),
                vector=embedding,
                payload=character
            )
            points.append(point)

            # Add small delay to avoid rate limiting
            if i > 0 and i % 10 == 0:
                time.sleep(0.1)
                print(f"Processed {i+1}/{len(characters)} characters...")
        except Exception as e:
            print(f"Error creating embedding for character '{character['name']}': {str(e)}")
            continue
    if not points:
        print("No valid points to upsert..")
        return
    print("Initialize upserting process...")
    try:
        qd_client.upsert(
            collection_name=collection_name,
            points=points
        )
    except Exception as e:
        print(f"Error during upsert: {str(e)}")

In [43]:
def upsert_to_qdrant_batch():
    """
    More efficient version using batch embedding creation
    """
    if not qd_client.collection_exists(collection_name=collection_name):
        print(f'Collection {collection_name} does not exist.')
        return
    
    print("Creating batch embeddings...")
    try:
        # Extract character names for batch embedding
        character_names = [character["name"] for character in characters]

        # Create all embeddings in one API call
        embeddings = create_jina_embedding_batch(character_names)

        # Create points with embeddings
        points = []
        for character, embedding in zip(characters, embeddings):
            point = models.PointStruct(
                id=uuid.uuid4().hex,
                vector=embedding,
                payload=character
            )
            points.append(point)
        
        print("Starting upsert process...")
        qd_client.upsert(
            collection_name=collection_name,
            points=points
        )
        print(f"Successfully upserted {len(points)} entries.")
    except Exception as e:
        print(f"Error during batch processing: {str(e)}")

In [47]:
def upsert_to_qdrant_chunked(chunk_size: int = 100):
    """
    Process large datasets in chunks to avoid API limits and memory issues
    """
    if not qd_client.collection_exists(collection_name=collection_name):
        print(f'Collection {collection_name} does not exist.')
        return
    
    total_characters = len(characters)
    total_processed = 0

    print(f"Processing {total_characters} characters in chunk of {chunk_size}...")

    for i in range(0, total_characters, chunk_size):
        chunk = characters[i:i + chunk_size]
        character_names = [character["name"] for character in chunk]

        try:
            print(f"Processing chunk {i//chunk_size + 1}/{(total_characters + chunk_size - 1)//chunk_size}...")

            # Create embeddings for this chunk
            embeddings = create_jina_embedding_batch(character_names)

            # Create points
            points = []
            for character, embedding in zip(chunk, embeddings):
                point = models.PointStruct(
                    id=uuid.uuid4().hex,
                    vector=embedding,
                    payload=character
                )
                points.append(point)
            
            # Upsert this chunk
            qd_client.upsert(
                collection_name=collection_name,
                points=points
            )

            total_processed += len(points)
            print(f"Processed {total_processed}/{total_characters} characters...")

            # Small delay between chunks to be nice to the API
            time.sleep(1)
        except Exception as e:
            print(f"Error processing chunk starting at index {i}: {str(e)}")
            continue
    print(f"Finished processing. Total upserted: {total_processed} entries...")

In [48]:
reinitiate_collection()

Deleted existing collection: lotr-characters
Collection lotr-characters didn't exist, creating new one
Created the new collection


In [49]:
upsert_to_qdrant_chunked()

Processing 749 characters in chunk of 100...
Processing chunk 1/8...
Processed 100/749 characters...
Processing chunk 2/8...
Processed 200/749 characters...
Processing chunk 3/8...
Processed 300/749 characters...
Processing chunk 4/8...
Processed 400/749 characters...
Processing chunk 5/8...
Processed 500/749 characters...
Processing chunk 6/8...
Processed 600/749 characters...
Processing chunk 7/8...
Processed 700/749 characters...
Processing chunk 8/8...
Processed 749/749 characters...
Finished processing. Total upserted: 749 entries...


In [53]:
def search(query: str, limit: int = 1):
    """
    Updated search function to use Jina API for query embedding
    """
    try:
        # Create embedding for the search query using Jina API
        query_embedding = create_jina_embedding(input_text=query, task=querying_task)
        
        results = qd_client.query_points(
            collection_name=collection_name,
            query=query_embedding,
            limit=limit,
            with_payload=True
        )
        return results
    except Exception as e:
        print(f"Error during search: {str(e)}")

In [60]:
search('witch king of angmar')

QueryResponse(points=[ScoredPoint(id='d5cf8a4a-b47e-4f00-b28f-d70b203390aa', version=6, score=0.69371796, payload={'birth': 'TA 2844', 'death': None, 'gender': 'Male', 'hair': None, 'height': None, 'name': 'Hildifons Took', 'race': 'Hobbits', 'realm': None, 'spouse': None, 'biography': 'Hildifons Took was the sixth child of Gerontius "The Old" Took and Adamanta (Chubb) Took. Hildifons had five older siblings: Isengrim III, Hildigard, Isumbras IV, Hildigrim, and Isembold; as well as six younger siblings: Isembard, Hildibrand, Belladonna, Donnamira, Mirabella, and Isengar. Hildifons was a curious hobbit who never settled down. One day, he left the Shire on a journey and never returned.[1][2]', 'history': None}, vector=None, shard_key=None, order_value=None)])