In [None]:
from clearml import PipelineController, Task
from pymongo import MongoClient
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
import numpy as np

In [None]:
# --- MongoDB ---------------------------------------------------------------
# Connection string, database name, and the two collections this notebook
# touches: one it reads raw documents from, one it writes featurized
# documents to.
MONGO_URI = "mongodb://mongodb:27017/"
DB_NAME = "media_data"
COLLECTION_NAME_RAW, COLLECTION_NAME_FEATURIZED = "raw_data", "featurized_data"

# --- Qdrant ----------------------------------------------------------------
# HTTP endpoint of the vector store and the collection that holds the vectors.
QDRANT_URL, VECTOR_COLLECTION = "http://qdrant:6333", "rag_vectors"

In [None]:
# --- MongoDB handles -------------------------------------------------------
# A single client is shared by the notebook; both collections live in the
# same database.
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
raw_collection, featurized_collection = (
    db[COLLECTION_NAME_RAW],
    db[COLLECTION_NAME_FEATURIZED],
)

# --- Qdrant handle ---------------------------------------------------------
qdrant_client = QdrantClient(url=QDRANT_URL)

# --- Embedding model -------------------------------------------------------
# Loaded once here so the featurization step can reuse the same instance.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# --- Experiment tracking ---------------------------------------------------
# Register this run with ClearML; kept after the model load, as in the
# original cell, so the order of side effects is unchanged.
Task.init(project_name="ETL Pipeline", task_name="Featurization Pipeline")

In [None]:
# Featurization Step: Generate Embeddings and Store in MongoDB and Qdrant
def featurize_and_store():
    """Embed each raw document's text and store it in MongoDB and Qdrant.

    Reads every document from ``raw_collection``, skips those whose
    ``"content"`` field is missing or empty, encodes the text with
    ``embedding_model`` and writes the result to two places:

    * ``featurized_collection`` (MongoDB): source, url, embedding, content.
    * ``VECTOR_COLLECTION`` (Qdrant): the vector plus a payload holding
      source, url and the originating MongoDB id (``mongo_id``).

    Relies on the notebook-level names ``qdrant_client``, ``embedding_model``,
    ``raw_collection``, ``featurized_collection`` and ``VECTOR_COLLECTION``.

    Returns:
        None. A completion message is printed when the loop finishes.
    """
    # Imported locally so the function does not depend on a notebook-level
    # import that the original import cell does not provide.
    import uuid

    # recreate_collection drops any existing collection of this name and
    # creates an empty one, so every run starts from a clean vector store.
    # NOTE(review): the MongoDB backup below is append-only (insert_one), so
    # re-running this step duplicates featurized documents -- confirm intent.
    qdrant_client.recreate_collection(
        collection_name=VECTOR_COLLECTION,
        vectors_config={"size": embedding_model.get_sentence_embedding_dimension(), "distance": "Cosine"}
    )

    # Fetch raw data from MongoDB
    raw_data = list(raw_collection.find())

    for record in raw_data:
        text = record.get("content")
        if not text:
            continue

        # Optional metadata: read defensively, like "content" above, so a
        # partially populated document does not abort the whole run.
        source = record.get("source")
        url = record.get("url")

        # Generate embeddings
        embedding = embedding_model.encode(text).tolist()

        # Store in MongoDB (backup)
        featurized_collection.insert_one({
            "source": source,
            "url": url,
            "embedding": embedding,
            "content": text
        })

        # Qdrant point ids must be unsigned integers or UUIDs; a MongoDB
        # ObjectId is neither. Derive a deterministic UUID from the ObjectId
        # string (the namespace is arbitrary, it only has to stay fixed) so
        # the same document always maps to the same point.
        mongo_id = str(record["_id"])
        point_id = str(uuid.uuid5(uuid.NAMESPACE_OID, mongo_id))

        # Store in Qdrant
        qdrant_client.upsert(
            collection_name=VECTOR_COLLECTION,
            points=[
                {
                    "id": point_id,
                    "vector": embedding,
                    "payload": {
                        "source": source,
                        "url": url,
                        # Keep the original id so a hit can be traced back
                        # to its MongoDB document.
                        "mongo_id": mongo_id
                    }
                }
            ]
        )

    print("Featurization completed. Data stored in MongoDB and Qdrant.")

In [None]:
# ClearML Pipeline Controller
# Declares a one-step pipeline whose only step calls ``featurize_and_store``.
pipeline = PipelineController(
    project="ETL Pipeline",
    name="Featurization Pipeline",
    version="1.0"
)

# NOTE(review): ClearML packages function steps as standalone tasks, while
# featurize_and_store reads notebook-level globals (clients, model, config).
# Confirm the step can see them, or make the function self-contained, before
# relying on this pipeline.
pipeline.add_function_step(
    name="Featurize and Store",
    function=featurize_and_store
)

if __name__ == "__main__":
    # PipelineController is started with start() / start_locally(); it has no
    # execute() method, so the original call raised AttributeError.
    # No execution queue is configured above, so run the controller and its
    # steps on this machine instead of enqueuing them for an agent.
    pipeline.start_locally(run_pipeline_steps_locally=True)
