In [1]:
!pip install clearml pymongo transformers qdrant-client torch

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/d0/a7/7eedcf6a359e1e1eff3bc204ad022485aa5d88c08e1e3e0e0aee8a2e2235/transformers-4.47.0-py3-none-any.whl.metadata
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.5 kB ? eta -:--:--
     ----------------- -------------------- 20.5/43.5 kB 330.3 kB/s eta 0:00:01
     -------------------------------------- 43.5/43.5 kB 537.0 kB/s eta 0:00:00
Collecting qdrant-client
  Obtaining dependency information for qdrant-client from https://files.pythonhosted.org/packages/68/c0/eef4fe9dad6d41333f7dc6567fa8144ffc1837c8a0edfc2317d50715335f/qdrant_client-1.12.1-py3-none-any.whl.metadata
  Downloading qdrant_client-1.12.1-py3-none-any.whl.metadata (10 kB)
Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/5f/ba/607d013b55b9fd805db2a5c2662ec7551f1910b4eef3965

In [2]:
from pymongo import MongoClient
import torch
from transformers import AutoTokenizer, AutoModel
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["rag_system"]
raw_data_collection = db["raw_data"]
featurized_data_collection = db["featurized_data"]

# Initialize Qdrant client
qdrant_client = QdrantClient("localhost", port=6333)  # Ensure Qdrant is running


In [3]:
# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to encode text into feature vector
def encode_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)  # Average pooling over token embeddings
    return embeddings.squeeze().numpy()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [16]:
# Create the Qdrant collection if it doesn't exist
def create_qdrant_collection():
    try:
        # Get all collections from Qdrant
        collections_response = qdrant_client.get_collections()

        # Check if 'rag_vectors' exists in the response
        if "rag_vectors" not in collections_response.collections:
            # Create the collection if it doesn't exist
            qdrant_client.create_collection(
                collection_name="rag_vectors",
                vectors_config={
                    "size": 768,  # Size of feature vector (768 for BERT)
                    "distance": "Cosine"  # Similarity metric (Cosine, Euclidean, or Dot)
                }
            )
            print("Collection 'rag_vectors' created successfully.")
        else:
            print("Collection 'rag_vectors' already exists.")
    except Exception as e:
        print(f"Error creating Qdrant collection: {e}")

# Featurization pipeline function
def featurize_data():
    # Ensure that the Qdrant collection exists
    create_qdrant_collection()

    # Retrieve raw data from MongoDB
    raw_documents = raw_data_collection.find()

    for doc in raw_documents:
        text = doc.get("content", "")
        
        # If the content is empty, skip it
        if not text:
            continue

        # Convert text to feature vector using BERT
        feature_vector = encode_text(text)

        # Store the featurized data in MongoDB
        featurized_data = {
            "type": doc["type"],
            "url": doc["url"],
            "feature_vector": feature_vector.tolist()
        }
        featurized_data_collection.insert_one(featurized_data)

        # Convert ObjectId to an integer (using timestamp part of ObjectId)
        point_id = int(str(doc["_id"])[:8], 16)  # Convert first 8 hex digits of ObjectId to int

        # Store the feature vector in Qdrant for similarity search
        point = PointStruct(
            id=point_id,  # Use the integer point ID
            vector=feature_vector.tolist(),
            payload={"url": doc["url"], "type": doc["type"]}
        )
        qdrant_client.upsert(collection_name="rag_vectors", points=[point])
        print(f"Processed and stored features for: {doc['url']}")


In [17]:
featurize_data()


Collection 'rag_vectors' created successfully.
Processed and stored features for: https://github.com/gazebosim/gz-sim.git
Processed and stored features for: https://github.com/ros2/ros2_documentation.git
Processed and stored features for: https://github.com/gazebosim/gz-sim.git
Processed and stored features for: https://github.com/ros2/ros2_documentation.git
Processed and stored features for: https://github.com/gazebosim/gz-sim.git
Processed and stored features for: https://github.com/ros2/ros2_documentation.git
Processed and stored features for: https://github.com/gazebosim/gz-sim.git
Processed and stored features for: https://github.com/ros2/ros2_documentation.git
Processed and stored features for: https://github.com/gazebosim/gz-sim.git
Processed and stored features for: https://github.com/ros2/ros2_documentation.git
Processed and stored features for: https://github.com/gazebosim/gz-sim.git


In [18]:
query_text = "ROS2 Navigation"
query_vector = encode_text(query_text)


results = qdrant_client.search(
    collection_name="rag_vectors",
    query_vector=query_vector.tolist(),
    limit=5  
)

for result in results:
    print(f"Found URL: {result.payload['url']}, Type: {result.payload['type']}")


Found URL: https://github.com/ros2/ros2_documentation.git, Type: github
Found URL: https://github.com/ros2/ros2_documentation.git, Type: github
Found URL: https://github.com/gazebosim/gz-sim.git, Type: github
Found URL: https://github.com/gazebosim/gz-sim.git, Type: github
Found URL: https://github.com/gazebosim/gz-sim.git, Type: github
