In [27]:
import sys
import os

# Resolve the directory two levels above the current working directory
# (expected to be the project root; adjust the relative path as needed).
project_root = os.path.abspath("../..")  # "../.." climbs two directory levels, not one

# Put it at the front of sys.path (only once) so project-local modules can be imported
if project_root not in sys.path:
    sys.path.insert(0, project_root)


In [1]:
# Install scikit-learn into the kernel's environment; it provides the TfidfVectorizer imported below.
%pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from qdrant_client import QdrantClient, models
import numpy as np
from rich import print
from utils import get_tokens

openai_api_key: EMPTY
openai_api_base: http://localhost:9000/v1
export_template_path: ./outflow/templates
qdrant_host: localhost
qdrant_port: 6333
neo4j_connection: bolt://localhost:7687
neo4j_user: neo4j
neo4j_password: ********


## Collection Setup

In [34]:
import ollama

# Initialize Ollama client
oclient = ollama.Client(host="localhost")

# Embed a throwaway prompt once; the result is only used to learn the
# embedding length that the dense vector index has to be created with.
response = oclient.embeddings(model="nomic-embed-text", prompt="Hello, world")
embeddings = response["embedding"]

COLLECTION_NAME = "demo_collection"
# `port` is an integer parameter of QdrantClient, so pass 6333 as an int
# rather than as the string "6333".
client = QdrantClient(host="localhost", port=6333)

# Start from a clean slate. NOTE: this drops an existing collection of the
# same name together with every point stored in it.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)

# One collection holding two named vectors per point:
#   "text-dense"  - Ollama embedding, compared with cosine distance
#   "text-sparse" - sparse vector whose index is kept in memory (on_disk=False)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        "text-dense": models.VectorParams(
            size=len(embeddings), distance=models.Distance.COSINE
        )
    },
    sparse_vectors_config={
        "text-sparse": models.SparseVectorParams(
            index=models.SparseIndexParams(on_disk=False)
        )
    },
)

True

## Data Loading

In [4]:
from typing import List, Optional
from pydantic_xml import BaseXmlModel, attr, element


class NamePresentation(BaseXmlModel):
    """XML element with an optional ``edt`` attribute and its text content in ``value``."""

    # An Optional[...] annotation alone does not make a field optional: without
    # a default the attribute is still required. default=None lets elements
    # that omit ``edt`` parse, matching how the optional child elements in
    # this file are declared.
    edt: Optional[str] = attr("edt", default=None)
    # Unmarked primitive field: holds the element's text; empty string if there is none.
    value: str = ""


class Description(BaseXmlModel):
    """XML element with an optional ``edt`` attribute and its text content in ``value``."""

    # An Optional[...] annotation alone does not make a field optional: without
    # a default the attribute is still required. default=None lets elements
    # that omit ``edt`` parse, matching how the optional child elements in
    # this file are declared.
    edt: Optional[str] = attr("edt", default=None)
    # Unmarked primitive field: holds the element's text; empty string if there is none.
    value: str = ""


class ProductVariantConditionRef(BaseXmlModel):
    """XML element with an optional ``ref`` attribute and its text content in ``value``."""

    # An Optional[...] annotation alone does not make a field optional: without
    # a default the attribute is still required. default=None lets elements
    # that omit ``ref`` parse, matching how the optional child elements in
    # this file are declared.
    ref: Optional[str] = attr("ref", default=None)
    # Unmarked primitive field: holds the element's text; empty string if there is none.
    value: str = ""


class Server(BaseXmlModel):
    """Model with two required string child elements, ``<CanAddress>`` and ``<PropertyName>``."""

    canAddress: str = element("CanAddress")  # bound to the <CanAddress> child element
    propertyName: str = element("PropertyName")  # bound to the <PropertyName> child element


class Property(BaseXmlModel):
    """Model wrapping a single required ``<Server>`` child element."""

    server: Server = element("Server")  # nested Server model (CanAddress + PropertyName)


class PropertyGroup(BaseXmlModel):
    """Named group of ``<Property>`` elements with a presentation name and an optional description."""

    name: str = element("Name")  # bound to the <Name> child element
    namePresentation: NamePresentation = element("NamePresentation")
    # Optional child: the default of None is used when <Description> is absent.
    description: Optional[Description] = element("Description", default=None)
    # All <Property> children collected into a list (no default, so the field is required).
    property: List[Property] = element("Property")


class FunctionPropertyGroup(BaseXmlModel, tag="FunctionPropertyGroup"):
    """Root model for a ``<FunctionPropertyGroup>`` XML document.

    Holds the group's name, presentation name, an optional description and
    product-variant condition reference, a list of integer ``<UFNumber>``
    values, and optionally nested ``<PropertyGroup>`` and direct
    ``<Property>`` children.
    """

    name: str = element("Name")  # bound to the <Name> child element
    namePresentation: NamePresentation = element("NamePresentation")
    # Optional children: the default of None is used when the element is absent.
    description: Optional[Description] = element("Description", default=None)
    productVariantConditionRef: Optional[ProductVariantConditionRef] = element("ProductVariantConditionRef", default=None)
    # <UFNumber> children parsed as integers (no default, so the field is required).
    ufNumber: List[int] = element("UFNumber")

    # A document may carry nested groups, direct properties, both, or neither;
    # each field is None when no matching child element exists.
    propertyGroup: Optional[List[PropertyGroup]] = element("PropertyGroup", default=None)
    property: Optional[List[Property]] = element("Property", default=None)

    # NOTE(review): unclear whether `xml_ns` in an inner Config class is read by
    # pydantic-xml at all; namespace maps are normally passed with the `nsmap=`
    # class keyword. Confirm this mapping has any effect.
    class Config:
        xml_ns = {
            "xsi": "http://www.w3.org/2001/XMLSchema-instance"
        }

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer published with the NovaSky-AI/Sky-T1-32B-Flash checkpoint
# (presumably Qwen-based - confirm). In the cells shown here it is only used to count tokens.
# NOTE(review): trust_remote_code=True permits Python code shipped in that Hub
# repository to run locally - confirm it is really needed for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained("NovaSky-AI/Sky-T1-32B-Flash", trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [6]:
# Load every FunctionPropertyGroup XML file from
# ./data/Function-Parameters/FunctionViewAdjust/ and measure how many tokens
# the parsed data occupies.
# (Several variable names still say "physical_quantity"; they are kept because
# other cells may reference them.)
from pathlib import Path

# Keep only files whose names start with "FunctionPropertyGroup".
# Sorted on purpose: Path.glob() yields entries in arbitrary filesystem order,
# and list positions are reused further down as Qdrant point ids, so a stable
# order keeps ids reproducible between runs.
physical_quantity_files = sorted(
    file
    for file in Path("./data/Function-Parameters/FunctionViewAdjust").glob("*.xml")
    if file.name.startswith("FunctionPropertyGroup")
)

print("total files: ", len(physical_quantity_files))

function_group_objects = []
token_count = 0
# Lines of the plain-text summary; joined once after the loop instead of
# growing one string with repeated `+=`.
detail_lines = []

for file in physical_quantity_files:
    # read_bytes() opens, reads and closes the file in one call
    obj = FunctionPropertyGroup.from_xml(file.read_bytes())
    function_group_objects.append(obj)

    # Token cost of the full JSON dump of this object
    token_count += len(tokenizer(obj.model_dump_json(indent=2))["input_ids"])

    # Compact summary: header line, direct property names, then each nested
    # group's name followed by the names of its properties.
    detail_lines.append("\n\nFUNCTION PROPERTY GROUP " + obj.name)
    detail_lines.extend(prop.server.propertyName for prop in obj.property or [])
    for prop_group in obj.propertyGroup or []:
        detail_lines.append(prop_group.name)
        detail_lines.extend(prop.server.propertyName for prop in prop_group.property)

# Every summary line is terminated by a newline.
function_properties_details = "".join(line + "\n" for line in detail_lines)
# Backward-compatible aliases: the original (misspelled) summary name and the
# second list of parsed objects.
funciton_properties_details = function_properties_details
physical_quantities = list(function_group_objects)

print(f"Token count: {token_count}")

# Token cost of the compact names-only summary
tokens_count_function_properties = len(
    tokenizer(function_properties_details)["input_ids"]
)

print(f"Token count for all function properties: {tokens_count_function_properties}")

In [7]:
# Build one text document per FunctionPropertyGroup, then fit a character
# n-gram TF-IDF model on those documents.


def _document_lines(group):
    """Yield the names that make up one group's document, in traversal order.

    Direct property names come first, then for every nested property group its
    own name followed by the names of the properties it contains.
    """
    for direct_prop in group.property or []:
        yield direct_prop.server.propertyName
    for nested_group in group.propertyGroup or []:
        yield nested_group.name
        for nested_prop in nested_group.property:
            yield nested_prop.server.propertyName


documents = []
for obj in function_group_objects:
    # every name is terminated by a newline
    document = "".join(line + "\n" for line in _document_lines(obj))
    print(document)
    documents.append(document)

# TF-IDF over character n-grams of length 2 to 4, taken within word boundaries
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4))
X_sparse = vectorizer.fit_transform(documents)


In [18]:
# Show the TF-IDF matrix summary: one row per document, one column per character n-gram feature
X_sparse

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 33651 stored elements and shape (164, 11807)>

In [85]:
def get_dense_vector(document, model="nomic-embed-text"):
    """Embed ``document`` with an Ollama embedding model and return the dense vector.

    The text is first passed through ``get_tokens``; the resulting tokens are
    joined with single spaces, and that string is what gets embedded. No sparse
    vector is involved.

    Args:
        document: Text to embed.
        model: Name of the Ollama embedding model. Defaults to
            "nomic-embed-text", the model whose embedding length was used to
            size the "text-dense" vectors of the collection; vectors from a
            model with a different output length would presumably not fit
            that index.

    Returns:
        The ``"embedding"`` entry of the Ollama response.
    """
    prompt = " ".join(get_tokens(document))
    response = oclient.embeddings(model=model, prompt=prompt)
    return response["embedding"]


# 3. Upsert documents
# Every point carries the TF-IDF row as its "text-sparse" vector and the
# Ollama embedding of the same document as its "text-dense" vector. The point
# id is the document's position, so re-running overwrites the same points.
UPSERT_BATCH_SIZE = 64  # points per request; avoids one network round-trip per point

# Guard against stale kernel state: the matrix, the documents and the parsed
# objects must describe the same items in the same order.
assert X_sparse.shape[0] == len(documents) == len(function_group_objects), (
    "X_sparse, documents and function_group_objects are out of sync; "
    "re-run the data loading and TF-IDF cells"
)

batch = []
for idx in range(X_sparse.shape[0]):
    row = X_sparse[idx]
    obj = function_group_objects[idx]
    document = documents[idx]

    batch.append(
        models.PointStruct(
            id=idx,
            payload={
                "json": obj.model_dump_json(),
                "document": document,
                "tokens": get_tokens(document),
            },
            vector={
                # numpy arrays are converted to plain Python lists for the client
                "text-sparse": models.SparseVector(
                    indices=row.indices.tolist(),
                    values=row.data.tolist(),
                ),
                "text-dense": get_dense_vector(document),
            },
        )
    )

    if len(batch) >= UPSERT_BATCH_SIZE:
        client.upsert(collection_name=COLLECTION_NAME, points=batch)
        batch = []

# Flush the final, possibly partial, batch
if batch:
    client.upsert(collection_name=COLLECTION_NAME, points=batch)

## Query

In [174]:
# 4. Query preparation
name = "DHSC_AtEgegTyp"
description = """
Brake pedal behaviour for activating Automatic DHSC, See parameter for description.
"""

# Sparse side: only the name is projected into the fitted TF-IDF space
# (transform, not fit_transform, so the vocabulary stays the indexed one).
query_vec = vectorizer.transform([name])
# Plain Python lists, the same conversion that is applied to the stored
# vectors before they are handed to the Qdrant client.
query_indices = query_vec.indices.tolist()
query_values = query_vec.data.tolist()

# Dense side: name and description are embedded together
dense_embedding = get_dense_vector(name + " " + description)

##### Sparse Query


In [175]:
# Search only the TF-IDF ("text-sparse") vectors and keep the 3 best matches
sparse_query = models.SparseVector(indices=query_indices, values=query_values)
results = client.query_points(
    collection_name=COLLECTION_NAME,
    using="text-sparse",
    query=sparse_query,
    limit=3,
)

##### Dense Query

In [156]:
# Search only the Ollama embedding ("text-dense") vectors.
# No limit is passed, so the client's default result count applies.
results = client.query_points(
    using="text-dense",
    query=dense_embedding,
    collection_name=COLLECTION_NAME,
)

##### Fusion Query

In [178]:
# Hybrid search: fetch up to 20 candidates from each vector space, then let
# Qdrant merge the two candidate lists with Reciprocal Rank Fusion (RRF).
sparse_candidates = models.Prefetch(
    query=models.SparseVector(indices=query_indices, values=query_values),
    using="text-sparse",
    limit=20,
)
dense_candidates = models.Prefetch(
    query=dense_embedding,
    using="text-dense",
    limit=20,
)

results = client.query_points(
    collection_name=COLLECTION_NAME,
    prefetch=[sparse_candidates, dense_candidates],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
)

### Results

In [179]:
# Prints whatever `results` currently holds: the sparse, dense and fusion query
# cells all assign to this same name, so the output reflects whichever ran last.
print(results)