In [1]:
%pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
import os

# Resolve the project root relative to the current working directory.
# "../.." climbs two directory levels; adjust it if the notebook is moved.
project_root = os.path.abspath("../..")

# Put the project root at the front of the import search path, but only once,
# so re-running this cell does not add duplicate entries.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)


In [3]:
from models.input.physical_quantity import PhysicalQuantity
from models.input.function_view import FunctionView
from models.input.function_property_group import FunctionPropertyGroup

In [4]:
from database.database import (
    qclient, oclient
)

from qdrant_client import QdrantClient, models

# Name of the Qdrant collection that the rest of the notebook fills.
COLLECTION_NAME = "function_parameters"

# Embed a sample prompt once; only the length of the returned vector is used
# below, to size the dense vector of the collection.
response = oclient.embeddings(model="nomic-embed-text", prompt="Hello, world")
embeddings = response["embedding"]

# Drop any existing collection with this name so it is recreated from scratch.
if qclient.collection_exists(COLLECTION_NAME):
    qclient.delete_collection(COLLECTION_NAME)

# Each point carries two named vectors: a dense one compared by cosine
# distance and a sparse one whose index is not kept on disk.
dense_vector_params = models.VectorParams(
    size=len(embeddings), distance=models.Distance.COSINE
)
sparse_vector_params = models.SparseVectorParams(
    index=models.SparseIndexParams(on_disk=False)
)

qclient.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={"text-dense": dense_vector_params},
    sparse_vectors_config={"text-sparse": sparse_vector_params},
)

openai_api_key: EMPTY
openai_api_base: http://localhost:9000/v1
export_template_path: ./outflow/templates
qdrant_host: localhost
qdrant_port: 6333
neo4j_connection: bolt://localhost:7687
neo4j_user: neo4j
neo4j_password: ********
Neo4j Connection Successful


True

In [5]:
# Load all the FunctionPropertyGroup definitions from the ./data/Function-Parameters/FunctionViewAdjust/ folder
from pathlib import Path

# Path.glob yields matches in arbitrary, filesystem-dependent order. Sorting
# keeps the order of function_group_objects -- and therefore the positional
# ids assigned when the objects are upserted later -- identical on every run.
all_files = sorted(Path("./data/Function-Parameters/FunctionViewAdjust").glob("*.xml"))

# make sure their names start with FunctionPropertyGroup
function_property_group_files = [
    file for file in all_files if file.name.startswith("FunctionPropertyGroup")
]

# NOTE(review): never read in the cells visible here; kept so that any code
# relying on the name keeps working.
funciton_properties_details = ""

# Parsed objects, in the same (sorted) order as the files they came from.
function_group_objects: list[FunctionPropertyGroup] = []

for file in function_property_group_files:
    # read_bytes opens, reads and closes the file in a single call.
    xml_data = file.read_bytes()
    obj = FunctionPropertyGroup.from_xml(xml_data)
    function_group_objects.append(obj)

# print(physical_quantities)

In [6]:
# Load all the physical quantities from the ./data/Function-Parameters/PhysicalQuantity/ folder.
# Path.glob yields matches in arbitrary, filesystem-dependent order. Sorting
# keeps the order of physical_quantity_objects -- and therefore the positional
# ids assigned when the objects are upserted later -- identical on every run.
all_files = sorted(Path("./data/Function-Parameters/PhysicalQuantity/").glob("*.xml"))

# make sure their names start with PhysicalQuantity
physical_quantity_files = [
    file for file in all_files if file.name.startswith("PhysicalQuantity")
]

# Parsed objects, in the same (sorted) order as the files they came from.
physical_quantity_objects: list[PhysicalQuantity] = []

for file in physical_quantity_files:
    # read_bytes opens, reads and closes the file in a single call.
    xml_data = file.read_bytes()
    obj = PhysicalQuantity.from_xml(xml_data)
    physical_quantity_objects.append(obj)

In [7]:
# Report how many objects of each kind were parsed by the loading cells.
group_count = len(function_group_objects)
quantity_count = len(physical_quantity_objects)
print("Total FunctionPropertyGroup objects loaded:", group_count)
print("Total PhysicalQuantity objects loaded:", quantity_count)

Total FunctionPropertyGroup objects loaded: 164
Total PhysicalQuantity objects loaded: 83


In [8]:
# Parse the single FunctionView adjust-tree XML file.
file = Path("./data/Function-Parameters/FunctionView_FunctionAdjustTree.xml")

# Read the raw bytes first; the file is closed again before parsing starts.
with file.open("rb") as f:
    xml_data = f.read()

obj = FunctionView.from_xml(xml_data)
model_json = obj.model_dump_json(indent=2)

# The parsed view whose groups are walked later in the notebook.
function_view_adjust_object: FunctionView = obj

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

## prep the data for vectorization
# One text document per FunctionPropertyGroup, one entry per line: the names
# of its direct properties, then for every nested property group the group
# name followed by the names of that group's properties.
documents = []
for function_property_group in function_group_objects:
    lines = []
    for prop in function_property_group.property or []:
        lines.append(prop.server.propertyName)
    for prop_group in function_property_group.propertyGroup or []:
        lines.append(prop_group.name)
        # Guard the nested list the same way as the top-level one, so a
        # nested group without properties contributes only its name.
        for prop in prop_group.property or []:
            lines.append(prop.server.propertyName)
    # Every entry is terminated by a newline, exactly as when concatenating.
    document = "".join(line + "\n" for line in lines)
    documents.append(document)

# # 1. Fit TF-IDF vectorizer with char n-grams
# NOTE(review): this vectorizer is fitted on the FunctionPropertyGroup
# documents only, while other cells fit separate vectorizers for the other
# point types stored under the same "text-sparse" vector. Feature indices from
# different fits refer to different vocabularies -- confirm this is intended.
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
X_sparse = vectorizer.fit_transform(documents)
    

In [10]:
from utils import get_tokens


def get_dense_vector(document):
    """
    Return the dense embedding of a document.

    The document is tokenized with ``get_tokens``, the tokens are joined with
    single spaces, and the resulting prompt is embedded with the
    ``nomic-embed-text`` model through ``oclient``.

    Args:
        document: Text to embed.

    Returns:
        The value stored under the ``"embedding"`` key of the client response.
    """
    prompt = " ".join(get_tokens(document))
    return oclient.embeddings(model="nomic-embed-text", prompt=prompt)["embedding"]


# 3. Upsert documents
# One point per FunctionPropertyGroup; the point id is the row index, which
# also indexes function_group_objects and documents.
for idx in range(X_sparse.shape[0]):
    row = X_sparse[idx]
    obj = function_group_objects[idx]
    document = documents[idx]

    payload = {
        "json": obj.model_dump_json(),
        "document": document,
        "tokens": get_tokens(document),
        "type": "FunctionPropertyGroup",
    }
    # Non-zero TF-IDF entries of this row as parallel index/value lists.
    sparse_vector = models.SparseVector(
        indices=row.indices.tolist(),
        values=row.data.tolist(),
    )
    point = models.PointStruct(
        id=idx,
        payload=payload,
        vector={
            "text-sparse": sparse_vector,
            "text-dense": get_dense_vector(document),
        },
    )

    qclient.upsert(collection_name=COLLECTION_NAME, points=[point])

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

## prep the data for vectorization
# One text document per PhysicalQuantity: its name (when set) followed by the
# names of its units, each terminated by a newline.
documents = []

for physical_quantity in physical_quantity_objects:
    parts = []
    if physical_quantity.name:
        parts.append(physical_quantity.name + "\n")
    for unit in physical_quantity.unit or ():
        parts.append(unit.name + "\n")
    document = "".join(parts)
    documents.append(document)

# # 1. Fit TF-IDF vectorizer with char n-grams
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
X_sparse = vectorizer.fit_transform(documents)
    

In [12]:

# 3. Upsert documents
# One point per PhysicalQuantity. Ids start right after the ids used for the
# FunctionPropertyGroup points, so the two ranges do not overlap.
for idx in range(X_sparse.shape[0]):
    row = X_sparse[idx]
    obj = physical_quantity_objects[idx]
    document = documents[idx]

    payload = {
        "json": obj.model_dump_json(),
        "document": document,
        "tokens": get_tokens(document),
        "type": "PhysicalQuantity",
    }
    # Non-zero TF-IDF entries of this row as parallel index/value lists.
    sparse_vector = models.SparseVector(
        indices=row.indices.tolist(),
        values=row.data.tolist(),
    )
    point = models.PointStruct(
        id=idx + len(function_group_objects),
        payload=payload,
        vector={
            "text-sparse": sparse_vector,
            "text-dense": get_dense_vector(document),
        },
    )

    qclient.upsert(collection_name=COLLECTION_NAME, points=[point])

In [13]:
documents = []
group_objects = []  # group_objects[i] is the top-level group documents[i] was built from

for top_group in function_view_adjust_object.group:
    # The document starts with the top-level group's own name.
    document = top_group.name + "\n"

    # Walk the subtree breadth-first with a FIFO queue; this visits the groups
    # in the same order as processing the tree one level at a time.
    pending = [top_group]
    while pending:
        group = pending.pop(0)

        # 1. Names of the direct child groups; the children are queued so their
        #    own children and content are collected later.
        for child in group.group or ():
            document += child.name + "\n"
            pending.append(child)

        # 2. Values of the function references held in the group's content.
        if group.content:
            reference_lists = (
                group.content.functionPropertyGroup,
                group.content.functionGuidedMethodControlGroup,
                group.content.functionGuidedMethodCalibrateGroup,
            )
            for field in reference_lists:
                for ref in field or ():
                    document += ref.value + "\n"

    documents.append(document)
    group_objects.append(top_group)

In [14]:
# Vectorize
# NOTE(review): this vectorizer is fitted on the function-view documents only,
# while other cells fit separate vectorizers for the other point types stored
# under the same "text-sparse" vector. Feature indices from different fits
# refer to different vocabularies -- confirm this is intended.
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 4))
X_sparse = vectorizer.fit_transform(documents)

# Ids 0 .. len(function_group_objects) - 1 already belong to the
# FunctionPropertyGroup points and the next len(physical_quantity_objects) ids
# to the PhysicalQuantity points. Upserting with a bare idx would overwrite the
# first FunctionPropertyGroup points, so start numbering after both ranges.
id_offset = len(function_group_objects) + len(physical_quantity_objects)

# Upsert Function Groups
for idx in range(X_sparse.shape[0]):
    row = X_sparse[idx]
    document = documents[idx]
    group_obj = group_objects[idx]  # Only this group, not the full FunctionView

    qclient.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            models.PointStruct(
                id=idx + id_offset,
                payload={
                    "json": group_obj.model_dump_json(),
                    "document": document,
                    "tokens": get_tokens(document),
                    "type": "FunctionViewAdjustTree",
                },
                vector={
                    # Non-zero TF-IDF entries of this row as parallel lists.
                    "text-sparse": models.SparseVector(
                        indices=row.indices.tolist(),
                        values=row.data.tolist(),
                    ),
                    "text-dense": get_dense_vector(document),
                },
            )
        ],
    )