In [4]:
import sys
import os

# Resolve the project root relative to the current working directory
# (adjust the relative path if this notebook is moved).
project_root = os.path.abspath("../..")  # "../.." climbs two directory levels

# Put the project root at the front of sys.path exactly once; presumably this
# is what lets the project-local imports in later cells resolve.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
%pip install transformers torch

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch
  Using cached torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl.metadata (28 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Do

In [22]:
from transformers import AutoTokenizer, AutoModel
import torch

# Pick the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")  # Debugging

# Fetch the tokenizer and the model weights from the Hugging Face Hub.
model_name = "lukasweber/WG_BERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# `.to(device)` moves the module to the selected device (GPU or CPU) and
# returns the same module object, so it can be chained onto the load call.
model = AutoModel.from_pretrained(model_name, ignore_mismatched_sizes=True).to(device)

def get_bert_embeddings(text):
    """Generate mean-pooled sentence embeddings using the loaded BERT model.

    The text is tokenized (with padding and truncation enabled), run through
    the model without gradient tracking, and the last hidden states are
    averaged over the token axis. Padding positions are excluded from the
    average through the attention mask, so texts of different lengths can be
    embedded in one batch without the padding skewing the result.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A flat list of floats when the batch holds exactly one text
        (the batch axis is squeezed away); a list of such lists otherwise.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Input tensors must live on the same device as the model.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask supplied by the tokenizer: fall back to a plain mean.
        embeddings = hidden.mean(dim=1)
    else:
        # Zero out padded positions, then divide by the number of real tokens.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        embeddings = (hidden * mask).sum(dim=1) / token_counts

    # Move embeddings back to CPU and convert to a plain Python list.
    return embeddings.cpu().squeeze(0).tolist()

# Example usage / smoke test. `embedding` stays defined at module level; the
# cell that creates the Qdrant collection reads it to size the vectors.
text = "Hello, how are you?"
embedding = get_bert_embeddings(text)
print(len(embedding))  # Prints the embedding dimension; the recorded run printed 768

Using device: cuda


Some weights of BertModel were not initialized from the model checkpoint at lukasweber/WG_BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


768


In [16]:
from database.database import qclient
from qdrant_client import QdrantClient, models

COLLECTION_NAME = "wg_bert_embeddings"
# Create the collection only when it does not exist yet; an existing
# collection is left untouched.
if not qclient.collection_exists(COLLECTION_NAME):
    qclient.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            # `embedding` is the flat Python list returned by
            # get_bert_embeddings (a list has no `.shape`), so its length
            # is the vector dimension.
            size=len(embedding),
            distance=models.Distance.COSINE,
        ),
    )

In [13]:
import uuid
from utils import get_tokens


def create_component_vector(name, description, ecu_system):
    """Embed one component and upsert it into the Qdrant collection.

    The component name and description are joined with a newline, passed
    through `get_tokens`, re-joined with spaces and embedded with
    `get_bert_embeddings`. The resulting vector is stored under a fresh
    random UUID in `COLLECTION_NAME`.

    Args:
        name: Component name; stored in the payload and used for the embedding.
        description: Component description; stored in the payload and used
            for the embedding.
        ecu_system: Stored in the payload under "ecu_system".
    """
    text = " ".join(get_tokens(name + "\n" + description))

    # get_bert_embeddings already returns a plain Python list, so it is used
    # as-is; calling tensor methods such as .squeeze() on it would raise
    # AttributeError.
    vector = get_bert_embeddings(text)

    # Insert into Qdrant
    qclient.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={
                    "name": name,
                    "description": description,
                    "type": "Component",
                    "ecu_system": ecu_system,
                },
            )
        ],
    )

In [14]:
from database.database import get_all_components, driver

# Read every component of the "CCU1" system inside a read transaction, then
# print each record and store its embedding.
with driver.session() as session:
    all_components = session.execute_read(get_all_components, "CCU1")
    for component in all_components:
        print(component)
        create_component_vector(
            name=component["name"],
            description=component["description"],
            ecu_system="CCU1",
        )

<Record name='C24' description='Connector, 21-pole' file_id=None purpose=None more_description=None>
<Record name='C78' description='Connector, 21-pole' file_id=None purpose=None more_description=None>
<Record name='C285' description='Splice, door driver' file_id=None purpose=None more_description=None>
<Record name='C8052' description='Splice, ground, external' file_id=None purpose=None more_description=None>
<Record name='C8056' description='Splice, B' file_id=None purpose=None more_description=None>
<Record name='C8068' description='Connector, 6-pole' file_id=None purpose=None more_description=None>
<Record name='C8069' description='Connector, 6-pole' file_id=None purpose=None more_description=None>
<Record name='E106' description='Control unit, DCS, driver' file_id=None purpose=None more_description=None>
<Record name='E107' description='Control unit, DCS, passenger' file_id=None purpose=None more_description=None>
<Record name='G8-3' description='Splice, ground' file_id=None purpo

#### IO List Processing

In [11]:
import os
from xmltodict import parse

# Read the raw XML bytes; the file handle is only needed for the read itself.
with open("../../data/input/io_lists/PtIOList_COO.xml", "rb") as file:
    file_content = file.read()

# Parse the XML content into nested dictionaries.
pt_io_list = parse(file_content)

# The "IO" entry is a list when the element repeats and a single mapping when
# it occurs once; it is missing (or None) when there are no usable IO entries.
# Normalise all of these cases to a list.
io_entries = (pt_io_list["PtIOList"] or {}).get("IO")
if io_entries is None:
    io_list = []
elif isinstance(io_entries, list):
    io_list = list(io_entries)
else:
    io_list = [io_entries]

print(f"Processing IO Mapping - total IOs: {len(io_list)}")

Processing IO Mapping - total IOs: 658


In [10]:
from utils import get_clean_io_name

def get_io_tokens(io):
    """Build the list of unique tokens that describe one IO entry.

    The cleaned IO name, the presentation name and the IO service description
    are joined with newlines and tokenized with `get_tokens`. Duplicate tokens
    are dropped while the order of first occurrence is kept, so the same IO
    entry always produces the same token sequence.

    Args:
        io: Mapping for one IO entry. "Name" and "IOService" are read
            unconditionally; "NamePresentation" and the "Description" inside
            "IOService" are optional and only used when they carry a "#text"
            value.

    Returns:
        List of unique tokens in first-occurrence order.
    """
    name = io["Name"]

    name_presentation = ""
    if "NamePresentation" in io and "#text" in io["NamePresentation"]:
        name_presentation = io["NamePresentation"]["#text"]

    description = ""
    if "Description" in io["IOService"] and "#text" in io["IOService"]["Description"]:
        description = io["IOService"]["Description"]["#text"]

    # If the presentation name merely repeats the name, do not add it twice.
    if name == name_presentation:
        name_presentation = ""

    data = get_clean_io_name(name) + "\n" + name_presentation + "\n" + description

    tokens = get_tokens(data)

    # dict.fromkeys removes duplicates but keeps insertion order; a set would
    # yield an order that changes between interpreter runs.
    return list(dict.fromkeys(tokens))

In [27]:
def get_points(embeddings, ecu_system, collection_name):
    """Query a Qdrant collection with a vector, restricted to one ECU system.

    Args:
        embeddings: Query vector passed to `qclient.query_points` as `query`.
        ecu_system: Only points whose "ecu_system" payload field matches this
            value are considered.
        collection_name: Name of the collection to search.

    Returns:
        The `points` attribute of the query response.
    """
    ecu_condition = models.FieldCondition(
        key="ecu_system",
        match=models.MatchValue(value=ecu_system),
    )
    return qclient.query_points(
        collection_name=collection_name,
        query=embeddings,
        query_filter=models.Filter(must=[ecu_condition]),
    ).points

In [26]:
from database.database import oclient


def get_nomic_embeddings(text):
    """Request an embedding for `text` from the "nomic-embed-text" model.

    Args:
        text: Prompt text sent to `oclient.embeddings`.

    Returns:
        The value stored under the "embedding" key of the response.
    """
    return oclient.embeddings(model="nomic-embed-text", prompt=text)["embedding"]

In [30]:
import pandas as pd
from database.database import qclient

# (label, embedding function, Qdrant collection) for each embedding backend
# whose matches are written to the comparison table.
embedding_backends = [
    ("WG_BERT", get_bert_embeddings, COLLECTION_NAME),
    ("NOMIC", get_nomic_embeddings, "components"),
]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame row by row re-allocates on every insert.
io_rows = []

# loop with enumerate to get the index of the IO
for idx, io in enumerate(io_list):
    print(f"Processing IO Mapping - IO: {idx + 1}")
    text = " ".join(get_io_tokens(io))

    name = io["Name"]
    name_presentation = ""
    if "NamePresentation" in io and "#text" in io["NamePresentation"]:
        name_presentation = io["NamePresentation"]["#text"]

    for label, embed, collection_name in embedding_backends:
        points = get_points(embed(text), "CCU1", collection_name)

        # Keep only the first three returned points per backend.
        for point in points[:3]:
            io_rows.append(
                {
                    "IO Mapping": name,
                    "Description": name_presentation,
                    "Embedding": label,
                    "Component Name": point.payload["name"],
                    "Component Description": point.payload["description"],
                    "Score": point.score,
                }
            )

# Passing `columns` fixes the column order and keeps the header even when no
# rows were produced.
io_df = pd.DataFrame(
    io_rows,
    columns=[
        "IO Mapping",
        "Description",
        "Embedding",
        "Component Name",
        "Component Description",
        "Score",
    ],
)

io_df.to_csv("../../data/output/io_mapping_ccu1.csv", index=False)

Processing IO Mapping - IO: 1
Processing IO Mapping - IO: 2
Processing IO Mapping - IO: 3
Processing IO Mapping - IO: 4
Processing IO Mapping - IO: 5
Processing IO Mapping - IO: 6
Processing IO Mapping - IO: 7
Processing IO Mapping - IO: 8
Processing IO Mapping - IO: 9
Processing IO Mapping - IO: 10
Processing IO Mapping - IO: 11
Processing IO Mapping - IO: 12
Processing IO Mapping - IO: 13
Processing IO Mapping - IO: 14
Processing IO Mapping - IO: 15
Processing IO Mapping - IO: 16
Processing IO Mapping - IO: 17
Processing IO Mapping - IO: 18
Processing IO Mapping - IO: 19
Processing IO Mapping - IO: 20
Processing IO Mapping - IO: 21
Processing IO Mapping - IO: 22
Processing IO Mapping - IO: 23
Processing IO Mapping - IO: 24
Processing IO Mapping - IO: 25
Processing IO Mapping - IO: 26
Processing IO Mapping - IO: 27
Processing IO Mapping - IO: 28
Processing IO Mapping - IO: 29
Processing IO Mapping - IO: 30
Processing IO Mapping - IO: 31
Processing IO Mapping - IO: 32
Processing IO Map