In [1]:
!pip install nltk spacy transformers pyyaml optuna sentence-transformers ipywidgets

!python -m spacy download en_core_web_sm

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.5-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.5-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.4/247.4 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling co

In [3]:
import spacy
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from typing import List, Dict, Any, Optional, Union, Tuple
from dataclasses import dataclass, field, asdict
import re
import yaml
import ipywidgets as widgets
from IPython.display import display
import datetime

# Load spaCy NLP pipeline
# `nlp` is the shared spaCy pipeline called by preprocess_text_spacy and
# extract_detailed_odd_info.
nlp = spacy.load("en_core_web_sm")

# Load HuggingFace NER model and tokenizer
# The checkpoint name is defined once so the tokenizer and the model are always
# loaded from the same checkpoint.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# NOTE(review): no aggregation_strategy is passed, so results are presumably
# returned per word-piece; extract_detailed_odd_info handles the "##" markers
# found in each returned 'word'. `ner_pipeline` is called by perform_ner.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [4]:
@dataclass
class CalibrationDetails:
    """Calibration parameters for a single sensor.

    Every field defaults to a fresh empty list, so an instance can be created
    with no arguments.
    """
    # Rows of the camera intrinsic matrix; create_odd_structure leaves this
    # empty for the lidar and radar sensors.
    camera_intrinsic: List[List[float]] = field(default_factory=list)
    # Rotation components. NOTE(review): create_odd_structure passes four
    # values, presumably a quaternion -- confirm component order.
    rotation: List[float] = field(default_factory=list)
    # Translation components. NOTE(review): create_odd_structure passes three
    # values, presumably x/y/z -- units are not stated in this file.
    translation: List[float] = field(default_factory=list)

@dataclass
class Sensor:
    """A sensor described by a type label and its calibration data."""
    # Sensor type label; create_odd_structure uses "camera", "lidar" and "radar".
    SensorType: str
    # The field shares its name with the CalibrationDetails class; the
    # annotation refers to the module-level class.
    CalibrationDetails: CalibrationDetails

@dataclass
class Sensors:
    """Container for the sensors of a vehicle; each slot is optional."""
    # Each slot defaults to None, meaning that no sensor was supplied for it.
    camera: Optional[Sensor] = None
    lidar: Optional[Sensor] = None
    radar: Optional[Sensor] = None

@dataclass
class Position:
    """A three-component position (x, y, z).

    NOTE(review): the coordinate frame and units are not stated in this file.
    """
    x: float
    y: float
    z: float

@dataclass
class Orientation:
    """A four-component orientation (qw, qx, qy, qz).

    NOTE(review): the field names suggest a quaternion with scalar part ``qw``;
    confirm against the data source.
    """
    qw: float
    qx: float
    qy: float
    qz: float

@dataclass
class VehicleState:
    """Vehicle pose together with its sensor set."""
    # Each field shares its name with the class it holds; the annotations refer
    # to the module-level classes of the same name.
    Orientation: Orientation
    Position: Position
    Sensors: Sensors

@dataclass
class Environment:
    """Free-text description of the scene surrounding the vehicle."""
    # Lighting description; create_odd_structure uses "Unknown" as placeholder.
    Illumination: str
    # Names of objects present in the scene.
    Objects: List[str]
    # Scene identifier or label (create_odd_structure uses "scene-0655").
    SceneType: str
    # Weather description; create_odd_structure uses "Unknown" as placeholder.
    Weather: str

@dataclass
class ODD:
    """Top-level record that convert_odd_to_yaml serializes to YAML.

    NOTE(review): "ODD" presumably stands for Operational Design Domain; the
    file never expands the acronym.
    """
    # Identifier string for this record.
    ODD_ID: str
    # The field shares its name with the Environment class defined above.
    Environment: Environment
    # Free-form mapping; create_odd_structure fills the keys "RoadType",
    # "Route", "SpeedRange" and "Traffic".
    OperationalConditions: Dict[str, Any]
    # Timestamp as a string; create_odd_structure stores an ISO-format datetime.
    Timestamp: str
    # The field shares its name with the VehicleState class defined above.
    VehicleState: VehicleState

In [5]:
# Text preprocessing (lemmatization and stopword removal using SpaCy)
def preprocess_text_spacy(text: str) -> List[str]:
    """Return the lower-cased lemmas of the useful tokens in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``; tokens
    that are not purely alphabetic or that are stop words are skipped.
    """
    lemmas: List[str] = []
    for tok in nlp(text):
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

# Value and unit extraction
def parse_value_and_unit(text: str) -> Tuple[Optional[Union[int, float]], Optional[str]]:
    """Extract the first number in ``text`` and the text that follows it.

    Args:
        text: Free-form text such as ``"speed is 50 km/h."``.

    Returns:
        A ``(value, unit)`` tuple. ``value`` is an ``int`` when the matched
        number has no decimal point and a ``float`` otherwise. ``unit`` is
        everything after the number with surrounding whitespace, leading
        ``/`` characters and trailing ``.`` characters removed, or ``None``
        when nothing is left. Both elements are ``None`` when ``text`` is
        empty or contains no digits.
    """
    if not text:
        return None, None

    # A sign is part of the number only when it does not directly follow a
    # word character: "-5 C" is negative, while "10-20" and "model-5" still
    # yield the unsigned 10 and 5.
    number_match = re.search(r'((?:(?<!\w)[-+])?\d+(?:\.\d+)?)', text)
    if number_match is None:
        return None, None

    value_str = number_match.group(1)
    # The regex guarantees a parseable literal, so no ValueError handling is needed.
    value: Union[int, float] = float(value_str) if '.' in value_str else int(value_str)

    unit_text = text[number_match.end():].strip()
    unit = unit_text.lstrip('/ ').rstrip('.').strip() or None
    return value, unit

# BERT based Named Entity Recognition (NER)
def perform_ner(text: str) -> List[Dict[str, Any]]:
    """Run the module-level ``ner_pipeline`` on ``text`` and return its result unchanged."""
    ner_results = ner_pipeline(text)
    return ner_results

In [6]:
def extract_detailed_odd_info(text: str) -> Dict[str, Any]:
    """Extract named entities and their adjective attributes from ``text``.

    The text is tagged with ``perform_ner`` and parsed with the module-level
    spaCy pipeline ``nlp``. Word-piece continuations (pieces starting with
    ``"##"``) are merged back into the preceding piece so that each entity is
    a whole word rather than a fragment.

    Args:
        text: Natural-language description to analyse.

    Returns:
        A dict with three keys:

        * ``'entities'``: list of dicts with ``'id'``, ``'type'``, ``'text'``
          and ``'attributes'``. ``'attributes'`` always holds ``'ner_type'``
          and additionally one entry per adjective whose syntactic head is
          the entity's text.
        * ``'relationships'``: always an empty list (not populated here).
        * ``'attributes'``: always an empty dict (not populated here).
    """
    doc = nlp(text)
    extracted_data: Dict[str, Any] = {'entities': [], 'relationships': [], 'attributes': {}}
    entity_map: Dict[str, Dict[str, Any]] = {}

    # Pass 1: glue "##" continuations onto the preceding piece. When the
    # pipeline reports token indices, only directly adjacent pieces are merged,
    # so a continuation whose first piece was not tagged stays on its own.
    merged_tokens: List[Dict[str, str]] = []
    previous_index = None
    for ner_entity in perform_ner(text):
        piece = ner_entity['word']
        piece_index = ner_entity.get('index')
        follows_previous = (
            previous_index is None
            or piece_index is None
            or piece_index == previous_index + 1
        )
        if piece.startswith('##') and merged_tokens and follows_previous:
            merged_tokens[-1]['word'] += piece.replace('##', '')
        else:
            # Aggregated pipelines report the label under 'entity_group'.
            label = ner_entity.get('entity') or ner_entity.get('entity_group') or ''
            merged_tokens.append({'word': piece.replace('##', ''), 'label': label})
        previous_index = piece_index

    # Pass 2: classify each whole word and register it for attribute lookup.
    for entity_number, merged in enumerate(merged_tokens):
        word = merged['word']
        lowered = word.lower()
        entity_type_raw = merged['label'].replace('B-', '').replace('I-', '').capitalize()

        odd_type = 'Entity'
        if entity_type_raw in ('Loc', 'Gpe', 'Org'):
            odd_type = 'Environment'
        elif entity_type_raw == 'Per':
            odd_type = 'Object'
        elif entity_type_raw == 'Misc':
            if any(keyword in lowered for keyword in ('car', 'vehicle', 'truck')):
                odd_type = 'Vehicle'
            elif any(keyword in lowered for keyword in ('weather', 'rain', 'sunny')):
                odd_type = 'Environment'

        new_entity = {
            'id': f"{odd_type.lower()}_{entity_number}",
            'type': odd_type,
            'text': word,
            'attributes': {'ner_type': {'name': 'ner_type', 'value': {'value': entity_type_raw}}}
        }
        extracted_data['entities'].append(new_entity)
        # A later entity with the same lower-cased text replaces the earlier one here.
        entity_map[lowered] = new_entity

    # Attach each adjective to the entity whose text equals the adjective's head token.
    for token in doc:
        if token.pos_ != 'ADJ':
            continue
        entity = entity_map.get(token.head.text.lower())
        if entity is None:
            continue
        attr_name = token.text.lower()
        entity['attributes'][attr_name] = {'name': attr_name, 'value': {'value': token.text}}

    # Additional extraction rules can be added here as needed

    return extracted_data

In [7]:
def create_odd_structure(extracted_info: Dict[str, Any]) -> ODD:
    """Build the placeholder ODD record.

    ``extracted_info`` is accepted but not read: every value is a fixed
    example, except ``Timestamp``, which is the current time in ISO format.
    Replace with dynamic construction as needed.
    """
    def make_sensor(kind, rotation, translation, intrinsic=None):
        # Sensors without an intrinsic matrix get an empty list.
        calibration = CalibrationDetails(
            camera_intrinsic=[] if intrinsic is None else intrinsic,
            rotation=rotation,
            translation=translation,
        )
        return Sensor(SensorType=kind, CalibrationDetails=calibration)

    camera_sensor = make_sensor(
        "camera",
        rotation=[0.68, -0.66, 0.21, -0.21],
        translation=[1.57, 0.50, 1.50],
        intrinsic=[[1257.86, 0, 827.24], [0, 1257.86, 450.91], [0, 0, 1]],
    )
    lidar_sensor = make_sensor(
        "lidar",
        rotation=[0.70, -0.01, 0.01, -0.70],
        translation=[0.98, 0.0, 1.84],
    )
    radar_sensor = make_sensor(
        "radar",
        rotation=[0.04, 0.0, 0.0, -0.99],
        translation=[-0.56, -0.61, 0.53],
    )

    state = VehicleState(
        Orientation=Orientation(qw=0.9999, qx=-0.01, qy=-0.001, qz=0.0056),
        Position=Position(x=1845.52, y=867.91, z=0.0),
        Sensors=Sensors(camera=camera_sensor, lidar=lidar_sensor, radar=radar_sensor),
    )
    surroundings = Environment(
        Illumination="Unknown",
        Objects=[],
        SceneType="scene-0655",
        Weather="Unknown",
    )
    conditions = {
        "RoadType": "Unknown",
        "Route": "boston-seaport",
        "SpeedRange": "Unknown",
        "Traffic": "Unknown",
    }

    return ODD(
        ODD_ID="20250830151033_0cfcc4",
        Environment=surroundings,
        OperationalConditions=conditions,
        Timestamp=datetime.datetime.now().isoformat(),
        VehicleState=state,
    )

In [9]:
def convert_odd_to_yaml(odd: ODD) -> str:
    """Serialize ``odd`` to YAML text without sorting the keys.

    The dataclass tree is first flattened to plain dicts and lists with
    ``dataclasses.asdict``.
    """
    return yaml.dump(asdict(odd), sort_keys=False)

In [8]:
import ipywidgets as widgets
from IPython.display import display

# Multi-line input box for the free-text description; on_button_clicked reads
# its value.
text_input = widgets.Textarea(
    value="",
    placeholder="Enter detailed natural language text related to ODD...",
    description="Text:",
    layout=widgets.Layout(width='auto', height='120px')
)

# Output widget used by on_button_clicked as the context for its printed messages.
output_area = widgets.Output()
# Button that triggers the extraction and YAML generation run.
process_button = widgets.Button(
    description="Generate YAML",
    button_style='success',
    tooltip="Generate ODD YAML",
    icon='cogs'
)

def on_button_clicked(b):
    """Click handler: run the text-to-YAML steps and print the result.

    Everything is printed inside ``output_area``, which is cleared first.
    ``b`` is the clicked button and is not used.
    """
    with output_area:
        output_area.clear_output()
        input_text = text_input.value.strip()
        if input_text:
            print("Preprocessing text...")
            # The preprocessing result is not used by the later steps.
            preprocess_text_spacy(input_text)
            print("Extracting information...")
            extracted = extract_detailed_odd_info(input_text)
            entity_texts = [entity['text'] for entity in extracted['entities']]
            print(f"Extracted entities: {entity_texts}")
            yaml_output = convert_odd_to_yaml(create_odd_structure(extracted))
            print("\nGenerated YAML:\n")
            print(yaml_output)
        else:
            print("Please enter some text.")

# Run on_button_clicked whenever the button is pressed.
process_button.on_click(on_button_clicked)

# Show the input box, button and output area stacked vertically. The metadata
# dict is attached to the display output.
# NOTE(review): confirm the front end actually uses these metadata keys.
display(widgets.VBox([text_input, process_button, output_area]), metadata={'tags': ['collapsible', 'collapsed'], 'jupyter': {'outputs_order': [], 'source_hidden': False, 'metadata': {'widgets': {'state': {}}}}})

VBox(children=(Textarea(value='', description='Text:', layout=Layout(height='120px', width='auto'), placeholde…