# Synthetic Device Entities

This notebook is for creating synthetic entites based on the previously seeded devices for each area.

## Model Client

In [56]:
import openai

from home_assistant_datasets.secrets import get_secret
from home_assistant_datasets import model_client

MODEL_ID = "gpt-3.5-turbo-0125"

openai = openai.OpenAI(api_key=get_secret("openai_key"))
model = model_client.ModelClient(openai, MODEL_ID)

## Read Seed Data

The seed data for the prompt is manually curated to use as examples for further synthetic data generation.

In [57]:
import pathlib
import yaml

DATASET_DIR = pathlib.Path("../datasets/")
SEEDS_DIR = DATASET_DIR / "seeds"
TYPES_FILE = SEEDS_DIR / "types.yaml"
DEVICE_ENTITIES_FILE = SEEDS_DIR / "device-entities.yaml"

with open(TYPES_FILE) as f:
    seeds = yaml.load(f.read(), Loader=yaml.Loader)

platforms = seeds["platforms"]
sensor_device_classes = seeds["sensor_device_classes"]
binary_sensor_device_classes = seeds["binary_sensor_device_classes"]

with open(DEVICE_ENTITIES_FILE) as f:
    seed_device_entities = list(yaml.load_all(f.read(), Loader=yaml.Loader))

In [58]:
DEVICE_ENTITY_PROMPT_FORMAT = """
Input:
{home}
Output:
{device_entities}
"""

item = []
for seed in seed_device_entities:
    print(seed)
    home = yaml.dump(seed["home"], sort_keys=False)
    entities = yaml.dump(seed["device_entities"], sort_keys=False)
    item.append(DEVICE_ENTITY_PROMPT_FORMAT.format(home=home, device_entities=entities))

seed_device_entity_prompt = "\n".join(item)


{'home': {'name': 'Coastal Bungalow', 'thoughts': ['The bungalow in a seaside village may be a vacation home or a tranquil retreat.', 'The cozy living room with a fireplace hints at a cozy atmosphere, and there may be smart climate control for comfort.', 'The outdoor shower is convenient for beach days, so there may be smart water heating systems.'], 'desc': 'Bungalow in a seaside village in Norway', 'area_devices': {'Living Room': ['light', 'climate_control'], 'Bedrooms': ['light'], 'Porch': ['light'], 'Outdoor Shower': ['water_heater']}, 'other_devices': ['smartphone', 'laptop']}, 'device_entities': [{'Living Room': [{'name': 'light', 'entities': ['light.living_room']}, {'name': 'climate_control', 'entities': ['climage.living_room', 'sensor.living_room_temperature', 'sensor.living_room_humidity']}]}, {'Bedrooms': [{'name': 'light', 'entities': ['light.bedroom']}]}, {'Porch': [{'name': 'light', 'entities': ['light.porch']}]}, {'Outdoor Shower': [{'name': 'water_heater', 'entities': ['

In [59]:
DEVICE_TYPE_PROMPT = f"""
You are an expert in smart home automation and are generating data used to
evaluate the performance of a smart home system Home Assistant on tasks like
summarization, performing actions, or being an independent agent managing
automations and maintenace tasks.

A device in Home Assistant represents a physical or virtual object. Users interact
with a device through entities. For example a smart light bulb may have an entity
that represents the light bulb. A smart thermostat may have a climate entity and
sensor and humidity sensors. A smart dishwasher device may be represented as a few
sensors, a select entity for changing the mode, and switch entities.

The following entity types are supported:
{platforms}

Below are example inputs and outputs for generating device entities for a home.

{seed_device_entity_prompt}
"""

In [61]:
from tqdm.auto import tqdm
import random

N_DATAPOINTS = 5
AREA_DEVICES_YAML = DATASET_DIR / "area-devices.yaml"
DEVICE_ENTITIES_YAML = DATASET_DIR / "device-entities.yaml"

with open(AREA_DEVICES_YAML) as f:
    content = f.read()
    homes = list(yaml.safe_load_all(content))

random.shuffle(homes)

if len(homes) > N_DATAPOINTS:
    homes = homes[:N_DATAPOINTS]

skipped = 0
with open(DEVICE_ENTITIES_YAML, "w") as device_output:
    with tqdm(total=len(homes)) as pbar:
        for home in homes:
            response_obj = None
            for i in range(3):
                prompt = DEVICE_ENTITY_PROMPT_FORMAT.format(home=home, device_entities="")
                response = model.complete(DEVICE_TYPE_PROMPT, prompt)
                try:
                    response_obj = yaml.safe_load(response)
                except yaml.YAMLError:
                    skipped += 1
                    continue
            if response_obj is not None:
                updated_home = home.copy()
                updated_home.update({"device_entities": response_obj})
                device_output.write(yaml.dump(updated_home, explicit_start=True, sort_keys=False))
            pbar.set_description(f"Skipped {skipped}")
            pbar.update(1)

Skipped 0: 100%|██████████| 5/5 [00:56<00:00, 11.32s/it]


## Validation

Examine the dataset and look at the data and statistics. This is also a chance to perform any manual cleaning if there are minor formatting issues in the generated data.


In [85]:
import itertools
from operator import itemgetter

with open(DEVICE_ENTITIES_YAML, "r") as f:
    content = f.read()
    homes = list(yaml.load_all(content, Loader=yaml.Loader))

entity_counts =  {}
domain_counts = {}
total_homes = len(homes)
total_areas = 0
total_entities = 0
for home in homes:
    area_devices = home["device_entities"]
    for area_data in area_devices:
        area = next(iter(area_data.keys()))
        device_data = next(iter(area_data.values()))
        for device in device_data:
            #for (area, device_data) in area_devices.items():
            entities = device["entities"]
            total_areas += 1
            for entity_id in entities:
                if entity_id not in entity_counts:
                    entity_counts[entity_id] = 0
                entity_counts[entity_id] += 1
                domain = entity_id.split(".")[0]
                domain_counts[domain] = domain_counts.get(domain, 0) + 1
                total_entities += 1


print(f"Total homes: {total_homes}")
print(f"Total areas: {total_areas} (average {total_areas / total_homes:0.2f} per home)")
print(f"Total entities: {total_entities} (average {total_entities / total_areas:0.2f} per area)")
print(f"Total unique entities: {len(entity_counts)}")

sorted_dict = dict(sorted(domain_counts.items(), key=itemgetter(1), reverse=True))
domain_rank = [ (k, f"{(v / total_entities)*100:.0f}%") for k, v in itertools.islice(sorted_dict.items(), 15) ]

print(f"Domains:")
domain_rank

Total homes: 5
Total areas: 60 (average 12.00 per home)
Total entities: 79 (average 1.32 per area)
Total unique entities: 63
Domains:


[('light', '29%'),
 ('switch', '14%'),
 ('device_tracker', '14%'),
 ('media_player', '13%'),
 ('sensor', '11%'),
 ('remote', '5%'),
 ('climate', '4%'),
 ('binary-sensor', '3%'),
 ('camera', '3%'),
 ('binary_sensor', '3%'),
 ('cover', '3%')]