# Synthetic Areas & Devices

## Model Client

In [1]:
import openai

from home_assistant_datasets.secrets import get_secret
from home_assistant_datasets import model_client

# Model identifier handed to ModelClient for every completion in this notebook.
MODEL_ID = "gpt-3.5-turbo-0125"

# Bind the client to its own name rather than rebinding `openai`, so the
# imported module is not shadowed by a client instance for later cells.
openai_client = openai.OpenAI(api_key=get_secret("openai_key"))
model = model_client.ModelClient(openai_client, MODEL_ID)

## Read seed data used in the prompt

The seed data is manually curated and is used in the prompt as examples for further synthetic data generation.

In [4]:
import pathlib
import yaml

# Locations of the seed files, relative to the current working directory.
DATASET_DIR = pathlib.Path("../datasets/")
SEEDS_DIR = DATASET_DIR.joinpath("seeds")
HOME_AREAS_FILE = SEEDS_DIR.joinpath("home-areas.yaml")

# The file is parsed as a multi-document YAML stream; keep every document.
# NOTE(review): yaml.Loader can construct arbitrary Python objects. That is
# acceptable only while the seed file is trusted, hand-maintained data.
with HOME_AREAS_FILE.open() as seed_file:
    seed_areas = [*yaml.load_all(seed_file.read(), Loader=yaml.Loader)]

In [10]:
# Template for one worked example: the home description as "Input" and the
# areas generated for it as "Output".
AREA_PROMPT_FORMAT = """
Input:
{home}
Output:
{areas}
"""

# Render each seed record through the template (keys kept in file order via
# sort_keys=False) and join the rendered examples with a newline.
seed_area_prompt = "\n".join(
    AREA_PROMPT_FORMAT.format(
        home=yaml.dump(seed["home"], sort_keys=False),
        areas=yaml.dump(seed["areas"], sort_keys=False),
    )
    for seed in seed_areas
)

## Generate Areas and devices

In [25]:
# Instruction text passed as the first argument to `model.complete` for every
# home. It is an f-string so that the rendered seed examples held in
# `seed_area_prompt` are appended after the instructions.
# NOTE(review): the prompt text contains typos ("to used to train",
# "aparment"). They are left untouched here because editing the text changes
# what is sent to the model; fix them deliberately if regenerating the data.
AREA_DEVICES_PROMPT = f"""
You are generating synthetic data to used to train models for Home Assistant
and used to evaluate things like generating a summary, performing home automation
actions, or for generating other synthetic data.

You use your knowledge about the world to generate details about homes that
can be used for synthetic smart home automation data. For example, an apartment
may have a smart thermostat, a house may have a smart garage door opener or 
smart lock and camera, and all houses may have a smart light or weather feed
air quality, or a smart speaker or television. The needs of a home owner
may vary if they are a single person or family, or where in the world they
live. For example, a high rise aparment probably does not have a backyard.

Do not be cliche. Do not assume every home has a smart watch. Don't assume
every room has a smart light just because it is listed, but of course many will.

Below are example inputs and outputs for generating areas and devices for a home,
thinking step by step about the needs of the home.

{seed_area_prompt}
"""

In [32]:
from tqdm.auto import tqdm
import itertools

import yaml
import pathlib
import random

# Upper bound on the number of homes sent to the model in one run.
N_DATAPOINTS = 25
# Completions requested per home before that home is counted as skipped.
MAX_ATTEMPTS = 3
DATASET_DIR = pathlib.Path("../datasets/")
AREA_DEVICES_YAML = DATASET_DIR / "area-devices.yaml"

with open(DATASET_DIR / "homes.yaml", "r") as f:
    content = f.read()

data = yaml.safe_load(content)
homes = data["homes"]
# Shuffle in place, then keep at most N_DATAPOINTS homes (a random sample).
random.shuffle(homes)

if len(homes) > N_DATAPOINTS:
    homes = homes[:N_DATAPOINTS]

# Number of homes for which no usable response was obtained.
skipped = 0
with open(AREA_DEVICES_YAML, "w") as device_output:
    with tqdm(total=len(homes)) as pbar:
        for home in homes:
            batch_yaml = yaml.dump(home, explicit_start=True, sort_keys=False)

            # Ask the model up to MAX_ATTEMPTS times. An attempt is usable
            # only if it parses as YAML *and* yields a mapping; an empty or
            # scalar response is treated the same as a parse failure.
            response_obj = None
            for _ in range(MAX_ATTEMPTS):
                response = model.complete(AREA_DEVICES_PROMPT, batch_yaml)
                try:
                    parsed = yaml.safe_load(response)
                except yaml.YAMLError:
                    continue
                if isinstance(parsed, dict):
                    response_obj = parsed
                    break

            if response_obj is None:
                # Every attempt failed: record the skip instead of crashing.
                skipped += 1
            else:
                # Drop a top-level "Output" key if the model included one.
                response_obj.pop("Output", None)
                updated_home = home.copy()
                updated_home.update({"areas": response_obj})
                # Each home is written as its own YAML document (explicit
                # "---" start) so the file can be read back with load_all.
                device_output.write(yaml.dump(updated_home, explicit_start=True, sort_keys=False))
            pbar.set_description(f"Skipped {skipped}")
            pbar.update(1)


Skipped 0: 100%|██████████| 25/25 [01:05<00:00,  2.60s/it]


## Data Distributions

Validate the generated data and look at its distributions. This also gives us a chance to hand-fix any
individual bad records.

In [37]:
from operator import itemgetter

# Read every generated home record back from disk (one YAML document each).
# NOTE(review): yaml.Loader can construct arbitrary Python objects; this file
# holds model-derived content, so consider yaml.SafeLoader if only plain
# mappings, lists, and scalars are expected.
with open(AREA_DEVICES_YAML, "r") as f:
    content = f.read()
    homes = list(yaml.load_all(content, Loader=yaml.Loader))

# Fail with a clear message rather than a ZeroDivisionError further down.
assert homes, f"No home records found in {AREA_DEVICES_YAML}"

# device name -> number of occurrences across all homes
devices_counts = {}
total_homes = len(homes)
total_areas = 0
total_devices = 0
for home in homes:
    area_devices = home["areas"]["area_devices"]
    other_devices = home["areas"]["other_devices"]
    total_areas += len(area_devices)
    # Count the devices listed under each area first, then the home's
    # "other_devices" list, in the order they appear in the record.
    for device in itertools.chain.from_iterable([*area_devices.values(), other_devices]):
        devices_counts[device] = devices_counts.get(device, 0) + 1
        total_devices += 1


print(f"Total homes: {total_homes}")
print(f"Total areas: {total_areas} (average {total_areas / total_homes:0.2f} per home)")
# This average is divided by the number of homes, so it is labelled "per home".
print(f"Total devices: {total_devices} (average {total_devices / total_homes:0.2f} per home)")
print(f"Total unique devices: {len(devices_counts)}")

# Rank devices by frequency and show the 15 most common, each as a share of
# all counted devices.
sorted_dict = dict(sorted(devices_counts.items(), key=itemgetter(1), reverse=True))
device_rank = [ (k, f"{(v / total_devices)*100:.0f}%") for k, v in itertools.islice(sorted_dict.items(), 15) ]

print("Devices:")
device_rank

Total homes: 25
Total areas: 145 (average 5.80 per home)
Total devices: 293 (average 11.72 per area)
Total unique devices: 52
Devices:


[('light', '37%'),
 ('smart_tv', '8%'),
 ('smart_speaker', '7%'),
 ('smartphone', '6%'),
 ('laptop', '5%'),
 ('tablet', '4%'),
 ('camera', '4%'),
 ('smart_light', '2%'),
 ('sprinkler', '2%'),
 ('thermostat', '2%'),
 ('speaker', '2%'),
 ('smartwatch', '1%'),
 ('smart_blind', '1%'),
 ('smartphone 2', '1%'),
 ('cover', '1%')]