# Synthetic Areas

## Model Client

In [4]:
import openai
import google.generativeai as genai

from home_assistant_datasets import secrets
from home_assistant_datasets import model_client

# Point the secrets helper at the secrets file one directory above the
# working directory.
secrets.DEFAULT_SECRETS_FILE = "../secrets.yaml"

# Alternative OpenAI-backed client, kept disabled for reference.
# NOTE(review): as written it calls `get_secret` unqualified and rebinds the
# `openai` module name; adjust before re-enabling.
# MODEL_ID = "gpt-3.5-turbo-0125"

# openai = openai.OpenAI(api_key=get_secret("openai_key"))
# model = model_client.ModelClient(openai, MODEL_ID)

# Gemini Flash is a higher-quality and cheaper model than the GPT alternatives.
MODEL_ID = "gemini-1.5-flash"
genai.configure(api_key=secrets.get_secret("google_api_key"))
# `model` is the client whose `complete()` method the generation cell calls.
model = model_client.GoogleClient(MODEL_ID)

## Read seed data used in the prompt

The seed data for the prompt is manually curated and serves as examples for further synthetic data generation.

In [9]:
import pathlib
import yaml

SEEDS_DIR = pathlib.Path("./seeds/")
HOME_AREAS_FILE = SEEDS_DIR.joinpath("areas.yaml")

# The seed file is a multi-document YAML stream; keep every document as one
# seed entry (each entry is later read through its "home" and "areas" keys).
seed_areas = [
    seed_doc
    for seed_doc in yaml.load_all(HOME_AREAS_FILE.read_text(), Loader=yaml.Loader)
]

In [10]:
# Template for a single few-shot example: the home description followed by
# the areas expected for it, both rendered as YAML.
AREA_PROMPT_FORMAT = """
Input:
{home}
Output:
{areas}
"""

# Render every seed as one Input/Output example (key order preserved) and
# stack the examples into a single prompt fragment.
seed_area_prompt = "\n".join(
    AREA_PROMPT_FORMAT.format(
        home=yaml.dump(seed["home"], sort_keys=False),
        areas=yaml.dump(seed["areas"], sort_keys=False),
    )
    for seed in seed_areas
)

## Generate Areas and devices

In [None]:
# Instruction prompt passed as the first argument to `model.complete()` in the
# generation cell. It ends with the few-shot examples in `seed_area_prompt`,
# interpolated once when this cell runs, so re-run this cell after changing
# the seeds.
# NOTE(review): "to used to train" in the first sentence looks like a typo for
# "to be used to train"; left untouched because editing the text changes the
# model input.
AREA_DEVICES_PROMPT = f"""
You are generating synthetic data to used to train models for Home Assistant
and used to evaluate things like generating a summary, performing home automation
actions, or for generating other synthetic data.

You use your knowledge about the world to generate details about homes that
can be used for synthetic smart home automation data. For example, an apartment
may have a smart thermostat, a house may have a smart garage door opener or
smart lock and camera, and all houses may have a smart light or weather feed
air quality, or a smart speaker or television. The needs of a home owner
may vary if they are a single person or family, or where in the world they
live. For example, a high rise apartment probably does not have a backyard.

Do not be cliche. Do not assume every home has a smart watch. Don't assume
every room has a smart light just because it is listed, but of course many will.

Below are example inputs and outputs for generating areas and devices for a home,
thinking step by step about the needs of the home.

{seed_area_prompt}
"""

In [15]:
from tqdm.auto import tqdm
from typing import Any
import itertools

import yaml
import pathlib
import random
import shutil
import slugify

# Upper bound on the number of homes sent to the model.
N_DATAPOINTS = 40

DATASET_DIR = pathlib.Path("../datasets/")
AREAS_OUTPUT_DIR = DATASET_DIR.joinpath("areas-v2")

# Start from an empty output directory so files from earlier runs never linger.
shutil.rmtree(AREAS_OUTPUT_DIR, ignore_errors=True)
AREAS_OUTPUT_DIR.mkdir(exist_ok=True)

content = DATASET_DIR.joinpath("homes.yaml").read_text()
data = yaml.safe_load(content)

# Shuffle before truncating so the kept homes are a random subset.
homes = data["homes"]
random.shuffle(homes)
if N_DATAPOINTS < len(homes):
    homes = homes[:N_DATAPOINTS]


# Number of times each base id has been handed out again after its first use.
home_ids = {}
def generate_home_id(home: dict[str, Any]) -> str:
    """Return a slug id for ``home`` that is unique among the ids issued so far.

    The base id is the slugified ``"<name>-<country_code>"``. The first home
    with a given base id receives it unchanged; every later home with the same
    base id gets ``-1``, ``-2``, ... appended, tracked in ``home_ids``.
    """
    base_id = slugify.slugify("-".join((home["name"], home["country_code"])))
    if base_id not in home_ids:
        # First occurrence: register the base id and use it as-is.
        home_ids[base_id] = 0
        return base_id
    home_ids[base_id] += 1
    return f"{base_id}-{home_ids[base_id]}"

def _generate_areas(home_yaml: str, attempts: int = 3) -> Any:
    """Ask the model for the areas of one home, retrying on unusable replies.

    Args:
        home_yaml: YAML document describing the home, sent as the model input.
        attempts: Maximum number of completions to request.

    Returns:
        The value stored under the reply's top-level "Output" key, or None
        when no attempt produced a YAML mapping with a non-null "Output".
    """
    for _ in range(attempts):
        response = model.complete(AREA_DEVICES_PROMPT, home_yaml)
        try:
            response_obj = yaml.safe_load(response)
        except yaml.YAMLError:
            continue
        # safe_load yields None for an empty reply and a str/list for
        # non-mapping YAML; neither carries an "Output" key, so retry
        # instead of failing on `.get`.
        if not isinstance(response_obj, dict):
            continue
        areas = response_obj.get("Output")
        if areas is not None:
            return areas
    return None


# Number of homes for which no usable areas could be generated.
skipped = 0
with tqdm(total=len(homes)) as pbar:
    for home in homes:
        home_id = generate_home_id(home)
        batch_yaml = yaml.dump(home, explicit_start=True, sort_keys=False)
        areas = _generate_areas(batch_yaml)

        if areas is None:
            skipped += 1
        else:
            updated_home = home.copy()
            updated_home.update({"areas": areas})
            # Open the file only once there is content to write, so a failed
            # home never leaves an empty YAML file in the dataset directory.
            with open(AREAS_OUTPUT_DIR / f"{home_id}.yaml", "w") as area_output:
                area_output.write(yaml.dump(updated_home, explicit_start=True, sort_keys=False))
        pbar.set_description(f"Skipped {skipped}")
        pbar.update(1)


Skipped 0: 100%|██████████| 40/40 [00:18<00:00,  2.14it/s]


## Data Distributions

Validate the data and look at the generated data distributions. This gives us a chance to hand fix any individual
bad records as well.

In [16]:
from operator import itemgetter

# Load every generated home. Paths are sorted so the run (and the order of
# equally frequent areas in the ranking) does not depend on filesystem order.
# NOTE(review): these files contain model-generated content and yaml.Loader
# can construct arbitrary Python objects; consider yaml.safe_load here.
homes = []
invalid_paths = []
for path in sorted(AREAS_OUTPUT_DIR.glob("*.yaml")):
    record = yaml.load(path.read_text(), Loader=yaml.Loader)
    # An empty file loads as None and a failed generation may lack "areas";
    # set those aside for hand-fixing instead of crashing the statistics.
    if isinstance(record, dict) and record.get("areas") is not None:
        homes.append(record)
    else:
        invalid_paths.append(path)

devices_counts =  {}  # not used in this cell; kept so the name stays defined
total_homes = len(homes)
area_names_count = {}
total_areas = 0
for home in homes:
    for area in home["areas"]:
        total_areas += 1
        area_names_count[area] = area_names_count.get(area, 0) + 1

if invalid_paths:
    print(f"Files without usable areas ({len(invalid_paths)}): {[p.name for p in invalid_paths]}")

# Guard the average so an empty dataset reports 0.00 instead of dividing by zero.
average_areas = total_areas / total_homes if total_homes else 0.0

print(f"Total homes: {total_homes}")
print(f"Total areas: {total_areas} (average {average_areas:0.2f} per home)")
print(f"Total unique areas: {len(area_names_count)}")

# Top 15 area names by frequency. Each entry shows the raw count, the count
# relative to the number of homes, and the count relative to all areas.
sorted_dict = dict(sorted(area_names_count.items(), key=itemgetter(1), reverse=True))
areas_rank = [
    (name, f"{count} {(count / total_homes)*100:.0f}% {(count / total_areas)*100:.0f}%")
    for name, count in itertools.islice(sorted_dict.items(), 15)
]

print("Areas:")
areas_rank

Total homes: 40
Total areas: 312 (average 7.80 per home)
Total unique areas: 112
Areas:


[('Living Room', '31 78% 10%'),
 ('Bedroom', '30 75% 10%'),
 ('Kitchen', '25 62% 8%'),
 ('Guest Bedroom', '19 48% 6%'),
 ('Bathroom', '17 42% 5%'),
 ('Master Bedroom', '14 35% 4%'),
 ('Balcony', '11 28% 4%'),
 ('Dining Room', '7 18% 2%'),
 ('Backyard', '7 18% 2%'),
 ('Garden', '6 15% 2%'),
 ('Rooftop Terrace', '5 12% 2%'),
 ('Garage', '4 10% 1%'),
 ('Front Porch', '4 10% 1%'),
 ('Bedroom 2', '4 10% 1%'),
 ('Barn', '4 10% 1%')]