# Synthetic Home

An experiment to use an LLM to generate synthetic home data.

In [2]:
import openai

from home_assistant_datasets.secrets import get_secret
from home_assistant_datasets import model_client

# Model identifier handed to the ModelClient wrapper below.
MODEL_ID = "gpt-3.5-turbo-0125"

# NOTE(review): this rebinds the name `openai` from the imported module to the
# client instance. After this line the module is no longer reachable through
# `openai` unless the import statement is executed again.
openai = openai.OpenAI(api_key=get_secret("openai_key"))
# Project wrapper built from the client instance and the model id; used by the
# later cells via `model.complete(...)`.
model = model_client.ModelClient(openai, MODEL_ID)

# Home Generation


In [8]:
import pathlib
import yaml

# Location of the hand-written seed homes used as examples in the prompt.
SEEDS_DIR = pathlib.Path("./seeds")
HOME_AREAS_FILE = SEEDS_DIR.joinpath("homes.yaml")

# Read the whole seed file and materialize the parsed entries as a list.
# NOTE(review): yaml.Loader is PyYAML's full loader; if the seed file only
# holds plain data, yaml.safe_load would be sufficient - confirm before
# switching.
_seed_yaml_text = HOME_AREAS_FILE.read_text()
seed_homes = list(yaml.load(_seed_yaml_text, Loader=yaml.Loader))

# Re-serialize the seeds, keeping the key order as loaded (sort_keys=False).
seed_home_prompt = yaml.dump(seed_homes, sort_keys=False)

In [9]:
# System prompt for home generation, passed as the first argument to
# `model.complete`. It is an f-string so that `seed_home_prompt` (the YAML dump
# of the seed homes) is embedded as the example output. The placeholder must
# use single braces: doubled braces are an escape inside an f-string and would
# put the literal text "{seed_home_prompt}" in the prompt instead of the
# examples.
HOME_PROMPT = f"""
You are generating synthetic data to used to train models for Home Assistant. You
use your knowledge about the world, geographies, demographics, and every day
life to generate synthetic home information (whether not directly relevant to
smart home automation).

For example, you might know these types of patterns:
- a single person in US may have a different home than a family in FR.
- a home in the US might have a garage, a home in Europe might have a garden.
- a studio apartment might not have a dining room.
- a family house might have a back yard
- a person living in a high rise might not have a backyard.

Remember that synetic data should not contain cliches or be for super wealthy,
but instead represent the full specrum of the population home and lifestyles.


Example yaml output:

{seed_home_prompt}

The user will ask you to generate a home data.
"""

In [11]:
# Distribution from Home Assistant Analytics

# (country code, percentage) pairs. Only the country code is used by the
# generation loop below; the percentage is kept for reference.
countries = [
    ("US", "17%"),
    ("DE", "14%"),
    ("NL", "6%"),
    ("GB", "6%"),
    ("FR", "5%"),
    ("CN", "4%"),
    ("IT", "4%"),
    ("RU", "3%"),
    ("ES", "3%"),
    ("AU", "3%"),
    ("PL", "3%"),
    ("SE", "3%"),
    ("CA", "3%"),
    ("BE", "2%"),
    ("DK", "2%"),
]

# Collect one (country code, model response) pair per country. Results are
# appended as they arrive, so a failure part-way through still leaves the
# earlier responses in `output`.
output = []
for country_code, _percentage in countries:
    user_request = (
        f"Please generate a description of 10 homes in {country_code} in yaml"
    )
    output.append((country_code, model.complete(HOME_PROMPT, user_request)))

In [12]:
import yaml
import re
import pathlib

# Directory that receives the generated dataset files.
DATASET_DIR = pathlib.Path("../datasets/")

# Pattern for a fenced yaml block; defined here but not used in this cell.
YAML_RE = "```yaml\n(.*?)\n```"

# Dump every response verbatim: the country code, immediately followed by the
# raw model response, then a newline.
_homes_path = DATASET_DIR / "homes.yaml"
with _homes_path.open("w") as f:
    for country_code, raw_response in output:
        f.writelines((country_code, raw_response, "\n"))

In [32]:
# One-off request to generate additional data for a specific country.

canada_request = "Please generate a description of 20 homes in Canada in yaml"
response = model.complete(HOME_PROMPT, canada_request)
print(response)

homes:
  - name: Maple Cottage
    country_code: "CA"
    location: "Rural area in Ontario"
    type: "Country home"
    amenities:
      - 4 bedrooms
      - 3 bathrooms
      - Large living room and kitchen area
      - Front porch and back deck
      - Detached garage
      - Workshop space

  - name: Lakeside Retreat
    country_code: "CA"
    location: "Cottage country in British Columbia"
    type: "Lakefront cabin"
    amenities:
      - 2 bedrooms
      - 1 bathroom
      - Cozy living room with fireplace
      - Deck overlooking the lake
      - Outdoor fire pit
      - Boat dock

  - name: City Condo
    country_code: "CA"
    location: "Urban area in Toronto"
    type: "Condominium"
    amenities:
      - 1 bedroom
      - 1 bathroom
      - Open concept living and dining area
      - Balcony with city views
      - Shared gym and rooftop terrace

  - name: Forest Hideaway
    country_code: "CA"
    location: "Wooded area in Quebec"
    type: "Cabin"
    amenities:
      - 3

In [57]:
# One-off request covering two countries in a single completion.
us_de_request = (
    "Please generate a description of 5 homes in the United States (US) "
    "and Germany (DE) in yaml"
)
response = model.complete(HOME_PROMPT, us_de_request)
print(response)

homes:
  - name: Home1
    country_code: "US"
    location: "Suburban area in California"
    type: "Single-family house"
    amenities:
      - 3 bedrooms
      - 2 bathrooms
      - Living room, dining room, and kitchen
      - Backyard with a patio
      - Attached garage
      - Home office

  - name: Home2
    country_code: "US"
    location: "Urban area in New York City"
    type: "Apartment"
    amenities:
      - 1 bedroom
      - 1 bathroom
      - Open concept living and kitchen area
      - Balcony with city views
      - Access to communal rooftop garden
      - Laundry room in building

  - name: Home3
    country_code: "US"
    location: "Rural area in Texas"
    type: "Farmhouse"
    amenities:
      - 4 bedrooms
      - 3 bathrooms
      - Spacious living room with fireplace
      - Large kitchen with farmhouse sink
      - Front porch with rocking chairs
      - Barn and chicken coop in backyard

  - name: Home4
    country_code: "DE"
    location: "Suburban area in Mu

# Measure Country Distribution

Compare the country distribution of the generated homes with the distribution reported by Home Assistant Analytics.

In [58]:

import itertools
from collections import Counter

# Read the homes dataset from the dataset directory.
with open(DATASET_DIR / "homes.yaml", "r") as f:
    content = f.read()

# Parse the yaml content and count the number of homes in each country code.
# Counter is a dict subclass that keeps first-seen key order, so iteration
# order matches the order in which country codes appear in the file.
data = yaml.safe_load(content)
country_codes = Counter(home["country_code"] for home in data["homes"])
total = sum(country_codes.values())

# Share of homes per country as a whole-number percentage string, limited to
# the first 15 country codes encountered.
country_distribution = [
    (code, f"{(count / total) * 100:.0f}%")
    for code, count in itertools.islice(country_codes.items(), 15)
]

print(country_distribution)


[('US', '17%'), ('DE', '11%'), ('NL', '7%'), ('GB', '7%'), ('FR', '7%'), ('CN', '5%'), ('IT', '5%'), ('RU', '5%'), ('ES', '5%'), ('AU', '5%'), ('PL', '5%'), ('SE', '5%'), ('CA', '5%'), ('BE', '5%'), ('DK', '5%')]
