In [4]:
import os

import openai
from home_assistant_datasets.secrets import get_secret
from home_assistant_datasets import model_client

# Identifier of the OpenAI model used for every generation request.
MODEL_ID = "gpt-3.5-turbo-0125"
# Must be set before get_secret() is called below.
# NOTE(review): presumably get_secret() reads the file named by SECRETS_FILE — confirm.
os.environ['SECRETS_FILE'] = '../secrets.yaml'

# Bind the client to its own name rather than rebinding `openai`, so the
# imported module stays usable and the cell can be re-run in any order.
openai_client = openai.OpenAI(api_key=get_secret("openai_api_key"))
model = model_client.ModelClient(openai_client, MODEL_ID)

# Read seed data

The seed data for the prompt is manually curated and serves as the set of examples for further synthetic data generation.

In [13]:
import pathlib
import yaml

# Location of the hand-written seed examples.
SEEDS_DIR = pathlib.Path("./seeds/")
ANOMALIES_FILE = SEEDS_DIR / "anomalies.yaml"

# The seeds file may hold several YAML documents; load them all.
# NOTE(review): yaml.Loader can construct arbitrary Python objects. That is
# acceptable only while the seeds file is trusted; consider yaml.SafeLoader
# if the file never needs custom tags.
with open(ANOMALIES_FILE, encoding="utf-8") as f:
    anomaly_dataset = list(yaml.load_all(f, Loader=yaml.Loader))

# Flatten the per-document lists into one pool of examples per class.
normal = []
anomaly = []
for ds in anomaly_dataset:
    # An empty document loads as None, and a document may not be a mapping;
    # neither carries seed lists, so skip it instead of raising.
    if not isinstance(ds, dict):
        continue
    # `or []` tolerates a key that is present but has an empty value.
    normal.extend(ds.get('normal') or [])
    anomaly.extend(ds.get('anomaly') or [])


In [23]:
# Instruction prompt handed to the model on every generation request.
# The text has no replacement fields, so it is written as a plain string
# literal; the resulting value is unchanged.
ANAMOLIES_PROMPT = """
You are generating synthetic data to used to train models for Home Assistant
and used to evaluate things like generating a summary, performing home automation
actions, or for generating other synthetic data.

The data should describe the current state of an area in a home and be classified as
"Normal" or "Anomaly."

Generate a balanced dataset with a significant portion of normal entries and a smaller
portion of anomalies. Anomalies should cover a variety of scenarios related to security,
climate control, appliances, sensors, and unusual activity. Consider including different
combinations of sensor and device states for richer data. Ensure the data reflects realistic
variations in sensor readings based on factors like time of day and season.

The user will give you some examples classifications and you generate more.
"""

In [29]:
from tqdm.auto import tqdm
from typing import Any
import itertools

import yaml
import pathlib
import random
import shutil
import slugify

# Number of generation rounds; each round writes one anomalies-<i>.yaml file.
N_DATAPOINTS = 10
# Intended number of normal / anomaly seed examples to sample per request.
NUM_NORMAL = 15
NUM_ANOMALY = 5
# Line prefixes that label an example as anomalous or normal.
ANOMALY = "Anomaly: "
NORMAL = "Normal: "

DATASET_DIR = pathlib.Path("../datasets/")
ANOMALY_OUTPUT_DIR = DATASET_DIR / "anomaly"

# Wipe anomaly output left over from a previous run, then recreate the
# directory. parents=True also creates DATASET_DIR when it does not exist
# yet, so the cell works on a fresh checkout.
shutil.rmtree(ANOMALY_OUTPUT_DIR, ignore_errors=True)
ANOMALY_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)


def make_seed_items() -> str:
    """Build the labelled seed examples for one generation request.

    Samples NUM_NORMAL entries from ``normal`` and NUM_ANOMALY entries from
    ``anomaly`` (with replacement), prefixes each with its own class label,
    shuffles them together and joins them one per line.

    Returns:
        A newline-separated string of ``"Normal: ..."`` / ``"Anomaly: ..."``
        lines in random order.

    Raises:
        IndexError: if either seed pool is empty (raised by random.choices).
    """
    items = [
        # Each example carries the prefix of the pool it was drawn from.
        *[
            f"{NORMAL}{item}"
            for item in random.choices(normal, k=NUM_NORMAL)
        ],
        *[
            f"{ANOMALY}{item}"
            for item in random.choices(anomaly, k=NUM_ANOMALY)
        ],
    ]
    # Mix the classes so the examples are not grouped by label.
    random.shuffle(items)
    return "\n".join(items)


# Maximum number of model requests per output file before giving up.
MAX_ATTEMPTS = 3


def _split_labeled_lines(response):
    """Split a model response into labelled summaries.

    Each non-blank line is expected to start with the NORMAL or ANOMALY
    prefix. The prefix is removed by slicing: ``str.lstrip`` takes a set of
    characters, not a prefix, and would also eat the leading letters of the
    summary itself.

    Returns:
        A ``(normal_lines, anomaly_lines, unlabeled_count)`` tuple, where
        ``unlabeled_count`` is the number of non-blank lines with neither prefix.
    """
    normal_lines = []
    anomaly_lines = []
    unlabeled = 0
    for raw_line in response.splitlines():
        line = raw_line.strip()
        if not line:
            # Blank separator lines are not malformed entries.
            continue
        if line.startswith(ANOMALY):
            anomaly_lines.append(line[len(ANOMALY):])
        elif line.startswith(NORMAL):
            normal_lines.append(line[len(NORMAL):])
        else:
            unlabeled += 1
    return normal_lines, anomaly_lines, unlabeled


# Running total of response lines that carried no recognised label.
skipped = 0
with tqdm(total=N_DATAPOINTS) as pbar:
    for i in range(N_DATAPOINTS):
        normal_response = []
        anomaly_response = []
        unlabeled = 0
        # Ask again (with fresh seeds) only when a response has no usable line.
        for _attempt in range(MAX_ATTEMPTS):
            seeds = make_seed_items()
            # Send the seed examples as the second argument; the prompt text
            # has no placeholder for them, so they must travel separately.
            # NOTE(review): assumes complete(prompt, user_message) — confirm
            # against ModelClient.complete.
            response = model.complete(ANAMOLIES_PROMPT, seeds)
            normal_response, anomaly_response, unlabeled = _split_labeled_lines(response)
            if normal_response or anomaly_response:
                break
        skipped += unlabeled

        update = {
            "normal": normal_response,
            "anomaly": anomaly_response,
        }
        # Open the file only once there is something to write, so a failed
        # request does not leave an empty file behind.
        with open(ANOMALY_OUTPUT_DIR / f"anomalies-{i}.yaml", "w") as output:
            output.write(yaml.dump(update, explicit_start=True, sort_keys=False))

        pbar.set_description(f"Skipped {skipped}")
        pbar.update(1)

Skipped 641: 100%|██████████| 40/40 [06:32<00:00,  9.82s/it]


# Merge intermediate results

In [15]:
# Collect every summary from the per-round files into one flat list of
# {"summary", "label"} records.
records = []
for shard_path in ANOMALY_OUTPUT_DIR.glob("anomalies-*.yaml"):
    shard_docs = list(yaml.load_all(shard_path.read_bytes(), Loader=yaml.Loader))
    for shard_doc in shard_docs:
        # Same order as before: all normal entries of a document, then all
        # of its anomaly entries.
        for label in ("normal", "anomaly"):
            records.extend(
                {"summary": summary, "label": label}
                for summary in shard_doc.get(label, [])
            )

# Mix the two classes before writing the merged dataset.
random.shuffle(records)
update = {"records": records}
merged_path = ANOMALY_OUTPUT_DIR / "anomalies.yaml"
with merged_path.open("w") as output:
    output.write(yaml.dump(update, explicit_start=True, sort_keys=False))