# Generate synthetic household device data

In [None]:
import random
import json
from datetime import datetime, timedelta

# Device labels emitted for every sample, in output key order
# ('Air_conditioner' has been removed from this list).
devices = [
    'Ipad',
    'Mac',
    'Floor_lamp',
    'Alexa',
    'Speaker',
    'TV3',
    'Microwave',
    'Laptop charger1',
    'Laptop charger2',
    'TV1',
    'TV2',
    'HVAC',
    'Fridge',
    'Dishwasher',
    'Dryer',
    'Washer',
    'Monitor',
    'Playstation',
    'Fan',
    'Aspiradora',
    'Air fryer',
    'Kettle',
    'Rice cooker oven',
    'Hot water heater',
]

# Define seasons based on month
def get_season(date):
    """Return the season name ('Winter', 'Spring', 'Summer' or 'Fall') for a date.

    Seasons are whole-month blocks: Dec-Feb is Winter, Mar-May is Spring,
    Jun-Aug is Summer and Sep-Nov is Fall.
    """
    season_names = ('Winter', 'Spring', 'Summer', 'Fall')
    # Folding December onto 0 makes every season a contiguous run of three
    # month indices, so integer division by 3 selects the season.
    return season_names[(date.month % 12) // 3]

# Check if a day is weekend
def is_weekend(date):
    """Return True when the given date falls on a Saturday or Sunday."""
    # isoweekday() numbers Monday..Sunday as 1..7, so 6 and 7 are the weekend.
    return date.isoweekday() > 5

# Washer schedule cache: maps (year, week number) to the list of weekdays
# (0=Monday, ... 6=Sunday) on which the washer runs in that week.
washer_schedule = {}


def assign_washer_days(year, week):
    """Return the two weekdays on which the washer runs in the given week.

    The first call for a (year, week) pair draws two distinct random weekdays
    and stores them in ``washer_schedule``; later calls for the same pair
    return the stored list, so the schedule is stable within a week.

    Args:
        year: Year the week belongs to. It is part of the cache key so that
            the same week number in different years gets its own schedule.
        week: Week number within that year.

    Returns:
        A list of two distinct integers in 0..6 (0=Monday, 6=Sunday).
    """
    key = (year, week)
    if key not in washer_schedule:
        # random.sample guarantees the two weekdays are different.
        washer_schedule[key] = random.sample(range(7), 2)
    return washer_schedule[key]

# Simulated period: one sample every 10 minutes from start_time (inclusive)
# up to end_time (exclusive in the generation loop).
start_time = datetime(2023, 11, 21)
end_time = datetime(2024, 11, 21)
time_interval = timedelta(minutes=10)

# Dishwasher schedule
DISHWASHER_START_HOUR = 20  # 8 PM
DISHWASHER_DURATION = timedelta(hours=1)  # Runs for 1 hour

# Meal times used for the rice cooker (20 minutes before lunch and dinner)
LUNCH_TIME = datetime(1900, 1, 1, 12, 0).time()
DINNER_TIME = datetime(1900, 1, 1, 19, 0).time()
RICE_COOKER_DURATION = timedelta(minutes=20)

# Floor lamp on-hours
FLOOR_LAMP_START = 18  # 6 PM
FLOOR_LAMP_END = 23    # 11 PM

# Appliance run-state tracking: nothing is running at the start, so no
# end time is set yet.
dishwasher_running = washer_running = dryer_running = False
dishwasher_end_time = washer_end_time = dryer_end_time = None

# Cursor advanced by the generation loop
current_time = start_time

# Output JSON Lines file path
output_file_path = 'synthetic_household_data.jsonl'

def _on_probability():
    """Return a random "device is on" probability in [0.8, 1.0], rounded to 6 decimals."""
    return round(random.uniform(0.8, 1.0), 6)


def _in_rice_cooker_window(moment):
    """Return True when moment lies in the RICE_COOKER_DURATION before lunch or dinner."""
    for meal_time in (LUNCH_TIME, DINNER_TIME):
        meal_start = datetime.combine(moment.date(), meal_time)
        if meal_start - RICE_COOKER_DURATION <= moment < meal_start:
            return True
    return False


# Generate one JSON line per time step and stream it to the output file.
# Devices that are "off" keep probability 0.0; devices that are "on" get a
# random probability from _on_probability().
with open(output_file_path, 'w', encoding='utf-8') as outfile:
    while current_time < end_time:
        season = get_season(current_time)
        # Use the ISO year together with the ISO week number; the calendar
        # year can differ from it around New Year.
        iso_year, week_number, _ = current_time.isocalendar()
        day_of_week = current_time.weekday()  # 0=Monday, 6=Sunday
        hour = current_time.hour
        minute = current_time.minute

        # Washer days for this week, and whether today is one of them
        washer_days = assign_washer_days(iso_year, week_number)
        is_washer_day = day_of_week in washer_days

        # Every device starts as "off"
        label_probabilities = {device: 0.0 for device in devices}

        # HVAC usage
        if season in ('Winter', 'Summer'):
            # HVAC nearly always on
            label_probabilities['HVAC'] = _on_probability()
        elif random.random() < 0.7:
            # Spring/Fall: on for roughly 70% of the samples.
            # NOTE(review): the previous comment said "30% chance" although the
            # threshold is 0.7; the threshold is kept unchanged - confirm the
            # intended rate.
            label_probabilities['HVAC'] = _on_probability()

        # Fridge is always on but cycles; simulate with high probability
        label_probabilities['Fridge'] = _on_probability()

        # TVs on in the evening (6 PM to 11 PM)
        if 18 <= hour < 23:
            label_probabilities['TV3'] = _on_probability()
            label_probabilities['TV1'] = _on_probability()
            label_probabilities['TV2'] = _on_probability()

        # Floor lamp on from 6 PM to 11 PM
        if FLOOR_LAMP_START <= hour < FLOOR_LAMP_END:
            label_probabilities['Floor_lamp'] = _on_probability()

        # Laptop chargers on 8-10 AM and 6-10 PM
        if 8 <= hour < 10 or 18 <= hour < 22:
            label_probabilities['Laptop charger1'] = _on_probability()
            label_probabilities['Laptop charger2'] = _on_probability()

        # Alexa and Speaker active in the evening
        if 18 <= hour < 23:
            label_probabilities['Alexa'] = _on_probability()
            label_probabilities['Speaker'] = _on_probability()

        # Fan active whenever the HVAC is on
        if label_probabilities['HVAC'] > 0:
            label_probabilities['Fan'] = _on_probability()

        # Aspiradora (vacuum cleaner) active on weekends, early morning
        if is_weekend(current_time) and 6 <= hour < 9:
            label_probabilities['Aspiradora'] = _on_probability()

        # Playstation active in the evening
        if 19 <= hour < 23:
            label_probabilities['Playstation'] = _on_probability()

        # Monitor active during laptop usage
        if label_probabilities['Laptop charger1'] > 0 or label_probabilities['Laptop charger2'] > 0:
            label_probabilities['Monitor'] = _on_probability()

        # Microwave usage: active randomly during meal times
        if (11 <= hour < 13) or (18 <= hour < 20):
            if random.random() < 0.3:  # 30% chance to be used
                label_probabilities['Microwave'] = _on_probability()

        # Air fryer usage: similar to microwave
        if (11 <= hour < 13) or (18 <= hour < 20):
            if random.random() < 0.2:  # 20% chance to be used
                label_probabilities['Air fryer'] = _on_probability()

        # Kettle active during morning and evening
        if (6 <= hour < 9) or (17 <= hour < 20):
            if random.random() < 0.5:  # 50% chance to be used
                label_probabilities['Kettle'] = _on_probability()

        # Rice cooker oven active for the 20 minutes before lunch and dinner.
        # (The earlier check of `minute >= 40` in the meal hour itself turned
        # it on 40 minutes *after* the meal started.)
        if _in_rice_cooker_window(current_time):
            label_probabilities['Rice cooker oven'] = _on_probability()

        # Dishwasher runs every night at 8 PM for 1 hour
        if hour == DISHWASHER_START_HOUR and minute == 0:
            dishwasher_running = True
            dishwasher_end_time = current_time + DISHWASHER_DURATION
            label_probabilities['Dishwasher'] = _on_probability()
        elif dishwasher_running:
            if current_time < dishwasher_end_time:
                label_probabilities['Dishwasher'] = _on_probability()
            else:
                dishwasher_running = False

        # Washer runs on washer days at 10 AM for 1 hour; the dryer takes over
        # on the first sample at or after the washer's end time.
        if is_washer_day and hour == 10 and minute == 0:
            washer_running = True
            washer_end_time = current_time + timedelta(hours=1)
            label_probabilities['Washer'] = _on_probability()
        elif washer_running:
            if current_time < washer_end_time:
                label_probabilities['Washer'] = _on_probability()
            else:
                washer_running = False
                # Dryer starts immediately after washer ends, runs for 1 hour
                dryer_running = True
                dryer_end_time = current_time + timedelta(hours=1)
                label_probabilities['Dryer'] = _on_probability()
        elif dryer_running:
            if current_time < dryer_end_time:
                label_probabilities['Dryer'] = _on_probability()
            else:
                dryer_running = False

        # Hot water heater always on
        label_probabilities['Hot water heater'] = _on_probability()

        # Create entry
        entry = {
            "timestamp": current_time.strftime("%Y-%m-%dT%H:%M:%S.%f"),
            "label_probabilities": label_probabilities
        }

        # Write the JSON object as a single line
        outfile.write(json.dumps(entry) + '\n')

        # Increment current time
        current_time += time_interval

        # Print progress each time the cursor reaches the start of a month
        if current_time.day == 1 and current_time.hour == 0 and current_time.minute == 0:
            print(f"Generated data up to {current_time.strftime('%Y-%m-%d')}")

print(f"Data generation complete. Saved to {output_file_path}")


# Print the first 10 entries of the generated file to confirm.
# The entries are streamed to disk rather than kept in memory, so read them
# back from the JSON Lines file.
with open(output_file_path, 'r') as preview_file:
    for preview_index, preview_line in enumerate(preview_file):
        if preview_index >= 10:
            break
        print(json.dumps(json.loads(preview_line), indent=4))


Generated data up to 2023-12-01
Generated data up to 2024-01-01
Generated data up to 2024-02-01
Generated data up to 2024-03-01
Generated data up to 2024-04-01
Generated data up to 2024-05-01
Generated data up to 2024-06-01
Generated data up to 2024-07-01
Generated data up to 2024-08-01
Generated data up to 2024-09-01
Generated data up to 2024-10-01
Generated data up to 2024-11-01
Data generation complete. Saved to synthetic_household_data.jsonl
{
    "timestamp": "2023-11-21T00:00:00.000000",
    "label_probabilities": {
        "Ipad": 0.0,
        "Mac": 0.0,
        "Floor_lamp": 0.0,
        "Alexa": 0.0,
        "Speaker": 0.0,
        "TV1": 0.0,
        "Microwave": 0.0,
        "Laptop charger1": 0.0,
        "Laptop charger2": 0.0,
        "TV2": 0.0,
        "TV3": 0.0,
        "HVAC": 0.970354,
        "Fridge": 0.839049,
        "Dishwasher": 0.0,
        "Dryer": 0.0,
        "Washer": 0.0,
        "Monitor": 0.0,
        "Playstation": 0.0,
        "Fan": 0.954577,
     

# Upload data to DynamoDB

In [3]:
import boto3
import json
import logging
import sys
from decimal import Decimal  # Import Decimal for DynamoDB compatibility

# Configure the root logger: INFO and above, prefixed with time and level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


def load_config(config_path='config.json'):
    """
    Read AWS credentials and DynamoDB settings from a JSON config file.

    The file is expected to hold an 'aws' object (access_key_id,
    secret_access_key, region_name) and a 'dynamodb' object (table_name,
    file_name). All five values must be present and non-empty.

    Returns a dict with the keys 'access_key', 'secret_key', 'region',
    'table_name' and 'file_name'.

    If the file is missing, is not valid JSON, or lacks a required value,
    the problem is logged and the process exits with status 1.
    """
    try:
        with open(config_path, 'r') as file:
            raw = json.load(file)

        aws_section = raw.get('aws', {})
        dynamo_section = raw.get('dynamodb', {})

        settings = {
            'access_key': aws_section.get('access_key_id'),
            'secret_key': aws_section.get('secret_access_key'),
            'region': aws_section.get('region_name'),
            'table_name': dynamo_section.get('table_name'),
            'file_name': dynamo_section.get('file_name'),
        }

        if any(not value for value in settings.values()):
            raise ValueError("Missing AWS or DynamoDB configuration in config.json.")

        return settings
    except FileNotFoundError:
        logging.error(f"Configuration file {config_path} not found.")
        sys.exit(1)
    # JSONDecodeError is a ValueError subclass, so it must be handled first.
    except json.JSONDecodeError as e:
        logging.error(f"Error parsing {config_path}: {e}")
        sys.exit(1)
    except ValueError as ve:
        logging.error(f"Configuration error: {ve}")
        sys.exit(1)


def convert_float_to_decimal(item):
    """
    Return a copy of item with every float replaced by a Decimal.

    Dicts and lists are walked recursively; a float is converted through its
    string form. Any other value is returned unchanged.
    """
    if isinstance(item, float):
        # Going through str() keeps the short printed form (e.g. 0.1)
        # instead of the float's exact binary expansion.
        return Decimal(str(item))
    if isinstance(item, dict):
        return {key: convert_float_to_decimal(value) for key, value in item.items()}
    if isinstance(item, list):
        return list(map(convert_float_to_decimal, item))
    return item


def upload_to_dynamodb(config):
    """
    Upload data from a JSON Lines file to DynamoDB using batch_writer().

    Args:
        config: dict with the keys 'region', 'access_key', 'secret_key',
            'table_name' and 'file_name' (the shape returned by load_config).

    Each non-blank line is parsed as one JSON object with floats read as
    Decimal. Lines that are not valid JSON, are not JSON objects, or lack a
    string 'timestamp' are logged and skipped. The final log lines report how
    many items were submitted and how many lines were skipped or failed.

    If the input file is missing, or an error escapes the per-line handling,
    the error is logged and the process exits with status 1.
    """
    dynamodb = boto3.resource(
        'dynamodb',
        region_name=config['region'],
        aws_access_key_id=config['access_key'],
        aws_secret_access_key=config['secret_key']
    )
    table = dynamodb.Table(config['table_name'])
    file_name = config['file_name']

    submitted = 0
    skipped = 0

    try:
        with open(file_name, 'r', encoding='utf-8') as file:
            with table.batch_writer() as batch:
                for line_number, line in enumerate(file, start=1):
                    # Blank lines (e.g. a trailing newline) are not data and
                    # not an error.
                    if not line.strip():
                        continue

                    try:
                        # Parse the line as JSON, reading floats as Decimal
                        item = json.loads(line, parse_float=Decimal)
                    except json.JSONDecodeError as e:
                        logging.error(f"JSON decode error on line {line_number}: {e}")
                        skipped += 1
                        continue

                    # Validate the item shape and its 'timestamp'
                    if not isinstance(item, dict):
                        logging.warning(f"Line {line_number}: item is not a JSON object. Skipping item.")
                        skipped += 1
                        continue
                    if 'timestamp' not in item:
                        logging.warning(f"Line {line_number}: 'timestamp' field missing. Skipping item.")
                        skipped += 1
                        continue
                    if not isinstance(item['timestamp'], str):
                        logging.warning(f"Line {line_number}: 'timestamp' is not a string. Skipping item.")
                        skipped += 1
                        continue

                    try:
                        # Convert any remaining float types in 'label_probabilities' to Decimal
                        if 'label_probabilities' in item:
                            item['label_probabilities'] = convert_float_to_decimal(item['label_probabilities'])

                        # Add the item to the batch
                        batch.put_item(Item=item)
                    except Exception as e:
                        # Best effort: keep going, but record the failure so
                        # the final summary does not report a clean upload.
                        # NOTE(review): batch_writer buffers items, so an error
                        # raised here may concern items from earlier lines -
                        # confirm against the boto3 documentation.
                        logging.error(f"Unexpected error on line {line_number}: {e}")
                        skipped += 1
                        continue

                    submitted += 1
                    if line_number % 10000 == 0:
                        logging.info(f"Processed {line_number} items...")

        if skipped:
            logging.warning(f"{skipped} line(s) were skipped or failed; see the messages above.")
        logging.info(f"Data upload complete. {submitted} item(s) submitted.")

    except FileNotFoundError:
        logging.error(f"JSON file {file_name} not found.")
        sys.exit(1)
    except Exception as e:
        logging.error(f"An error occurred during upload: {e}")
        sys.exit(1)


# Entry point: load the settings from config.json, then run the upload.
if __name__ == "__main__":
    config = load_config(config_path='config.json')
    upload_to_dynamodb(config=config)


2024-11-22 19:32:05,297 - INFO - Processed 10000 items...
2024-11-22 19:32:26,397 - INFO - Processed 20000 items...
2024-11-22 19:32:50,262 - INFO - Processed 30000 items...
2024-11-22 19:33:10,650 - INFO - Processed 40000 items...
2024-11-22 19:33:30,077 - INFO - Processed 50000 items...
2024-11-22 19:33:35,293 - INFO - Data upload complete.
