In [1]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import random

# Seed both the NumPy and stdlib RNGs so repeated runs produce identical data.
# NOTE: every np.random call below draws from the same global stream, so the
# order of the column-generation statements determines the generated values.
np.random.seed(42)
random.seed(42)

# User-defined variables for customization
# Adjust these variables to generate different data patterns
start_date = '2024-01-01'  # Start date for the data
num_days = 1825            # Number of days (e.g., 1825 for 5 years of daily data)

# Paths to save generated CSV files (modify as needed)
output_daily_path = './data/synthetic_daily_data.csv'
output_hourly_path = './data/synthetic_hourly_data.csv'

# Ensure output directories exist.
# os.path.dirname() returns '' for a bare filename (e.g. 'daily.csv') and
# os.makedirs('') raises FileNotFoundError, so only create a directory when
# the configured path actually contains one.
for _output_path in (output_daily_path, output_hourly_path):
    _output_dir = os.path.dirname(_output_path)
    if _output_dir:
        os.makedirs(_output_dir, exist_ok=True)

# Step 1: Generate Date Range
# One timestamp per day, starting at start_date, for num_days days
date_range = pd.date_range(start=start_date, periods=num_days, freq='D')

# Step 2: Create DataFrame to hold daily data
# The dates serve as the index and are also stored as a regular 'Date' column,
# because the CSV is written with index=False and would otherwise lose them
data = pd.DataFrame(index=date_range)
data['Date'] = date_range

# Step 3: Generate Synthetic Data for Each Column
# Customize loc (mean) and scale (standard deviation) as needed for variability

# Demand data centered around 1000 with variability
data['Demand'] = np.random.normal(loc=1000, scale=100, size=len(date_range))

# Supply tracks Demand plus noise with mean 50, so it sits above Demand on average
data['Supply'] = data['Demand'] + np.random.normal(loc=50, scale=150, size=len(date_range))

# Waste Generation data centered around 50 with smaller variability
data['Waste_Generation'] = np.random.normal(loc=50, scale=10, size=len(date_range))

# Pricing: noise around 10 plus a linear trend rising from 0 to 50 over the period
data['Pricing'] = np.random.normal(loc=10, scale=5, size=len(date_range)) + np.linspace(0, 50, len(date_range))

# Economic Growth Rate with minor variability around a central growth rate
data['Economic_Growth_Rate'] = np.random.normal(loc=3, scale=0.5, size=len(date_range))

# Resource Availability with minor variability around 1100
data['Resource_Availability'] = np.random.normal(loc=1100, scale=10, size=len(date_range))

# Energy Prices: noise around 70 plus a linear trend rising from 0 to 20 over the period
data['Energy_Prices'] = np.random.normal(loc=70, scale=5, size=len(date_range)) + np.linspace(0, 20, len(date_range))

# Step 4: Save the Daily Data to CSV
# index=False drops the DatetimeIndex; the dates are kept via the 'Date' column
data.to_csv(output_daily_path, index=False)
print("Daily data with improved variability generated and saved successfully at:", output_daily_path)


# Function to Generate Hourly Data based on Daily Data
# This function expands daily data into hourly data with slight variations for each hour
def generate_hourly_data(daily_data):
    """Expand daily records into hourly records with small random jitter.

    Every input row produces 24 output rows (hours 0-23 of that day). Each
    numeric value is the daily value plus an independent uniform offset drawn
    from NumPy's global RNG; the offset half-widths are listed in
    ``jitter_spec`` below.

    Random numbers are consumed in day -> hour -> column order, i.e. the same
    order as drawing one scalar per value in a nested day/hour loop.

    Args:
        daily_data: DataFrame with a datetime-like 'Date' column and numeric
            'Demand', 'Supply', 'Pricing', 'Waste_Generation',
            'Economic_Growth_Rate', 'Resource_Availability' and
            'Energy_Prices' columns.

    Returns:
        DataFrame with a fresh 0..N-1 index and the columns 'Date' followed
        by the numeric columns in the order above. An empty input yields an
        empty DataFrame that still carries those columns.

    Raises:
        KeyError: if any of the required columns is missing.
    """
    hours_per_day = 24

    # (column name, jitter half-width), in output-column and RNG-draw order.
    jitter_spec = [
        ('Demand', 10),
        ('Supply', 10),
        ('Pricing', 0.5),
        ('Waste_Generation', 2),
        ('Economic_Growth_Rate', 0.1),
        ('Resource_Availability', 5),
        ('Energy_Prices', 1),
    ]
    value_columns = [name for name, _ in jitter_spec]
    output_columns = ['Date'] + value_columns

    n_days = len(daily_data)
    if n_days == 0:
        # Keep the schema so downstream code (and the CSV header) stays intact.
        return pd.DataFrame(columns=output_columns)

    # One draw per (day, hour, column); the half-widths broadcast along the
    # last axis so each column gets its own range.
    half_widths = np.array([width for _, width in jitter_spec], dtype=float)
    jitter = np.random.uniform(
        -half_widths,
        half_widths,
        size=(n_days, hours_per_day, len(jitter_spec)),
    )

    # (n_days, 1, n_cols) + (n_days, 24, n_cols) -> repeat each day over 24 hours.
    daily_values = daily_data[value_columns].to_numpy(dtype=float)
    hourly_values = (daily_values[:, np.newaxis, :] + jitter).reshape(
        n_days * hours_per_day, len(jitter_spec)
    )

    # Timestamp of each hour = the day's 'Date' value + 0..23 hours.
    day_starts = daily_data['Date'].repeat(hours_per_day).reset_index(drop=True)
    hour_offsets = pd.Series(
        pd.to_timedelta(np.tile(np.arange(hours_per_day), n_days), unit='h')
    )

    hourly_data = pd.DataFrame(hourly_values, columns=value_columns)
    hourly_data.insert(0, 'Date', day_starts + hour_offsets)
    return hourly_data

# Step 5: Generate and Save Hourly Data to CSV
# Expand the daily frame into hourly rows, then write them out without the
# index column (the timestamps are already carried by the 'Date' column).
hourly_data = generate_hourly_data(daily_data=data)
hourly_data.to_csv(
    output_hourly_path,
    index=False,
)
print(
    "Hourly data with variability generated and saved successfully at:",
    output_hourly_path,
)


Daily data with improved variability generated and saved successfully at: ./data/synthetic_daily_data.csv
Hourly data with variability generated and saved successfully at: ./data/synthetic_hourly_data.csv
