# In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
# Feature generation setup: folder locations, property selection and constants.

# Folder locations (each `...` is a placeholder to fill in before running)
path_LT_Pulses_Flag = ...  # Source folder of the LT HP Pulses Flag files generated by generate_power_data_for_LT_ASHP.ipynb
path_LT_Sparsity = ...  # Output folder for the LT ASHP Sparsity data
path_LT_Pulses = ...  # Output folder for the LT ASHP Pulses data

# Read the modified all_summary.csv; the relative path is resolved against
# the current working directory.
all_summary = pd.read_csv('all_summary.csv')

# Keep the rows flagged for the SPF analysis, then those whose installed
# heat pump is recorded as "ASHP".
all_summary = all_summary.loc[all_summary['Included_SPF_analysis'] == True]
all_summary_LT_ASHP = all_summary.loc[all_summary['HP_Installed'] == "ASHP"]

# Multiplier that turns a number of samples into minutes (it feeds the
# 'Length_Minutes' column computed further down).
granularity = 2

# One slot per selected property, filled with the maximum 'Heating_Pulses'
# value once that property's file has been read; slots stay at 0 otherwise.
power_level = np.zeros(all_summary_LT_ASHP.shape[0])

# Process each property
#
# For every selected property this reads "<Property_ID>.parquet" from
# path_LT_Pulses_Flag and writes two Parquet files:
#   * path_LT_Pulses:   one row per run of consecutive samples sharing the same
#                       'Heating_Pulse_On' value (runs never span two days);
#   * path_LT_Sparsity: one row per calendar day with that day's on-share and
#                       pulse statistics (all-NaN features for days without data).


def _segment_runs(flags):
    """Return ``(starts, lengths)`` of the runs of equal consecutive values.

    ``starts[k]`` is the index of the first element of run ``k`` and
    ``lengths[k]`` its number of elements. Index 0 always opens a run,
    whatever the first value is, so the lengths sum to ``len(flags)``.
    """
    flags = np.asarray(flags)
    if flags.size == 0:
        return np.array([], dtype=int), np.array([], dtype=int)
    # A new run begins wherever a value differs from its predecessor.
    is_start = np.concatenate(([True], flags[1:] != flags[:-1]))
    starts = np.flatnonzero(is_start)
    lengths = np.diff(np.concatenate((starts, [flags.size])))
    return starts, lengths


def _extract_pulses(day, minutes_per_sample, max_power):
    """Build the pulse rows for one day of household data.

    Each row is ``[timestamp, length_minutes, on, avg_power, energy,
    fullness]`` where ``timestamp`` and ``on`` are taken from the first sample
    of the run, ``energy`` is the average power multiplied by the run duration
    in hours, and ``fullness`` is the average power relative to ``max_power``
    (NaN when ``max_power`` is 0).
    """
    on_flags = day['Heating_Pulse_On']
    power = day['Heating_Pulses']
    timestamps = day['Timestamp']

    rows = []
    for start, length in zip(*_segment_runs(on_flags.values)):
        avg_power = power.iloc[start: start + length].mean()
        energy = avg_power * minutes_per_sample * length / 60
        fullness = avg_power / max_power if max_power != 0 else np.nan
        rows.append([
            timestamps.iloc[start],
            minutes_per_sample * length,
            on_flags.iloc[start],
            avg_power,
            energy,
            fullness,
        ])
    return rows


def _summarise_day(timestamp, day, day_pulses, max_power):
    """Build the sparsity row for one day from its samples and its pulses.

    ``day_pulses`` must contain only the rows produced for this day, so that
    the counts and duration statistics describe this day alone.
    """
    on_flags = day['Heating_Pulse_On']
    on_total = on_flags.sum()
    percentage_on = on_total / len(on_flags)

    # Share of the maximum power delivered while on; undefined when the heat
    # pump never ran that day or when the reference power is 0.
    if on_total > 0 and max_power != 0:
        percentage_max_power_on = day['Heating_Pulses'].sum() / (on_total * max_power)
    else:
        percentage_max_power_on = np.nan

    pulse_df = pd.DataFrame(
        day_pulses,
        columns=['Timestamp', 'Length_Minutes', 'On', 'AveragePower', 'Energy', 'Fullness'],
    )
    on_lengths = pulse_df.loc[pulse_df['On'] == 1, 'Length_Minutes']
    idle_count = int((pulse_df['On'] == 0).sum())
    has_on = not on_lengths.empty

    return [
        timestamp,
        percentage_on,
        percentage_max_power_on,
        len(on_lengths),
        idle_count,
        on_lengths.min() if has_on else np.nan,
        on_lengths.max() if has_on else np.nan,
        on_lengths.mean() if has_on else np.nan,
        on_lengths.median() if has_on else np.nan,
    ]


for i, property_id in enumerate(all_summary_LT_ASHP['Property_ID']):
    file_path = os.path.join(path_LT_Pulses_Flag, f"{property_id}.parquet")

    if not os.path.exists(file_path):
        # power_level[i] keeps its initial value for skipped properties.
        print(f"File not found: {file_path}")
        continue

    # Load household data
    household = pd.read_parquet(file_path)

    # Convert timestamp to datetime
    household['Timestamp'] = pd.to_datetime(household['Timestamp'])

    # Extract time-based attributes
    household['Year'] = household['Timestamp'].dt.year
    household['Month'] = household['Timestamp'].dt.month
    household['Day'] = household['Timestamp'].dt.day
    household['DayOfYear'] = household['Timestamp'].dt.dayofyear

    # Reference power of this property: the largest observed pulse value.
    power_level[i] = household['Heating_Pulses'].max()

    # Rows accumulated over the whole property
    sparsity_data = []
    pulses_data = []

    # Walk every calendar day between the first and last day of each year
    for year_iter in range(household['Year'].min(), household['Year'].max() + 1):
        household_year = household[household['Year'] == year_iter]

        # A year without any sample has no day range to iterate over.
        if household_year.empty:
            continue

        for dayofyear_iter in range(household_year['DayOfYear'].min(), household_year['DayOfYear'].max() + 1):
            household_day = household_year[household_year['DayOfYear'] == dayofyear_iter]

            timestamp = datetime(year_iter, 1, 1) + timedelta(days=dayofyear_iter - 1)

            if household_day.empty:
                # Keep a placeholder row so missing days remain visible.
                sparsity_data.append([timestamp] + [np.nan] * 8)
                continue

            # Pulses are collected per day first so the daily statistics are
            # not polluted by the pulses of previously processed days.
            day_pulses = _extract_pulses(household_day, granularity, power_level[i])
            pulses_data.extend(day_pulses)
            sparsity_data.append(
                _summarise_day(timestamp, household_day, day_pulses, power_level[i])
            )

    # Convert to DataFrames
    sparsity_df = pd.DataFrame(sparsity_data, columns=[
        'Time', 'Percentage_On', 'Percentage_Max_Power_On', 'Number_Of_Pulses', 'Number_Of_Idle',
        'Min_Pulse_Duration', 'Max_Pulse_Duration', 'Avg_Pulse_Duration', 'Med_Pulse_Duration'
    ])

    pulses_df = pd.DataFrame(pulses_data, columns=['Time', 'Length_Minutes', 'On', 'AveragePower', 'Energy', 'Fullness'])

    # Save to Parquet
    sparsity_output = os.path.join(path_LT_Sparsity, f"{property_id}.parquet")
    pulses_output = os.path.join(path_LT_Pulses, f"{property_id}.parquet")

    sparsity_df.to_parquet(sparsity_output, index=False)
    pulses_df.to_parquet(pulses_output, index=False)

    print(f"Processed and saved: {sparsity_output}, {pulses_output}")