# Function Definitions

In [2]:
import pandas as pd
import numpy as np
import tqdm 
from itertools import product
from pathlib import Path


def list_available_columns(df):
    print(f"Dimensions of data: {df.shape}")
    print(f'optional_cols = [')
    for col in df.columns.to_list():
        print(f"\t\'{col}\',")
    print("\t]")
    
    return None


def take_differences_over_columns(simulation_results, base_scenario, columns_to_take_difference):
    base_data = simulation_results[simulation_results['Weather Scenario'] == base_scenario].copy(deep=True)
    base_data = base_data.sort_values(by=['bldg_id', 'Year', 'Month']).reset_index(drop=True)

    differences = []
    simulation_results = simulation_results.copy(deep=True)
    # for each scenario in senarios minus historical 
    for scenario in list(set(simulation_results['Weather Scenario'].unique()) - set([base_scenario])):
        scenario_data = simulation_results[simulation_results['Weather Scenario'] == scenario].copy(deep=True)
        scenario_data = scenario_data.sort_values(by=['bldg_id', 'Year', 'Month']).reset_index(drop=True)

        # print(scenario)
        # print(scenario_data["Cost Heating:FuelOilNo2 [$](Monthly)"])
        # print(base_data["Cost Heating:FuelOilNo2 [$](Monthly)"])

        scenario_data[columns_to_take_difference] = scenario_data[columns_to_take_difference] - base_data[columns_to_take_difference]

        if len(scenario_data) != len(base_data): 
            print(f"len(scenario_data), len(base_data): {len(scenario_data)}, {len(base_data)}")
            raise RuntimeError(f"Climate scenario {scenario} and base have different number of house-years (rows)")
        
        # print(scenario_data["Cost Heating:FuelOilNo2 [$](Monthly)"])

        differences.append(scenario_data)

    differences = pd.concat(differences)
    return differences

# Join buildstock metadata onto the simulation results 
### Choose which metadata fields to keep 

In [3]:
# join buildstock metadata onto the simulation results 
buildstock_file = Path("/Users/camilotoruno/Documents/local_research_data/buildings_LA_Detroit/buildstock.csv")
buildstock = pd.read_csv(buildstock_file)

print("Buildstock available columns:")
list_available_columns(buildstock)

Buildstock available columns:
Dimensions of data: (722, 158)
optional_cols = [
	'bldg_id',
	'upgrade',
	'weight',
	'applicability',
	'in.sqft',
	'in.ahs_region',
	'in.ashrae_iecc_climate_zone_2004',
	'in.ashrae_iecc_climate_zone_2004_2_a_split',
	'in.bathroom_spot_vent_hour',
	'in.bedrooms',
	'in.building_america_climate_zone',
	'in.cec_climate_zone',
	'in.ceiling_fan',
	'in.census_division',
	'in.census_division_recs',
	'in.census_region',
	'in.city',
	'in.clothes_dryer',
	'in.clothes_washer',
	'in.clothes_washer_presence',
	'in.cooking_range',
	'in.cooling_setpoint',
	'in.cooling_setpoint_has_offset',
	'in.cooling_setpoint_offset_magnitude',
	'in.cooling_setpoint_offset_period',
	'in.corridor',
	'in.county',
	'in.county_and_puma',
	'in.dehumidifier',
	'in.dishwasher',
	'in.door_area',
	'in.doors',
	'in.ducts',
	'in.eaves',
	'in.electric_vehicle',
	'in.emissions_electricity_folders',
	'in.emissions_electricity_units',
	'in.emissions_electricity_values_or_filepaths',
	'in.emissions_fos

### Keep desired columns of buildstock metadata
Copy and paste the available optional columns below, then delete any columns you don't want saved with the table

In [4]:
buildstock_keep_columns = [
	"bldg_id",
	"in.cec_climate_zone",
	"in.city",
	"in.cooling_setpoint",
	"in.federal_poverty_level",
	"in.heating_fuel",
	"in.heating_setpoint",
	"in.hvac_cooling_efficiency",
	"in.hvac_cooling_type",
	"in.hvac_heating_efficiency",
	"in.hvac_heating_type",
	"in.income",
	"in.state",
	"in.income_recs_2015",
	"in.income_recs_2020",
	"in.iso_rto_region",
	"in.location_region",
	"in.occupants",
	"in.schedules",
	"in.vintage",
	]

buildstock = buildstock[buildstock_keep_columns]

simulation_file = Path("/Users/camilotoruno/Documents/local_research_data/simulations_five_cities/results_summary.csv")
simulation_results = pd.read_csv(simulation_file)

print("Simulations results available columns:")
list_available_columns(simulation_results)
# print(len(simulation_results))

Simulations results available columns:
Dimensions of data: (1245600, 44)
optional_cols = [
	'Month',
	'Environment:Site Outdoor Air Drybulb Temperature [C](Monthly)',
	'Environment:Site Outdoor Air Wetbulb Temperature [C](Monthly)',
	'Heating:EnergyTransfer [J](Monthly)',
	'Heating:EnergyTransfer:Zone:CENTRAL AC AND FURNACE AIRLOOP RET AIR ZONE [J](Monthly)',
	'Heating:EnergyTransfer:Zone:LIVING SPACE [J](Monthly)',
	'Heating:NaturalGas [J](Monthly)',
	'Heating:Electricity [J](Monthly)',
	'Cooling:EnergyTransfer [J](Monthly)',
	'Cooling:EnergyTransfer:Zone:CENTRAL AC AND FURNACE AIRLOOP RET AIR ZONE [J](Monthly)',
	'Cooling:EnergyTransfer:Zone:LIVING SPACE [J](Monthly)',
	'Cooling:Electricity [J](Monthly)',
	'Electricity:Facility [J](Monthly)',
	'NaturalGas:Facility [J](Monthly)',
	'ElectricityPurchased:Facility [J](Monthly)',
	'ElectricitySurplusSold:Facility [J](Monthly)',
	'ElectricityNet:Facility [J](Monthly)',
	'bldg_id',
	'Year',
	'Weather Scenario',
	'Cooling:EnergyTransfer:Zone

### Choose the columns from the simulation results to keep 
Copy and paste the available optional columns below, then delete any columns you don't want. You can reorganize the list of columns if you'd like to reorganize them. 

In [5]:
optional_cols = [
	"bldg_id",
	"Year",
	"Month",
	"Weather Scenario",

	"ElectricityPurchased:Facility [J](Monthly)",
	"ElectricitySurplusSold:Facility [J](Monthly)",
	"ElectricityNet:Facility [J](Monthly)",
	"Electricity:Facility [J](Monthly)",
	"NaturalGas:Facility [J](Monthly)",
	"DistrictHeating:Facility [J](Monthly)",
	"DistrictCooling:Facility [J](Monthly)",
	"Propane:Facility [J](Monthly)",
	"FuelOilNo2:Facility [J](Monthly)",
    
	"Heating:NaturalGas [J](Monthly)",
	"Heating:Electricity [J](Monthly)",
	"Heating:DistrictHeating [J](Monthly)",
	"Heating:Propane [J](Monthly)",
	"Heating:FuelOilNo2 [J](Monthly)",
    
	"Cooling:Electricity [J](Monthly)",
	"Cooling:DistrictCooling [J](Monthly)",
	]

integer_columns = [	
    "bldg_id",
	"Year",
	"Month",
    ]

simulation_results = simulation_results[optional_cols]
simulation_results['Month'] = simulation_results['Month'].str.strip()		# strip empty space from month strings

import calendar
month_dictionary = {month: index for index, month in enumerate(calendar.month_name) if month}	# create month name to number dictionary
if sum(simulation_results['Month'].isin(list(month_dictionary.keys()))) > 0:
    simulation_results['Month'] = simulation_results['Month'].map(month_dictionary)   			# map the month name to month number 
    

simulation_results[integer_columns] = simulation_results[integer_columns].astype(int)


### Join buildstock onto simulations

In [6]:
simulation_results = pd.merge(simulation_results, buildstock, how='left', on="bldg_id", 
                                  left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=None, indicator=False, validate=None)

# Cost Calculations
#### If pricing data not yet averaged and tabularized for joining onto simulation, perform that using pricing_data_cleaning.ipynb
#### Load data

### Join the pricing data onto simulation results 

In [7]:
class obj:
    def __init__(self) -> None:
        pass

files = obj()
files.pricing_directory = Path("/Volumes/seas-mtcraig/data_sharing/Energy Burdens Under Climate Change/Energy rates")
files.natural_gas = Path(files.pricing_directory, "Natural gas/natural_gas_pricing.csv")
files.fuel_oil_no2 = Path(files.pricing_directory, "Oil/Monthly_No._2_Heating_Oil_Residential_Price_interpolated.csv") 
files.elec = Path(files.pricing_directory, "Electricity/aggregate_data_By_(City,_Month).csv")
files.propane = Path(files.pricing_directory, "Propane/Monthly_Propane_Price_Interpolated.csv") 

propane_pricing = pd.read_csv(files.propane)
electricity_pricing = pd.read_csv(files.elec)
fuel_oil_no2_pricing = pd.read_csv(files.fuel_oil_no2)
natural_gas_pricing = pd.read_csv(files.natural_gas)


In [8]:
def join(left_data, right_data, how, left_on, right_on, suffixes = ('_l', '_r') ):
    before = left_data['in.city'].value_counts()
    data = pd.merge(left_data, right_data, how=how, left_on=left_on, right_on=right_on, 
                                  left_index=False, right_index=False, sort=False, suffixes=suffixes, copy=None, indicator=False, validate=None)
    after = data['in.city'].value_counts()

    if not after.equals(before): 
        print("Missing data. Data lost")
        print(f"Before: {before}")
        print(f"After: {after}")

        raise RuntimeError()

    return data 


# Join the electricity pricing on the simulation data 
simulation_results['in.city'] = simulation_results['in.city'].str.split(', ').str[1]

print("Joining Electricity Pricing")
simulation_results = join(simulation_results, electricity_pricing, how='inner', left_on=['in.city', 'Month'], right_on=['City', 'Month'])

# Join the natural gas pricing on the simulation data
print("Joining Natural Gas Pricing")
simulation_results = join(simulation_results, natural_gas_pricing, how='inner', left_on=['in.state', 'Month'], right_on=['State', 'Month'])

# Join the fuel_oil_no2_pricing pricing on the simulation data
print("Joining Fuel Oil Pricing")
simulation_results = join(simulation_results, fuel_oil_no2_pricing, how='inner', left_on=['Month'], right_on=['Month'])

# Join the propoane pricing on the simulation data 
print("Joining Propane Pricing")
"""
Propane pricing missing for California. Use US pricing data for California
"""
simulation_results.loc[ simulation_results['in.state'] == 'CA', 'in.state' ] = 'U.S.'   # change to US for joining US pricing on California 
simulation_results = join(simulation_results, propane_pricing, how='inner', left_on=['in.state', 'Month'], right_on=['State', 'Month'])
simulation_results.loc[ simulation_results['in.state'] == 'U.S.', 'in.state' ] = 'CA'   # rename back to California 

# print(simulation_results.loc[simulation_results.loc[ simulation_results['State'] == 'CA', 'State' ], 'State'] )


Joining Electricity Pricing
Joining Natural Gas Pricing
Joining Fuel Oil Pricing
Joining Propane Pricing


### Unit Conversions

In [9]:
# Unit coversions 
j_to_kwh = 3600000 # Joule / kWh electricity https://www.rapidtables.com/convert/energy/Joule_to_kWh.html

# natural gas 
Gj_to_Mcf = 1.0551 # https://www.naturalgasintel.com/natural-gas-converter/
j_to_Mcf = 1e9 * Gj_to_Mcf  

# Propane 
propane_btu_per_gallon = 91452 # Btu #  https://www.eia.gov/energyexplained/units-and-calculators/
j_per_btu = 1055.05585262 # J / BTU
j_per_gallon_propane = propane_btu_per_gallon * j_per_btu

# number 2 fuel oil   
j_per_gallon_no2_fuel_oil = 146520000      # https://www.convertunits.com/from/gallon+[U.S.]+of+distillate+no.+2+fuel+oil/to/joule

new_columns = obj()

# Electricity 
simulation_results['Cooling:Electricity [kWh](Monthly)'] = simulation_results['Cooling:Electricity [J](Monthly)'] / j_to_kwh
simulation_results['Heating:Electricity [kWh](Monthly)'] = simulation_results['Heating:Electricity [J](Monthly)'] / j_to_kwh
simulation_results['ElectricityPurchased:Facility [kWh](Monthly)'] = simulation_results['ElectricityPurchased:Facility [J](Monthly)'] / j_to_kwh

# Natural Gas
simulation_results['Heating:NaturalGas [Mcf](Monthly)'] = simulation_results['Heating:NaturalGas [J](Monthly)'] / j_to_Mcf
simulation_results['NaturalGas:Facility [Mcf](Monthly)'] = simulation_results['NaturalGas:Facility [J](Monthly)'] / j_to_Mcf

# Propane
simulation_results['Heating:Propane [Gal](Monthly)'] = simulation_results['Heating:Propane [J](Monthly)'] / j_per_gallon_propane
simulation_results["Propane:Facility [Gal](Monthly)"] = simulation_results['Propane:Facility [J](Monthly)'] / j_per_gallon_propane

# Fuel oil 
simulation_results['Heating:FuelOilNo2 [Gal](Monthly)'] = simulation_results['Heating:FuelOilNo2 [J](Monthly)'] / j_per_gallon_no2_fuel_oil
simulation_results["FuelOilNo2:Facility [Gal](Monthly)"] = simulation_results["FuelOilNo2:Facility [J](Monthly)"] / j_per_gallon_no2_fuel_oil

### Cost Calculuations

In [10]:
# Electricity
simulation_results['Cost Cooling:Electricity [$](Monthly)'] = simulation_results['Cooling:Electricity [kWh](Monthly)'] * simulation_results["Mean(Price Electricity ($/kWh))"]
simulation_results['Cost Heating:Electricity [$](Monthly)'] = simulation_results['Heating:Electricity [kWh](Monthly)'] * simulation_results["Mean(Price Electricity ($/kWh))"]
simulation_results['Cost ElectricityPurchased:Facility [$](Monthly)'] = simulation_results['ElectricityPurchased:Facility [kWh](Monthly)'] * simulation_results["Mean(Price Electricity ($/kWh))"]

# Natural gas 
simulation_results['Cost Heating:NaturalGas [$](Monthly)'] = simulation_results['Heating:NaturalGas [Mcf](Monthly)'] * simulation_results["Price of Natural Gas Delivered to Residential Consumers (Dollars per Thousand Cubic Feet)"]
simulation_results['Cost NaturalGas:Facility [$](Monthly)'] = simulation_results['NaturalGas:Facility [Mcf](Monthly)'] * simulation_results["Price of Natural Gas Delivered to Residential Consumers (Dollars per Thousand Cubic Feet)"]

# Propane
simulation_results['Cost Heating:Propane [$](Monthly)'] = simulation_results['Heating:Propane [Gal](Monthly)'] * simulation_results['Monthly U.S. Propane Residential Price (Dollars per Gallon)']
simulation_results["Cost Propane:Facility [$](Monthly)"] = simulation_results["Propane:Facility [Gal](Monthly)"] * simulation_results['Monthly U.S. Propane Residential Price (Dollars per Gallon)']

# Fuel oil 
simulation_results['Cost Heating:FuelOilNo2 [$](Monthly)'] = simulation_results['Heating:FuelOilNo2 [Gal](Monthly)'] * simulation_results['Monthly No. 2 Heating Oil Residential Price Dollars per Gallon']
simulation_results["Cost FuelOilNo2:Facility [Gal](Monthly)"] = simulation_results["FuelOilNo2:Facility [Gal](Monthly)"] * simulation_results['Monthly No. 2 Heating Oil Residential Price Dollars per Gallon']

#### Define which columns to sum for total heating and cooling costs 

In [11]:
list_available_columns(simulation_results)

Dimensions of data: (1039680, 64)
optional_cols = [
	'bldg_id',
	'Year',
	'Month',
	'Weather Scenario',
	'ElectricityPurchased:Facility [J](Monthly)',
	'ElectricitySurplusSold:Facility [J](Monthly)',
	'ElectricityNet:Facility [J](Monthly)',
	'Electricity:Facility [J](Monthly)',
	'NaturalGas:Facility [J](Monthly)',
	'DistrictHeating:Facility [J](Monthly)',
	'DistrictCooling:Facility [J](Monthly)',
	'Propane:Facility [J](Monthly)',
	'FuelOilNo2:Facility [J](Monthly)',
	'Heating:NaturalGas [J](Monthly)',
	'Heating:Electricity [J](Monthly)',
	'Heating:DistrictHeating [J](Monthly)',
	'Heating:Propane [J](Monthly)',
	'Heating:FuelOilNo2 [J](Monthly)',
	'Cooling:Electricity [J](Monthly)',
	'Cooling:DistrictCooling [J](Monthly)',
	'in.cec_climate_zone',
	'in.city',
	'in.cooling_setpoint',
	'in.federal_poverty_level',
	'in.heating_fuel',
	'in.heating_setpoint',
	'in.hvac_cooling_efficiency',
	'in.hvac_cooling_type',
	'in.hvac_heating_efficiency',
	'in.hvac_heating_type',
	'in.income',
	'in.stat

In [12]:
heating_cols =	[
	"Cost Heating:Electricity [$](Monthly)",
	"Cost Heating:NaturalGas [$](Monthly)",
	"Cost Heating:Propane [$](Monthly)",
	"Cost Heating:FuelOilNo2 [$](Monthly)",
    ]

cooling_cols =	[
	"Cost Cooling:Electricity [$](Monthly)",
    ]

facility_energy_costs_cols = [
	'Cost ElectricityPurchased:Facility [$](Monthly)',
	'Cost NaturalGas:Facility [$](Monthly)',
	'Cost Propane:Facility [$](Monthly)',
	'Cost FuelOilNo2:Facility [Gal](Monthly)',
]

simulation_results['Total Cost Heating [$](Monthly)'] = simulation_results[heating_cols].fillna(0).sum(axis=1)      # replace NaN values with zero and sum all heating columns
simulation_results['Total Cost Cooling [$](Monthly)']  = simulation_results[cooling_cols].fillna(0).sum(axis=1)		# replace NaN values with zero and sum all cooling columns


total_cost_columns = ['Total Cost Heating [$](Monthly)', 'Total Cost Cooling [$](Monthly)']
simulation_results['Total Cost Space Conditioning [$](Monthly)'] = simulation_results[total_cost_columns].sum(axis=1) 

simulation_results['Cost Energy:Facility [$](Monthly)'] = simulation_results[facility_energy_costs_cols].sum(axis=1)

""" 
Why do some months have zero total space conditioning cost  	?????????????????????
	- Cost of district heating/cooling not accounted for?
"""
print(sum(simulation_results['Total Cost Space Conditioning [$](Monthly)'] > 0) / len(simulation_results))
print(simulation_results['in.city'].unique())


0.936405432440751
['Detroit' 'Atlanta' 'Los Angeles' 'New York' 'Dallas']


### Energy Burdens Calculations
Also Calculate More Granular Income

In [13]:
# Convert annual income bins to ranges
# Split the column on the delimiter - or < or >, replace empty cells with 0, and cast as integer
simulation_results[["Income - Low [Annual]", "Income - High [Annual]"]] = simulation_results["in.income"].str.split(expand=True, pat='[-<>]').replace('', 0).astype(int)  

# High monthly income = high end of income / 12 
simulation_results[["Income - Low [Monthly]", "Income - High [Monthly]"]] = simulation_results[["Income - Low [Annual]", "Income - High [Annual]"]] / 12 
# Calculate More Granular Income - Normalize income by number of residents
simulation_results["Income per Occupant -  High [Annual]"] = simulation_results["Income - High [Annual]"] / simulation_results["in.occupants"]

# High income energy burden = total monthly energy costs / high monthly income 
# Low income energy burden = total monthly energy costs / low monthly income 
simulation_results["Space Conditioning Energy Burden - Low [Monthly]"]  = simulation_results['Total Cost Space Conditioning [$](Monthly)'] / simulation_results["Income - Low [Monthly]"]
simulation_results["Space Conditioning Energy Burden - High [Monthly]"] = simulation_results['Total Cost Space Conditioning [$](Monthly)'] / simulation_results["Income - High [Monthly]"]
simulation_results['Energy Burden - High [Monthly]']                    = simulation_results['Cost Energy:Facility [$](Monthly)'] / simulation_results["Income - High [Monthly]"]
simulation_results = simulation_results.replace([np.inf, -np.inf, np.nan], 0)   # replace infinity values zeros - these are due to null values for income low or high end of the range

# Take differences across scenarios
### Define which columns to take difference over
Copy and paste the available optional columns below, then delete any columns you don't want differenced with the base scenario
### Take differences over those columns between scenarios 

In [14]:
list_available_columns(simulation_results)

Dimensions of data: (1039680, 76)
optional_cols = [
	'bldg_id',
	'Year',
	'Month',
	'Weather Scenario',
	'ElectricityPurchased:Facility [J](Monthly)',
	'ElectricitySurplusSold:Facility [J](Monthly)',
	'ElectricityNet:Facility [J](Monthly)',
	'Electricity:Facility [J](Monthly)',
	'NaturalGas:Facility [J](Monthly)',
	'DistrictHeating:Facility [J](Monthly)',
	'DistrictCooling:Facility [J](Monthly)',
	'Propane:Facility [J](Monthly)',
	'FuelOilNo2:Facility [J](Monthly)',
	'Heating:NaturalGas [J](Monthly)',
	'Heating:Electricity [J](Monthly)',
	'Heating:DistrictHeating [J](Monthly)',
	'Heating:Propane [J](Monthly)',
	'Heating:FuelOilNo2 [J](Monthly)',
	'Cooling:Electricity [J](Monthly)',
	'Cooling:DistrictCooling [J](Monthly)',
	'in.cec_climate_zone',
	'in.city',
	'in.cooling_setpoint',
	'in.federal_poverty_level',
	'in.heating_fuel',
	'in.heating_setpoint',
	'in.hvac_cooling_efficiency',
	'in.hvac_cooling_type',
	'in.hvac_heating_efficiency',
	'in.hvac_heating_type',
	'in.income',
	'in.stat

In [15]:

def generate_samples_for_averaging(simulation_results, scenario, unique_cols, all_unique_col_combos):
    # returns list of pandas dataframes

    data = simulation_results[simulation_results['Weather Scenario'] == scenario].copy(deep=True)
    samples_dataframes = []

    for combination in tqdm.tqdm(all_unique_col_combos, desc = 'Generating Subsamples for Averaging'):

        datasample = data.copy(deep=True)

        # row_selection = datasample['bldg_id'] > 0   # initialize with all rows
        for i, column in enumerate(unique_cols):
            # row_selection = (row_selection) & (datasample[column] == combination[i])
            datasample = datasample[ datasample[column] == combination[i] ]

        if len(datasample) != 40: 
            # print(datasample['bldg_id'].unique())
            # print(f"sum(row_selection): {len(datasample)}")
            raise RuntimeError(f"Error: len(datasample) = {len(datasample)}")

        samples_dataframes.append(datasample)
        # data = data[~row_selection]     # remove the rows that were sampled (to reduce subsequent work)
        
    return samples_dataframes     # returns list of pandas dataframes in the same order as the all_unique_col_combos


def average_samples(samples_dataframes, columns_to_average):
    # returns list of pandas dataframes

    datframes_list = []
    for sample in tqdm.tqdm(samples_dataframes, desc='Averaging Base Scenario Samples'): 

        columns_to_not_average = list(set(sample.columns) - set(columns_to_average))
        metadata = sample[columns_to_not_average].iloc[0, :]                                        # Select the first row of data to not average (using iloc for proper indexing)
        df_averaged = (sample[columns_to_average].mean(axis=0).to_frame()).transpose()              # Calculate the means and store in a new dataframe
        df_averaged = df_averaged.assign(**{col: metadata[col] for col in columns_to_not_average})  # add the metadata back onto the rows
        datframes_list.append(df_averaged)

    return datframes_list       # returns list of pandas dataframes in the same order as the all_unique_col_combos



In [16]:
columns_to_take_difference = [
	'Cooling:Electricity [kWh](Monthly)',
	'Heating:Electricity [kWh](Monthly)',
	'Heating:NaturalGas [Mcf](Monthly)',
	'Heating:Propane [Gal](Monthly)',
	'Heating:FuelOilNo2 [Gal](Monthly)',
    
	'Propane:Facility [Gal](Monthly)',
	'NaturalGas:Facility [Mcf](Monthly)',
	'ElectricityPurchased:Facility [kWh](Monthly)',
	'FuelOilNo2:Facility [Gal](Monthly)',
    
	'Cost Cooling:Electricity [$](Monthly)',
	'Cost Heating:Electricity [$](Monthly)',
	'Cost ElectricityPurchased:Facility [$](Monthly)',
	'Cost Heating:NaturalGas [$](Monthly)',
	'Cost NaturalGas:Facility [$](Monthly)',
	'Cost Heating:Propane [$](Monthly)',
	'Cost Propane:Facility [$](Monthly)',
	'Cost Heating:FuelOilNo2 [$](Monthly)',
	'Cost FuelOilNo2:Facility [Gal](Monthly)',
	'Total Cost Heating [$](Monthly)',
	'Total Cost Cooling [$](Monthly)',
	'Total Cost Space Conditioning [$](Monthly)',
	'Cost Energy:Facility [$](Monthly)',
    
	'Space Conditioning Energy Burden - Low [Monthly]',
	'Space Conditioning Energy Burden - High [Monthly]',
	'Energy Burden - High [Monthly]',
    ]


# def difference_over_mean(simulations, base_scenario, columns_to_take_difference):
    
    
def get_all_combinations(data, unique_cols):
    """
    This function generates all combinations of elements across lists.
    Args:
        lists: A list of lists, where each inner list has the same number of elements.
    Returns:
        A a list containing all combinations as tuples.
    """

    lists = []

    for column in unique_cols: 
        lists.append(data[column].unique())

    # Use product to generate combinations
    return [combination for combination in product(*lists, repeat=1)]


base_scenario = "historical_1980-2020"
unique_cols = ['bldg_id', 'Month']
# simulation_results[unique_cols].

# take the average of a subset of columns and where a set of different key columns match a certain value

columns_to_average = columns_to_take_difference    # List of columns to average

base_data = simulation_results[simulation_results['Weather Scenario'] == base_scenario]
unique_col_value_combos = get_all_combinations(base_data, unique_cols)

simulation_results[columns_to_take_difference] = simulation_results[columns_to_take_difference].replace(['', np.nan], 0).astype(float)

base_scenario_samples = generate_samples_for_averaging(simulation_results, base_scenario, unique_cols, unique_col_value_combos)   # returns list of pandas dataframes
base_scenario_samples_averages = average_samples(base_scenario_samples, columns_to_average)

# differences = take_differences_over_columns(simulation_results, base_scenario, columns_to_take_difference)
# differences_to_mean_baseline = difference_over_mean(simulation_results, base_scenario, columns_to_take_difference)

Generating Subsamples for Averaging:  18%|█▊        | 1592/8664 [01:29<06:48, 17.30it/s]

In [None]:
scenario_differences = []

for scenario in (set(simulation_results['Weather Scenario'].unique()) - set([base_scenario])): 
        
    print(f'\nTaking Diffferences for Scenario: {scenario}')
    scenario_samples = generate_samples_for_averaging(simulation_results, scenario, unique_cols, unique_col_value_combos)   

    # import copy 

    if len(scenario_samples) != len(base_scenario_samples_averages): raise RuntimeError(f"Unequal number of scenario and base samples")

    for i, sample in enumerate(scenario_samples):
        # because the samples for both the scenarios are generated using the same combinations of unique identitifiers, we can subtract the elements in these lists 
        base_normalized_sample = sample.copy(deep=True)
        base_normalized_sample[columns_to_average] = sample[columns_to_average] - base_scenario_samples_averages[i][columns_to_average].iloc[0]
        scenario_differences.append(base_normalized_sample)

scenario_differences_df = pd.concat(scenario_differences)

NameError: name 'simulation_results' is not defined

### Save data for analysis

In [None]:
output_folder = Path("/Users/camilotoruno/Documents/local_research_data/simulations_five_cities")

simulation_results.to_csv(Path(output_folder, "simulations_costs.csv"), index=False)
# differences.to_csv(Path(output_folder, "differences.csv"), index=False)
scenario_differences_df.to_csv(Path(output_folder, 'differences_to_mean.csv'), index=False)


# Functions in progress

In [None]:
def simpler_difference_calc():    #### Note this doesn't work 

	### In theory much faster, however there's an issue with indexing / keying into columns. not sure what's wrong

	base_scenario = "historical_1980-2020"
	unique_cols = ['bldg_id', 'Month']
	# simulation_results[unique_cols].

	# take the average of a subset of columns and where a set of different key columns match a certain value

	columns_to_average = columns_to_take_difference    # List of columns to average

	base_data = simulation_results[simulation_results['Weather Scenario'] == base_scenario]


	# unique_col_value_combos = get_all_combinations(base_data, unique_cols)
	simulation_results[columns_to_take_difference] = simulation_results[columns_to_take_difference].replace(['', np.nan], 0).astype(float)

	base_scenario_samples_averages = base_data.groupby(unique_cols)[columns_to_take_difference].mean().reset_index(drop=True)

	# print(len(base_scenario_samples_averages))
	# print(base_scenario_samples_averages.shape)

	differences = [] 
	for scenario in (set(simulation_results['Weather Scenario'].unique()) - set([base_scenario])):


		# print(len(simulation_results[simulation_results['Weather Scenario'] == scenario]))
		# print(simulation_results[simulation_results['Weather Scenario'] == scenario])

		# for i in range(len(base_columns)):
			# print(f"{base_columns[i]}\t\t{columns_to_take_difference[i]}")
			# print((columns_to_take_difference))

		difference = join(	, 
							  
							how='left', left_on = unique_cols, right_on=unique_cols, suffixes=('', '_base_') ) 

		left_data = simulation_results[simulation_results['Weather Scenario'] == scenario]
		right_data = base_scenario_samples_averages

		before = left_data['in.city'].value_counts()
		data = pd.merge(left_data, right_data, how='left', on=unique_cols, left_index=False, right_index=False, sort=False, 
						suffixes=('', '_base_'), copy=None, indicator=False, validate=None)
		after = data['in.city'].value_counts()
		
		base_columns = [col+"_base_" for col in columns_to_take_difference]


		print(difference[base_columns].shape)
		print(difference[columns_to_take_difference].shape)

		def check_cols(df, cols):
			for col in cols: 
				if col not in df.columns: 
					print(f"Missing col {col} not in dataframe")

		print('difference, columns_to_take_difference')
		check_cols(difference, columns_to_take_difference)
		print('difference, base')
		check_cols(difference, base_columns)
		print(difference[columns_to_take_difference])
		print(difference[base_columns])

		difference[columns_to_take_difference] = difference[columns_to_take_difference] - difference[base_columns]
		

		differences.append(difference)

	return None
	
	# base_scenario_samples_averages.to_csv('base_average_samples_method2.csv')
	# base_scenario_samples_averages[0].to_csv('base_average_samples.csv')


	# differences = take_differences_over_columns(simulation_results, base_scenario, columns_to_take_difference)
	# differences_to_mean_baseline = difference_over_mean(simulation_results, base_scenario, columns_to_take_difference)

SyntaxError: invalid syntax (3074420461.py, line 35)