In [1]:
import pandas as pd
import os
import requests
from io import StringIO
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm

In [2]:
# Lookup from numeric state code (kept as an unpadded string) to the two-letter
# postal abbreviation: 50 states plus DC. The codes line up with the Census
# FIPS state codes. Entries are listed alphabetically by abbreviation, and
# that insertion order is what iteration over the dict yields.
state_abv = {
    '2': 'AK', '1': 'AL', '5': 'AR', '4': 'AZ', '6': 'CA', '8': 'CO',
    '9': 'CT', '11': 'DC', '10': 'DE', '12': 'FL', '13': 'GA', '15': 'HI',
    '19': 'IA', '16': 'ID', '17': 'IL', '18': 'IN', '20': 'KS', '21': 'KY',
    '22': 'LA', '25': 'MA', '24': 'MD', '23': 'ME', '26': 'MI', '27': 'MN',
    '29': 'MO', '28': 'MS', '30': 'MT', '37': 'NC', '38': 'ND', '31': 'NE',
    '33': 'NH', '34': 'NJ', '35': 'NM', '32': 'NV', '36': 'NY', '39': 'OH',
    '40': 'OK', '41': 'OR', '42': 'PA', '44': 'RI', '45': 'SC', '46': 'SD',
    '47': 'TN', '48': 'TX', '49': 'UT', '51': 'VA', '50': 'VT', '53': 'WA',
    '55': 'WI', '54': 'WV', '56': 'WY',
}

In [3]:
# Four-state subset in the same code -> abbreviation format; this is the
# mapping whose values the processing loop iterates.
test_state_abv = {'2': 'AK', '1': 'AL', '5': 'AR', '4': 'AZ'}

In [4]:
# Building-type identifiers, written lowercase with no separators.
# NOTE(review): not referenced by the visible cells; the data itself appears to
# use capitalised names (e.g. 'FullServiceRestaurant' in the printed output),
# so confirm the casing before matching against 'in.comstock_building_type'.
comstock_buildings = [
    'quickservicerestaurant',
    'fullservicerestaurant',
    'smalloffice',
    'mediumoffice',
    'largeoffice',
    'warehouse',
    'smallhotel',
    'largehotel',
    'outpatient',
    'hospital',
    'secondaryschool',
    'primaryschool',
    'retailstandalone',
    'retailstripmall',
]

In [5]:
# Presumably a one-item stand-in for comstock_buildings during trial runs;
# not referenced by the visible cells.
test_comstock_buildings = ['smalloffice']

In [6]:
#upgrades = ['17','18']
# Upgrade identifiers, kept as strings (the processing loop converts them with
# int() and zero-pads to two digits when building paths and URLs).
upgrades = ['5','6','7','8','9','10','15','17','18']
# Single-upgrade subset; this is the list the processing loop iterates.
test_upgrades = ['18']

In [7]:
def plot_histogram_with_stats_and_fit(df):
    """Save a histogram of site-energy savings and report threshold shares.

    The upgrade, building type and state used in the output path are taken
    from the first value of the 'upgrade', 'in.comstock_building_type' and
    'in.state' columns, so ``df`` is expected to hold a single
    upgrade / building-type / state combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'upgrade', 'in.comstock_building_type', 'in.state',
        'applicability' and the site-energy intensity savings column.
        Savings are handled as fractions: the thresholds are 0.25 and 0.5,
        and values are multiplied by 100 before plotting.

    Returns
    -------
    tuple
        ``(percentage_above_25_percent, percentage_above_50_percent)``: the
        share (0-100) of rows in ``df`` whose savings exceed 25 % and 50 %.

    Side effects
    ------------
    Prints the two shares and writes a PNG below
    ``annual_sorted/plots/{up}/{building}/``. When no applicable row has a
    savings value, a message is printed and no figure is written.
    """
    column_name = 'calc.percent_savings.site_energy.total.energy_consumption_intensity..percent'

    up = df['upgrade'].unique()[0]
    building = df['in.comstock_building_type'].unique()[0]
    state = df['in.state'].unique()[0]
    up = f"{int(up):02}"  # Convert string to int and format with leading zero

    # NOTE(review): the shares are taken over every row of df, including rows
    # where 'applicability' is not True; only the histogram below is
    # restricted to applicable rows. Confirm this is the intended denominator.
    total_buildings = df.shape[0]
    num_buildings_above_25_percent = (df[column_name] > 0.25).sum()
    num_buildings_above_50_percent = (df[column_name] > 0.5).sum()
    percentage_above_25_percent = (num_buildings_above_25_percent / total_buildings) * 100
    percentage_above_50_percent = (num_buildings_above_50_percent / total_buildings) * 100

    print(f"Percentage of {building} with savings > 25%: {percentage_above_25_percent:.2f}%  > 50%: {percentage_above_50_percent:.2f}%")

    # Keep applicable rows only and express savings in percent. Missing values
    # are dropped up front: the summary statistics skip them anyway, but
    # norm.fit and the histogram reject non-finite input.
    applicable = df[df['applicability'] == True]
    data_series = (applicable[column_name] * 100).dropna()

    if data_series.empty:
        # Nothing to fit or draw; the shares above are still meaningful.
        print(f"No applicable savings data for {building} in {state}; histogram skipped")
        return percentage_above_25_percent, percentage_above_50_percent

    mean_val = data_series.mean()
    median_val = data_series.median()
    std_val = data_series.std()
    min_val = data_series.min()
    max_val = data_series.max()

    # Maximum-likelihood normal fit (location and scale)
    mu, std = norm.fit(data_series)

    fig, ax = plt.subplots(figsize=(10, 6))
    n, _, _ = ax.hist(data_series, bins=30, color='blue', edgecolor='black', density=True)

    # Overlay the fitted PDF across the x-range chosen by the histogram. A
    # zero scale (one sample, or all samples equal) has no drawable PDF.
    xmin, xmax = ax.get_xlim()
    if std > 0:
        x = np.linspace(xmin, xmax, 100)
        ax.plot(x, norm.pdf(x, mu, std), 'k', linewidth=2, label='Normal Fit')

    # Labels are attached to the artists themselves so each legend entry
    # points at the right line; a labels-only legend call would pair the
    # labels with the first artists drawn, i.e. the histogram bars.
    ax.axvline(mean_val, color='red', linestyle='dashed', linewidth=1, label='Mean')
    ax.axvline(median_val, color='green', linestyle='dashed', linewidth=1, label='Median')
    ax.legend()

    ax.set_title(f'Histogram with Stats of {column_name}')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Density')

    # Summary statistics box, anchored at the left edge of the axes
    ax.text(xmin, n.max()*0.8, f'Mean: {mean_val:.2f}\nMedian: {median_val:.2f}\nStd: {std_val:.2f}\nMin: {min_val:.2f}\nMax: {max_val:.2f}',
            bbox=dict(facecolor='white', alpha=0.5))

    # Construct the local file path
    directory = f'annual_sorted/plots/{up}/{building}'
    os.makedirs(directory, exist_ok=True)  # Ensure the directory exists
    file_path = f'{directory}/{up}_{building}_{state.upper()}_total_energy_percent_savings.png'

    fig.savefig(file_path, bbox_inches='tight', dpi=300)
    plt.close(fig)  # release the figure; this function is called in a loop

    return percentage_above_25_percent, percentage_above_50_percent

In [8]:
def plot_histogram_colored_by_category_sorted_1(df, hist_column, category_column):
    """Save overlaid per-category histograms of ``hist_column``.

    Only rows whose 'applicability' is True are used. One histogram is drawn
    per value of ``category_column``, most frequent category first, with the
    values multiplied by 100. The figure is filed under ``plots_diff`` when
    any category mean differs from the overall mean by more than 10 % of the
    overall mean, and under ``plots_sim`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'upgrade', 'in.comstock_building_type', 'in.state',
        'applicability', ``hist_column`` and ``category_column``. The first
        value of the first three columns is used to build the output path.
    hist_column : str
        Numeric column to histogram.
    category_column : str
        Column whose values define the colour groups; it is also used as a
        directory name and as part of the file name.

    Returns
    -------
    None
        The figure is written to
        ``annual_sorted/plots_diff|plots_sim/{up}/{building}/{category_column}/``.
    """
    up = df['upgrade'].unique()[0]
    building = df['in.comstock_building_type'].unique()[0]
    state = df['in.state'].unique()[0]
    up = f"{int(up):02}"  # Convert string to int and format with leading zero

    filtered_df = df[df['applicability'] == True]

    # value_counts() already returns categories in descending frequency
    categories_sorted = list(filtered_df[category_column].value_counts().index)

    # One colour per category, following the frequency order
    colors = plt.cm.jet(np.linspace(0, 1, len(categories_sorted)))

    # Shared bin edges for every category: with per-category auto-binning
    # each group gets its own bar width, so overlaid bars are not comparable.
    all_values = filtered_df[hist_column].dropna() * 100
    bin_edges = np.histogram_bin_edges(all_values, bins=30)

    fig, ax = plt.subplots(figsize=(10, 6))
    for color, category in zip(colors, categories_sorted):
        in_category = filtered_df[category_column] == category
        category_data = filtered_df.loc[in_category, hist_column].dropna() * 100
        ax.hist(category_data, bins=bin_edges, color=color, label=category, alpha=0.75, edgecolor='black')

    ax.set_title(f'Histogram of {hist_column} Colored by {category_column}')
    ax.set_xlabel(hist_column)
    # Bars show raw counts (density is not requested), so label them as such
    ax.set_ylabel('Count')
    if categories_sorted:
        # Skip the legend when nothing was drawn (no labelled artists)
        ax.legend(title=category_column, bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()

    # Relative deviation of each category mean from the overall mean, in
    # percent of the overall mean
    overall_mean = filtered_df[hist_column].mean()
    category_means = filtered_df.groupby(category_column)[hist_column].mean()
    difference_percentages = ((category_means - overall_mean) / overall_mean) * 100

    # More than 10 % away from the overall mean, in either direction
    has_significant_difference = bool((difference_percentages.abs() > 10).any())

    # Only the top-level folder depends on the outcome
    subfolder = 'plots_diff' if has_significant_difference else 'plots_sim'
    directory = f'annual_sorted/{subfolder}/{up}/{building}/{category_column}'
    os.makedirs(directory, exist_ok=True)  # Ensure the directory exists
    file_path = f'{directory}/{up}_{building}_{state.upper()}_{category_column}.png'

    fig.savefig(file_path, bbox_inches='tight', dpi=300)
    plt.close(fig)  # release the figure; this function is called in a loop


In [9]:
# Columns kept from each per-state results file before plotting.
columns = [
    # Run identification and applicability flag
    'upgrade', 'in.comstock_building_type', 'in.state', 'applicability',
    # Size and age
    'in.sqft', 'in.year_built', 'in.vintage',
    # Lighting, HVAC and fuel characteristics
    'in.interior_lighting_generation',
    'in.hvac_category', 'in.hvac_system_type',
    'in.heating_fuel', 'in.hvac_cool_type', 'in.hvac_heat_type',
    'in.service_water_heating_fuel',
    # Energy code in force at each replacement / at original construction
    'in.energy_code_followed_during_last_ext_lighting_replacement',
    'in.energy_code_followed_during_last_hvac_replacement',
    'in.energy_code_followed_during_last_int_equipment_replacement',
    'in.energy_code_followed_during_last_roof_replacement',
    'in.energy_code_followed_during_last_svc_water_htg_replacement',
    'in.energy_code_followed_during_last_walls_replacement',
    'in.energy_code_followed_during_original_building_construction',
    # Savings columns
    'calc.percent_savings.electricity.total.energy_consumption_intensity..percent',
    'calc.percent_savings.electricity.total.energy_consumption..percent',
    'calc.percent_savings.natural_gas.total.energy_consumption_intensity..percent',
    'calc.percent_savings.natural_gas.total.energy_consumption..percent',
    'calc.percent_savings.site_energy.total.energy_consumption..percent',
    'calc.percent_savings.site_energy.total.energy_consumption_intensity..percent',
]

# Categorical columns by which the savings histograms are split.
compare = [
    'in.vintage',
    'in.interior_lighting_generation',
    'in.hvac_category', 'in.hvac_system_type',
    'in.heating_fuel', 'in.hvac_cool_type', 'in.hvac_heat_type',
    'in.service_water_heating_fuel',
    'in.energy_code_followed_during_original_building_construction',
]

In [10]:
# URL template for one state's metadata-and-annual-results CSV. The {STATE} and
# {up} placeholders are filled with str.format(); the processing loop passes
# the upper-cased state abbreviation and the two-digit, zero-padded upgrade
# number.
base_url = "https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/metadata_and_annual_results/by_state/state={STATE}/csv/{STATE}_upgrade{up}_metadata_and_annual_results.csv"
# Example of a fully expanded URL:
#https://oedi-data-lake.s3.amazonaws.com/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/metadata_and_annual_results/by_state/state=CO/csv/CO_upgrade18_metadata_and_annual_results.csv

In [11]:
# loop over upgrades
for upgrade in test_upgrades:
    UP = str(upgrade)
    up = f"{int(upgrade):02}"  # Convert string to int and format with leading zero

    data_df = pd.DataFrame(columns=['upgrade', 'state', 'comstock_building_type', 'percentage_above_25', 'percentage_above_50'])

    # Loop through each state abbreviation
    for state in test_state_abv.values():
        # Construct the local file path
        directory = f'annual/data/{up}'
        os.makedirs(directory, exist_ok=True)  # Ensure the directory exists
        file_path = f'{directory}/{state.upper()}_upgrade{up}_metadata_and_annual_results.csv'

        # Check if the file exists locally
        if os.path.exists(file_path):
            print(f"Using local file for upgrade: {up}, state: {state}")
            df = pd.read_csv(file_path, low_memory=False)
        else:
            # If the file doesn't exist, construct the URL and download the file
            url = base_url.format(up=up, STATE=state.upper())
            print(f"URL: {url}")
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    print(f"Downloading upgrade: {up}, state: {state}")
                    # Convert the CSV content to a DataFrame
                    csv_content = StringIO(response.content.decode('utf-8'))
                    df = pd.read_csv(csv_content, low_memory=False)
                    
                    # Save the DataFrame locally for future use
                    df.to_csv(file_path, index=False)
                    print(f"Saved {file_path}")
                else:
                    print(f"Failed to download data for {state.upper()}: HTTP {response.status_code}")
            except Exception as e:
                print(f"Error downloading data for {state.upper()}: {e}")          


        # Group by 'in.comstock_building_type'
        grouped = df.groupby('in.comstock_building_type')

        # Iterate over each group, filter columns that match the defined columns, and save as separate files
        for building_type, group in grouped:
            # Filter columns that match the defined columns
            filtered_df = group[columns].dropna(how='all')  # Optionally, remove rows where all selected columns are NaN

            #make histogram plot and compute %buildings above 25% savings
            percentage_above_25, percentage_above_50 = plot_histogram_with_stats_and_fit(filtered_df)
            new_data = {
                'upgrade': [up],
                'state': [state.upper()],
                'comstock_building_type': [building_type],
                'percentage_above_25': [percentage_above_25],
                'percentage_above_50': [percentage_above_50]
            }

            # add new entry to DF
            new_row_df = pd.DataFrame(new_data)
            data_df = pd.concat([data_df, new_row_df], ignore_index=True)
            
            for comp in compare:
                print(f"comp: {comp}")
                plot_histogram_colored_by_category_sorted_1(filtered_df, 'calc.percent_savings.site_energy.total.energy_consumption_intensity..percent', comp)
            
    print(f'saving {up}_data.csv')
    directory = f'annual_sorted/data/{up}'
    os.makedirs(directory, exist_ok=True)  # Ensure the directory exists
    file_path = f'{directory}/{up}_data.csv'
    data_df.to_csv(file_path, index=False)          

Using local file for upgrade: 18, state: AK
Percentage of FullServiceRestaurant with savings > 25%: 0.00%  > 50%: 0.00%
comp: in.vintage
comp: in.interior_lighting_generation
comp: in.hvac_category
comp: in.hvac_system_type
comp: in.heating_fuel
comp: in.hvac_cool_type
comp: in.hvac_heat_type
comp: in.service_water_heating_fuel
comp: in.energy_code_followed_during_original_building_construction
Percentage of Hospital with savings > 25%: 66.67%  > 50%: 0.00%
comp: in.vintage
comp: in.interior_lighting_generation
comp: in.hvac_category
comp: in.hvac_system_type
comp: in.heating_fuel
comp: in.hvac_cool_type
comp: in.hvac_heat_type
comp: in.service_water_heating_fuel
comp: in.energy_code_followed_during_original_building_construction
Percentage of LargeHotel with savings > 25%: 11.11%  > 50%: 0.00%
comp: in.vintage
comp: in.interior_lighting_generation
comp: in.hvac_category
comp: in.hvac_system_type
comp: in.heating_fuel
comp: in.hvac_cool_type
comp: in.hvac_heat_type
comp: in.service_wat