In [1]:
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from calendar import month_name

# Calendar years used only to build intermediate datetimes; the year is never
# written to the output. The sort year is a leap year so "02/29" is a valid date.
_SORT_YEAR = 2020
_NON_LEAP_YEAR = 2022

# Month name -> month number (e.g. "January" -> 1), built from calendar.month_name.
_MONTH_TO_NUM = {name: number for number, name in enumerate(month_name) if name}


def _parse_month_day(date_part, year=_SORT_YEAR):
    """Parse an 'MM/DD' token into a datetime in `year`; raises ValueError if invalid."""
    # Parsing with an explicit year avoids strptime's implicit year 1900, which
    # rejects "02/29" because 1900 is not a leap year.
    return datetime.strptime(f"{year}/{date_part}", "%Y/%m/%d")


def _roll_over_midnight(raw, rollover_year):
    """Rewrite a label containing '24:00:00' as '00:00:00' of the following day.

    Labels without '24:00:00' are returned stripped but otherwise unchanged.
    A leading month name is mapped to the first day of the next month.
    """
    text = str(raw).strip()
    if '24:00:00' not in text:
        return text

    date_part = text.split()[0]
    try:
        if date_part in _MONTH_TO_NUM:
            month_num = _MONTH_TO_NUM[date_part]
            next_month = 1 if month_num == 12 else month_num + 1
            rolled = datetime(rollover_year, next_month, 1)
        else:
            rolled = _parse_month_day(date_part, rollover_year) + timedelta(days=1)
    except ValueError:
        print(f"Warning: Unable to parse date part '{date_part}' in '{text}'.")
        return text
    return rolled.strftime("%m/%d 00:00:00")


def _parse_timestamp(label):
    """Turn a (corrected) Date/Time label into a datetime, or pd.NaT if unrecognised.

    Recognised forms: a month name, a bare hour ("4"), a bare date ("01/21"),
    "MM/DD HH:MM:SS", "MM/DD HH:MM" and "MM/DD H".
    """
    parts = label.strip().split()

    if len(parts) == 1:
        token = parts[0]
        if token in _MONTH_TO_NUM:
            return datetime(_SORT_YEAR, _MONTH_TO_NUM[token], 1)
        try:
            hour = int(token)
        except ValueError:
            hour = None
        if hour is not None:
            return datetime(_SORT_YEAR, 1, 1, hour) if 0 <= hour <= 23 else pd.NaT
        # A lone "MM/DD" (no time part) is a valid date at midnight.
        try:
            return _parse_month_day(token)
        except ValueError:
            return pd.NaT

    if len(parts) != 2:
        return pd.NaT

    date_part, time_part = parts
    if date_part in _MONTH_TO_NUM:
        return pd.NaT
    try:
        day = _parse_month_day(date_part)
    except ValueError:
        return pd.NaT

    if ":" in time_part:
        for fmt in ("%H:%M:%S", "%H:%M"):
            try:
                clock = datetime.strptime(time_part, fmt)
            except ValueError:
                continue
            return day.replace(hour=clock.hour, minute=clock.minute, second=clock.second)
        return pd.NaT

    # No colon: interpret the second token as an hour of the day.
    try:
        hour = int(time_part)
    except ValueError:
        return pd.NaT
    return day.replace(hour=hour) if 0 <= hour <= 23 else pd.NaT


def _aggregate_by(frame, label_format, reducer):
    """Group the 'val' column by strftime(label_format) of 'dt' and reduce each group."""
    valid = frame.dropna(subset=["dt", "val"])
    if valid.empty:
        return {}
    buckets = valid["dt"].dt.strftime(label_format)
    return valid["val"].groupby(buckets).apply(lambda s: reducer(s)).to_dict()


def _labelled_values(frame, label_format, fallback_prefix):
    """Map each non-null 'val' to a label taken from 'dt' (or '<prefix>_<index>' if dt is NaT)."""
    valid = frame.dropna(subset=["val"])
    labelled = {}
    for idx, dt_val, val in zip(valid.index, valid["dt"], valid["val"]):
        if pd.isna(dt_val):
            label = f"{fallback_prefix}_{idx}"
        else:
            label = dt_val.strftime(label_format)
        labelled[label] = val
    return labelled


def _merge_into(data_dict, key, mapping):
    """Add `mapping` to data_dict[key], creating the entry only when there is data."""
    if mapping:
        data_dict.setdefault(key, {}).update(mapping)


def merge_all_results(
    base_output_dir,
    output_csv,
    convert_to_daily=False,
    daily_aggregator="mean",
    convert_to_monthly=False,
    monthly_aggregator="mean",
    postproc_log=None
):
    """
    Merge every simulation CSV found under `base_output_dir` into one wide CSV.

    Each output row is one (BuildingID, VariableName) pair; each remaining
    column is one time label. Files are picked up when their name matches
    "_bldg<digits>...csv" (case-insensitive), so "simulation_bldg2Meter.csv"
    is read as building 2. Files without a "Date/Time" column are skipped.

    Parameters
    ----------
    base_output_dir : str
        Directory that is walked recursively for input CSV files.
    output_csv : str
        Path of the merged CSV to write.
    convert_to_daily : bool, default False
        Aggregate columns tagged "(Hourly)" or "(TimeStep)" to one value per day.
    daily_aggregator : str, default "mean"
        One of "sum", "mean", "max", "min", "pick_first_hour"; unknown names
        fall back to "mean" with a warning.
    convert_to_monthly : bool, default False
        Aggregate columns tagged "(Daily)" to one value per month.
    monthly_aggregator : str, default "mean"
        Same choices as `daily_aggregator`.
    postproc_log : dict or None
        If given, the call arguments are recorded in it.

    Returns
    -------
    None. Progress, warnings and read/write errors are printed, not raised.

    Notes
    -----
    - With both flags False, values are merged as-is under their own time label.
    - When either flag is set, hourly/timestep columns are written only if
      `convert_to_daily` is True, "(Monthly)" columns are kept unchanged, and
      columns without a frequency tag are skipped with a warning.
    - "MM/DD 24:00:00" is relabelled "00:00:00" of the next day, so
      "12/31 24:00:00" becomes "01/01 00:00:00" and sorts at the start.
    - 02/29 is supported: "02/28 24:00:00" rolls to 02/29 only in files that
      contain a 02/29 row, otherwise to 03/01.
    """

    if postproc_log is not None:
        postproc_log["base_output_dir"] = base_output_dir
        postproc_log["output_csv"] = output_csv
        postproc_log["convert_to_daily"] = convert_to_daily
        postproc_log["daily_aggregator"] = daily_aggregator
        postproc_log["convert_to_monthly"] = convert_to_monthly
        postproc_log["monthly_aggregator"] = monthly_aggregator

    data_dict = {}    # (bldg_id, column name) -> {time label: value}
    all_times = set() # every time label seen in as-is mode
    time_to_dt = {}   # time label -> parsed timestamp, used only for sorting

    aggregator_funcs = {
        "sum": np.sum,
        "mean": np.mean,
        "max": np.max,
        "min": np.min,
        "pick_first_hour": lambda x: x.iloc[0] if not x.empty else np.nan
    }

    if convert_to_daily and daily_aggregator not in aggregator_funcs:
        print(f"Warning: Aggregator '{daily_aggregator}' not recognized. Defaulting to 'mean'.")
        daily_aggregator = "mean"

    if convert_to_monthly and monthly_aggregator not in aggregator_funcs:
        print(f"Warning: Aggregator '{monthly_aggregator}' not recognized. Defaulting to 'mean'.")
        monthly_aggregator = "mean"

    daily_func = aggregator_funcs.get(daily_aggregator, np.mean)
    monthly_func = aggregator_funcs.get(monthly_aggregator, np.mean)
    converting = convert_to_daily or convert_to_monthly

    ###########################################################################
    # 1) Traverse the directory and read each CSV (no skipping of _Meter.csv)
    ###########################################################################
    for root, dirs, files in os.walk(base_output_dir):
        # Sorted traversal keeps the merge result independent of filesystem order.
        dirs.sort()
        for f in sorted(files):
            if not f.lower().endswith(".csv"):
                continue

            # "_bldg" + digits + anything + ".csv", e.g. "simulation_bldg2Meter.csv".
            match = re.search(r'_bldg(\d+).*\.csv$', f, re.IGNORECASE)
            if not match:
                print(f"Skipping file (no building ID match): {f}")
                continue

            bldg_id = int(match.group(1))
            file_path = os.path.join(root, f)
            print(f"[merge_all_results] Reading {file_path}, Building {bldg_id}")

            # 2) Read CSV into a DataFrame
            try:
                df = pd.read_csv(file_path, header=0, low_memory=False)
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                continue

            if "Date/Time" not in df.columns:
                print(f"Warning: No 'Date/Time' column in {file_path}, skipping.")
                continue

            # 3) Correct '24:00:00' => '00:00:00' next day.
            # A file that contains 02/29 rows is treated as leap-year data, so
            # "02/28 24:00:00" must roll to 02/29 rather than 03/01.
            raw_labels = df["Date/Time"].astype(str)
            has_leap_day = raw_labels.str.contains(r"^\s*0?2/29(?:\s|$)", regex=True).any()
            rollover_year = _SORT_YEAR if has_leap_day else _NON_LEAP_YEAR
            labels = raw_labels.map(lambda s: _roll_over_midnight(s, rollover_year))

            # 4) Parse the corrected labels once per file. to_datetime guarantees
            # a datetime dtype even for an empty or all-NaT column.
            parsed = pd.to_datetime(labels.map(_parse_timestamp))

            # 5) Process each data column
            for col in df.columns:
                if col in ("Date/Time", "Date/Time_corrected", "parsed_dt"):
                    continue

                # Identify frequency from the column name
                if "(Hourly)" in col or "(TimeStep)" in col:
                    freq_mode = "Hourly"
                elif "(Daily)" in col:
                    freq_mode = "Daily"
                elif "(Monthly)" in col:
                    freq_mode = "Monthly"
                else:
                    freq_mode = "Unknown"

                key = (bldg_id, col)
                values = pd.to_numeric(df[col], errors='coerce')

                # 6) If converting to daily/monthly, do the aggregations
                if converting:
                    frame = pd.DataFrame({"dt": parsed, "val": values})

                    if freq_mode == "Hourly":
                        if convert_to_daily:
                            _merge_into(data_dict, key, _aggregate_by(frame, "%m/%d", daily_func))
                    elif freq_mode == "Daily":
                        if convert_to_monthly:
                            _merge_into(data_dict, key, _aggregate_by(frame, "%B", monthly_func))
                        else:
                            _merge_into(data_dict, key, _labelled_values(frame, "%m/%d", "Day"))
                    elif freq_mode == "Monthly":
                        _merge_into(data_dict, key, _labelled_values(frame, "%B", "Month"))
                    else:
                        print(f"Warning: Unknown frequency for column '{col}' in Building {bldg_id}. Skipping.")
                    continue

                # 7) If not converting => keep as-is (time label keys)
                valid = values.notna()
                if not valid.any():
                    continue
                series_map = data_dict.setdefault(key, {})
                for tstr, val, dt_val in zip(labels[valid], values[valid], parsed[valid]):
                    series_map[tstr] = val
                    all_times.add(tstr)
                    time_to_dt.setdefault(tstr, dt_val)

    ###########################################################################
    # 8) Figure out the sorted "time" axis depending on daily/monthly flags
    ###########################################################################
    def chronological_days(day_labels):
        # Falls back to plain string order if any label is not "MM/DD".
        try:
            return sorted(day_labels, key=_parse_month_day)
        except ValueError:
            return sorted(day_labels)

    def calendar_months(month_labels):
        return sorted(month_labels, key=_MONTH_TO_NUM.get)

    labels_seen = {label for series_map in data_dict.values() for label in series_map}

    if convert_to_monthly and convert_to_daily:
        # Mixed axis: month-name labels first, then "MM/DD" labels.
        day_labels = {s for s in labels_seen if re.match(r'\d{2}/\d{2}', s)}
        month_labels = {s for s in labels_seen - day_labels if s in _MONTH_TO_NUM}
        sorted_times = calendar_months(month_labels) + chronological_days(day_labels)
    elif convert_to_monthly:
        sorted_times = calendar_months({s for s in labels_seen if s in _MONTH_TO_NUM})
    elif convert_to_daily:
        sorted_times = chronological_days(labels_seen)
    else:
        def as_is_key(tstr):
            dt_val = time_to_dt.get(tstr)
            # Unparseable labels sort first; the label itself breaks ties so the
            # column order does not depend on set iteration order.
            return (dt_val if pd.notna(dt_val) else pd.Timestamp.min, tstr)

        try:
            sorted_times = sorted(all_times, key=as_is_key)
        except Exception as e:
            print(f"Error in sorting times: {e}")
            sorted_times = sorted(all_times)

    columns = ["BuildingID", "VariableName"] + sorted_times

    ###########################################################################
    # 9) Build final DataFrame and write to CSV
    ###########################################################################
    rows = [
        [bldg_id, var_name] + [tmap.get(t, np.nan) for t in sorted_times]
        for (bldg_id, var_name), tmap in data_dict.items()
    ]

    final_df = pd.DataFrame(rows, columns=columns)
    final_df.sort_values(by=["BuildingID", "VariableName"], inplace=True)

    try:
        final_df.to_csv(output_csv, index=False)
        print(f"[merge_all_results] Successfully wrote merged CSV to {output_csv}")
    except Exception as e:
        print(f"Error writing to {output_csv}: {e}")


In [2]:
# Folder that is searched for simulation CSVs, and the merged file to create.
base_dir = r"D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77-81989745744e\Sim_Results\2020"
output_csv = r"D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77-81989745744e\Sim_Results\2020\merged_as_is.csv"

# Merge "as is": both conversion switches are off; the aggregators stay at "mean".
merge_all_results(
    base_dir,
    output_csv,
    convert_to_daily=False,
    daily_aggregator="mean",
    convert_to_monthly=False,
    monthly_aggregator="mean",
)


[merge_all_results] Reading D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77-81989745744e\Sim_Results\2020\simulation_bldg0.csv, Building 0
[merge_all_results] Reading D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77-81989745744e\Sim_Results\2020\simulation_bldg0Meter.csv, Building 0
[merge_all_results] Reading D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77-81989745744e\Sim_Results\2020\simulation_bldg0Zsz.csv, Building 0
[merge_all_results] Reading D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77-81989745744e\Sim_Results\2020\simulation_bldg1.csv, Building 1
[merge_all_results] Reading D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77-81989745744e\Sim_Results\2020\simulation_bldg10.csv, Building 10
[merge_all_results] Reading D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77-81989745744e\Sim_Results\2020\simulation_bldg100.csv, Building 100
[merge_all_results] Reading D:\Documents\E_Plus_2030_py\output\edfc6efe-fcb0-4276-8f77