In [1]:
import pandas as pd
import os 
import numpy as np

In [None]:
# define the base path to thumb drive (input path)
base_path = '/Volumes/Untitled/MeterDataTest'
#base_path = '/Desktop/MeterDataTest'

# Fail fast if the input folder is missing.
# An exception is raised instead of calling exit(): inside a Jupyter kernel,
# exit() shuts the kernel down (discarding all state) rather than just
# stopping this cell, whereas an exception halts "Run All" at this cell and
# leaves the kernel usable.
if not os.path.exists(base_path):
    raise FileNotFoundError(f"Error: Path {base_path} does not exist")

In [None]:
# accumulator: one interpolated dataframe per csv file
all_data = list()

# accumulator: one dataframe per csv file that still carries the
# 3_phase_watt_total column, kept for later use
save_data = list()

In [None]:
def _read_meter_csv(csv_path, meter_name):
    """Load one meter csv and return it with normalized, ordered columns.

    Column headers are stripped, lower-cased and have spaces replaced by
    underscores. When the '3_phase_positive_real_energy_used' header is
    present, it and '3_phase_real_power' are renamed to 'total_watt_hour'
    and '3_phase_watt_total'.

    Parameters:
        csv_path: path of the csv file to read (utf-8).
        meter_name: value written to the inserted 'meter_name' column.

    Returns:
        A new DataFrame with the columns
        ['datetime', 'meter_name', 'total_watt_hour', '3_phase_watt_total'].
        'datetime' is left exactly as read from the file.

    Raises:
        KeyError: if one of the three required columns is missing.
    """
    raw = pd.read_csv(csv_path, encoding="utf-8")
    raw.columns = raw.columns.str.strip().str.lower().str.replace(" ", "_")

    # rename columns if they exist
    if '3_phase_positive_real_energy_used' in raw.columns:
        raw = raw.rename(columns={
            '3_phase_positive_real_energy_used': 'total_watt_hour',
            '3_phase_real_power': '3_phase_watt_total'
        })

    # .copy() detaches the selection from `raw`, so later column assignments
    # cannot trigger SettingWithCopyWarning
    readings = raw[['datetime', 'total_watt_hour', '3_phase_watt_total']].copy()
    readings.insert(1, 'meter_name', meter_name)
    return readings


def _add_interpolated_intervals(readings):
    """Add linearly interpolated rows at 15-minute marks that have no exact reading.

    Every timestamp is assigned to its nearest 15-minute mark. For each mark
    that has no reading exactly on it but has at least one reading before AND
    one after it (within the same bucket), a row is added at the mark whose
    'total_watt_hour' is linearly interpolated between the closest reading
    before and the closest reading after. Marks with readings on only one side
    are left without an interpolated row.

    Parameters:
        readings: DataFrame as returned by _read_meter_csv; it is not modified.

    Returns:
        A new DataFrame with the columns
        ['datetime', 'meter_name', 'total_watt_hour', 'interpolated'],
        sorted by 'datetime', where 'interpolated' is True only on added rows.
    """
    df = readings.drop(columns='3_phase_watt_total')

    # convert datetime column to a datetime type
    df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')

    # Sort first: "closest before/after" below are taken positionally
    # (iloc[-1] / iloc[0]), which is only correct for chronological rows.
    df = df.sort_values(by='datetime', kind='mergesort').reset_index(drop=True)
    df['interpolated'] = False

    # helper columns live on a working copy so they never reach the output
    nearest = df['datetime'].dt.round('15min')
    work = df.assign(
        interval_15min=nearest,
        # negative = reading is before the mark, positive = after it
        interval_offset=(df['datetime'] - nearest).dt.total_seconds(),
    )

    interpolated_rows = []

    # interval = the 15min bucket val, group = all rows in that bucket
    for interval, group in work.groupby('interval_15min'):
        # a reading exactly on the mark means nothing needs to be estimated
        if group['datetime'].eq(interval).any():
            continue

        before = group[group['interval_offset'] <= 0]
        after = group[group['interval_offset'] >= 0]
        if before.empty or after.empty:
            continue

        # grab the closest data on each side of the interval
        prev_reading = before.iloc[-1]
        next_reading = after.iloc[0]

        reading_diff = next_reading['total_watt_hour'] - prev_reading['total_watt_hour']
        if reading_diff == 0:
            estimated_twh = prev_reading['total_watt_hour']
        else:
            time_diff = (next_reading['datetime'] - prev_reading['datetime']).total_seconds()
            # slope (reading units per second) is kept at 4 decimal places,
            # as in the original calculation
            slope = round(reading_diff / time_diff, 4)
            sec_before_interval = (interval - prev_reading['datetime']).total_seconds()
            estimated_twh = prev_reading['total_watt_hour'] + (slope * sec_before_interval)

        interpolated_rows.append({
            'datetime': interval,
            'meter_name': prev_reading['meter_name'],
            'total_watt_hour': estimated_twh,
            'interpolated': True,
        })

    # combine interpolated data with dataframe
    if interpolated_rows:
        df = pd.concat([df, pd.DataFrame(interpolated_rows)], ignore_index=True)

    # resort the data to be in order of datetime
    return df.sort_values(by='datetime', kind='mergesort').reset_index(drop=True)


# Start from empty accumulators so re-running this cell does not append the
# same files a second time.
all_data.clear()
save_data.clear()

# iterate through the subfolders in the MeterDataTest folder
# (sorted so the output row order is reproducible between runs)
for subfolder in sorted(os.listdir(base_path)):
    # create path for each subfolder
    folder_path = os.path.join(base_path, subfolder)

    # skip stray files (e.g. .DS_Store or ._ files) sitting next to the meter
    # folders; os.listdir() on them would raise NotADirectoryError
    if not os.path.isdir(folder_path):
        continue

    # get the name of the meter from the subfolder name, make lowercase
    meter_name = subfolder.lower().replace(" ", "_")

    # list of csv file paths in subfolder, ignoring hidden files
    # (this also covers the hidden ._ files)
    csv_paths = [os.path.join(folder_path, f)
                 for f in sorted(os.listdir(folder_path))
                 if f.endswith('.csv') and not f.startswith(".")]

    for csv_path in csv_paths:
        readings = _read_meter_csv(csv_path, meter_name)

        # save the dataframe with all columns to list
        save_data.append(readings)

        # interpolated version without the 3_phase_watt_total column
        all_data.append(_add_interpolated_intervals(readings))

In [None]:
# stack every per-file dataframe (all columns kept) into a single frame,
# renumbering the index, so it can be saved for later
save_data_col = pd.concat(objs=save_data, axis=0, ignore_index=True)

In [None]:
# stack every interpolated per-file dataframe into a single frame,
# renumbering the index
combined_data = pd.concat(objs=all_data, axis=0, ignore_index=True)

In [None]:
# write the dataframe with all columns to csv (no index column) to save for later
save_data_col.to_csv(path_or_buf='save_data_col.csv', index=False)

In [None]:
# write the combined interpolated dataframe to csv (no index column)
combined_data.to_csv(path_or_buf='interpolated_meter_data.csv', index=False)

In [None]:
# list of meters and first timestamp in dataset
# NOTE(review): this cell originally read '.csv', a file name with no stem that
# no cell in this notebook writes, so it fails on a fresh run. The interpolated
# output written by this notebook is assumed to be the intended input — confirm.
df = pd.read_csv('interpolated_meter_data.csv', encoding="utf-8")
print(df['meter_name'].unique())
print(df.head(1))

In [None]:
# export the interpolated rows of meter 'admin_serv_1_mtr' for 2025-09-10
df = pd.read_csv('interpolated_meter_data.csv', encoding="utf-8")
df['datetime'] = pd.to_datetime(df['datetime'])
date = pd.to_datetime('2025-09-10').date()

# keep rows that belong to the meter and fall on the chosen calendar day
sample_data = df.loc[
    (df['meter_name'] == 'admin_serv_1_mtr')
    & (df['datetime'].dt.date == date)
]
sample_data.to_csv('sample_data.csv', index=False)

In [2]:
# reload the interpolated output; the following cells build the
# biomedical_science_main_a_mtr sample for 2025-09-10 from it
df = pd.read_csv(filepath_or_buffer='interpolated_meter_data.csv', encoding='utf-8')

In [5]:
# parse timestamps, then keep only the rows of biomedical_science_main_a_mtr
# on 2025-09-10 that sit on a 15-minute mark: either flagged as interpolated
# or with minute divisible by 15 and second equal to 0
df['datetime'] = pd.to_datetime(df['datetime'])
date = pd.to_datetime('2025-09-10').date()
sample_data = df.loc[
    (df['datetime'].dt.date == date)
    & df['meter_name'].eq('biomedical_science_main_a_mtr')
    & (
        df['interpolated'].eq(True)
        | ((df['datetime'].dt.minute % 15).eq(0) & df['datetime'].dt.second.eq(0))
    )
]
# one row per (datetime, meter_name); the first occurrence is kept
sample_data = sample_data.drop_duplicates(subset=['datetime', 'meter_name'])

In [6]:
# write the selected sample rows to csv (no index column)
sample_data.to_csv(path_or_buf='sample_data.csv', index=False)