In [12]:
import pandas as pd
import os 
import numpy as np

In [13]:
# define the base path to thumb drive (input path)
base_path = '/Volumes/Untitled/MeterDataTest'
#base_path = '/Desktop/MeterDataTest'

#check if base path exists
if not os.path.exists(base_path):
    print(f"Error: Path {base_path} does not exist")
    exit()

In [14]:
# list to store all dataframes
all_data = []

#list to store all dataframes with 3_phase_watt_total column for later use
save_data = []

# dataframe for all data
combined_data = pd.DataFrame()
save_data_col = pd.DataFrame()

In [16]:
# iterate through the subfolders in the MeterDataTest folder
for subfolder in os.listdir(base_path):
    subfolder.replace(" ", "_")
    # create path for each subfolder
    folder_path = os.path.join(base_path, subfolder)

    # get the name of the meter from the subfolder name, make lowercase
    meter_name = subfolder.lower().replace(" ", "_") 

    # list of csv file paths in subfolder
    # addition with the 'and not' is to make sure to ignore the hidden ._ files
    csv_paths = [os.path.join(folder_path, f) 
                 for f in os.listdir(folder_path) 
                 if f.endswith('.csv')
                 and not f.startswith("._")
                 and not f.startswith(".")]

    # convert each csv to a df, fix columns and add df to df list
    for csv in csv_paths:
        df = pd.read_csv(csv, encoding="utf-8")

        df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
        
        # rename columns if they exist
        if '3_phase_positive_real_energy_used' in df.columns:
            df.rename(columns={
                '3_phase_positive_real_energy_used': 'total_watt_hour',
                '3_phase_real_power':'3_phase_watt_total'
            }, inplace=True)
        
        # reorder the columns
        df = df[['datetime', 'total_watt_hour', '3_phase_watt_total']]

        df.insert(1, 'meter_name', meter_name)

        # save the dataframe with all columns to list
        save_data.append(df.copy())

        df.drop('3_phase_watt_total', axis=1, inplace=True)

        # convert datetime column to a datetime type
        df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S')

        # create column that contains the closest interval for each timestamp (contains ymd hms, using timedelta)
        #df['interval_15min'] = pd.to_datetime(df['datetime'].dt.round('15min'), format='%Y-%m-%d %H:%M:%S')
        df['interval_15min'] = df['datetime'].dt.round('15min')
        
        # create column that contains the offset in seconds from the closest interval for each timestamp
        # - is if its before it and + is if its after
        df['interval_offset'] = (df['datetime'] - df['interval_15min']).dt.total_seconds()

        # create new column with true if an exact interval and false if not
        df['is_exact'] = df['datetime'].eq(df['interval_15min'])
        #df['is_exact'] = (df['datetime'].dt.minute % 15 == 0) & (df['datetime'].dt.second == 0)

        interpolated_rows = []
        df['interpolated'] = False
        # interval = the 15min bucket val, group = all rows in that bucket
        for interval, group in df.groupby('interval_15min'):
            # only select rows in group with is_exact == True
            exact = group[group['is_exact']]

            # if there exists an exact interval
            if not exact.empty:
                #row = exact.iloc[0].copy
                #row['interpolated'] = False
                interpolated_rows.append(exact)
            else:
                before = group[group['interval_offset'] <= 0]
                after = group[group['interval_offset'] >= 0]

                # check if there are empties*****
                if not before.empty and not after.empty:
                    # grab the closest data to the interval
                    time_before = before.iloc[-1]
                    time_after = after.iloc[0]
                    #print("time before:", time_before, "time after: ", time_after, "before", before, "after", after, "============================")
                    
                    # calculate the estimated total_watt_hour
                    # get the slope to 4 decimal places
                    reading_diff = time_after['total_watt_hour'] - time_before['total_watt_hour']
                    time_diff = (time_after['datetime'] - time_before['datetime']).total_seconds()
                    slope = round(reading_diff / time_diff, 4)
                    sec_before_interval = (interval - time_before['datetime']).total_seconds()
                    estimated_twh = time_before['total_watt_hour'] + (slope * sec_before_interval)
                    #print(f"interval: {interval}, slope: {slope}, secbefint: {sec_before_interval}, timebefore: {time_before['total_watt_hour']}, estimatedtwh: {estimated_twh}")
                    new_row = time_before.copy()
                    #new_row['datetime', 'total_watt_hour', 'interval_offset', 'is_exact', 'interpolated'] = [interval, estimated_twh, 0, True, True]
                    new_row['datetime'] = interval
                    new_row['total_watt_hour'] = estimated_twh
                    new_row['interval_offset'] = 0
                    new_row['is_exact'] = True
                    new_row['interpolated'] = True
                    # add new interpolated row to df
                    # add all the ture interpolated rows to the list and then concat the list and hten sort the times?

        print(df.head(20))
        # get average of all data between interval to interval + 14:59 (maybe group it)

        # add average to the line with the interval and remove all the other lines in the group
        # change col name
        #df.rename(columns={'total_watt_hour': 'mean'}, inplace=True)
        

        all_data.append(df)

              datetime        meter_name  total_watt_hour      interval_15min  \
0  2025-07-23 09:40:50  admin_serv_1_mtr          1381508 2025-07-23 09:45:00   
1  2025-07-23 09:41:54  admin_serv_1_mtr          1381508 2025-07-23 09:45:00   
2  2025-07-23 09:42:59  admin_serv_1_mtr          1381509 2025-07-23 09:45:00   
3  2025-07-23 09:44:03  admin_serv_1_mtr          1381510 2025-07-23 09:45:00   
4  2025-07-23 09:45:08  admin_serv_1_mtr          1381510 2025-07-23 09:45:00   
5  2025-07-23 09:46:14  admin_serv_1_mtr          1381511 2025-07-23 09:45:00   
6  2025-07-23 09:47:20  admin_serv_1_mtr          1381511 2025-07-23 09:45:00   
7  2025-07-23 09:48:26  admin_serv_1_mtr          1381512 2025-07-23 09:45:00   
8  2025-07-23 09:48:39  admin_serv_1_mtr          1381512 2025-07-23 09:45:00   
9  2025-07-23 09:49:39  admin_serv_1_mtr          1381512 2025-07-23 09:45:00   
10 2025-07-23 09:50:40  admin_serv_1_mtr          1381513 2025-07-23 09:45:00   
11 2025-07-23 09:51:40  admi

In [None]:
# create with all columns to save for later
save_data_col = pd.concat(save_data, ignore_index=True)

In [None]:
# combine all dataframes in list to one dataframe
combined_data = pd.concat(all_data, ignore_index=True)

In [None]:
# check columns are correct
print(save_data_col.head(10))
#print(save_data[1])
print(combined_data.head(30))
print(combined_data[combined_data['is_exact'] == True])

In [None]:
# convert dataframe with all collumns to csv to save for later
save_data_col.to_csv('save_data_col.csv', index=False)

In [None]:
# convert dataframe to csv
combined_data.to_csv('step1.csv', index=False)

In [None]:
# list of meters and first timestamp in dataset
df = pd.read_csv('.csv', encoding="utf-8")
print(df['meter_name'].unique())
print(df.head(1))