In [1]:
# Built-in libraries
from datetime import datetime, timedelta
import math

# NumPy, SciPy and Pandas
import pandas as pd
import numpy as np

# Scikit-Learn
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model


In [2]:
"""
Function that calculates a load profile curve for each building (column) in a dataframe, based on the specified function.
Currently the following functions are supported:
- Average ('average')
- Median ('median')

And the currently supported resolution is:
- Daily ('day')

The datasetName, context and function parameters are combined into the file name of the resulting csv, giving it a more descriptive title.
"""
def _stack_daily_readings(readings):
    """Arrange one building's readings as a (days x readings-per-day) frame.

    Readings are grouped by calendar day and aligned by their *position*
    within the day (first reading of the day -> column 0, and so on), not by
    clock time. Days without a single non-NaN reading are dropped and any
    remaining NaN is replaced by 0. Returns an empty frame when the building
    has no usable day at all.
    """
    by_day = readings.groupby(readings.index.normalize())
    daily = [day.reset_index(drop=True) for _, day in by_day]
    if not daily:
        return pd.DataFrame()
    # Unique keys avoid a frame whose columns all carry the building's name.
    stacked = pd.concat(daily, axis=1, keys=list(range(len(daily))))
    stacked = stacked.dropna(axis=1, how='all').T
    return stacked.fillna(value=0)


def doAggregation(datasetName, context, function, resolution='day'):
    """Compute one representative load curve per building and save them as CSV.

    Reads '../data/processed/{datasetName}_{context}_dataset.csv' (first
    column parsed as the timestamp index, one column per building), splits
    every building's readings into days, and collapses those days into a
    single curve with the requested function.

    Parameters
    ----------
    datasetName : str
        Dataset prefix used in the input/output file names. The value 'DGS'
        additionally drops buildings whose curve has more than 4 missing
        (zero) readings.
    context : str
        Second component of the input/output file names.
    function : str
        'average' or 'median'.
    resolution : str, default 'day'
        Only 'day' is supported.

    Returns
    -------
    pandas.DataFrame
        One row per retained building, one column per reading position
        within the day. The same frame is written to
        '../data/processed/{datasetName}_{context}_{function}_dataset.csv'.

    Raises
    ------
    ValueError
        If `function` or `resolution` is not supported, or if the index of
        the input file could not be parsed as timestamps.
    """
    aggregators = {'average': np.mean, 'median': np.median}

    # Validate before touching the disk. `exit()` is not a safe way to abort
    # inside a notebook kernel, so invalid arguments raise instead.
    if resolution != 'day':
        raise ValueError(
            "Please choose a valid resolution: expected 'day', got {!r}".format(resolution))
    if function not in aggregators:
        raise ValueError(
            "Please choose a valid function: expected 'average' or 'median', got {!r}".format(function))
    aggregate = aggregators[function]

    # `infer_datetime_format` is deprecated (and a no-op) in pandas 2.x.
    dataframe = pd.read_csv('../data/processed/{}_{}_dataset.csv'.format(datasetName, context),
                            parse_dates=True, index_col=0)
    if not isinstance(dataframe.index, pd.DatetimeIndex):
        raise ValueError("The first column of the dataset could not be parsed as timestamps")

    # iterate through all buildings (columns); positional access keeps this
    # safe even if two columns share a name
    curves = []
    for position, building in enumerate(dataframe.columns):
        profiles = _stack_daily_readings(dataframe.iloc[:, position])
        if profiles.empty:
            # No usable reading at all: such a building would be dropped by
            # the all-NaN filter below anyway.
            continue
        curves.append(pd.Series(aggregate(profiles.to_numpy(), axis=0), name=building))

    # one column per building, one row per reading position
    df_load_curves = pd.concat(curves, axis=1) if curves else pd.DataFrame()

    # Treat zeros as missing while filtering buildings...
    df_load_curves = df_load_curves.replace(0.0, np.nan)
    # ...drop buildings (columns) whose whole curve is missing
    df_load_curves = df_load_curves.dropna(axis=1, how='all')

    # particular to the DGS dataset
    if datasetName == 'DGS':
        # drop buildings with more than 4 missing readings (seems to be a sweet spot)
        df_load_curves = df_load_curves.dropna(thresh=len(df_load_curves) - 4, axis=1)

    # ...then turn the remaining gaps back into zeros and put buildings on rows
    df_load_curves = df_load_curves.fillna(value=0).T

    # save the file and return the dataframe
    df_load_curves.to_csv("../data/processed/{}_{}_{}_dataset.csv".format(datasetName, context, function))
    return df_load_curves

In [3]:
# BDG / weekday: one load-curve table per aggregation function
df_BDG_weekday_average = doAggregation(datasetName='BDG', context='weekday', function='average')
df_BDG_weekday_median = doAggregation(datasetName='BDG', context='weekday', function='median')


In [4]:
# BDG / weekend: one load-curve table per aggregation function
df_BDG_weekend_average = doAggregation(datasetName='BDG', context='weekend', function='average')
df_BDG_weekend_median = doAggregation(datasetName='BDG', context='weekend', function='median')


In [5]:
# BDG / fullweek: one load-curve table per aggregation function
df_BDG_fullweek_average = doAggregation(datasetName='BDG', context='fullweek', function='average')
df_BDG_fullweek_median = doAggregation(datasetName='BDG', context='fullweek', function='median')


In [6]:
# DGS / weekday: one load-curve table per aggregation function
df_DGS_weekday_average = doAggregation(datasetName='DGS', context='weekday', function='average')
df_DGS_weekday_median = doAggregation(datasetName='DGS', context='weekday', function='median')


In [7]:
# DGS / weekend: one load-curve table per aggregation function
df_DGS_weekend_average = doAggregation(datasetName='DGS', context='weekend', function='average')
df_DGS_weekend_median = doAggregation(datasetName='DGS', context='weekend', function='median')


In [8]:
# DGS / fullweek: one load-curve table per aggregation function
df_DGS_fullweek_average = doAggregation(datasetName='DGS', context='fullweek', function='average')
df_DGS_fullweek_median = doAggregation(datasetName='DGS', context='fullweek', function='median')
