In [1]:
# Built-in libraries
from datetime import datetime, timedelta
import math

# NumPy, SciPy and Pandas
import pandas as pd
import numpy as np


In [2]:
"""
Constants for time period with maximum number of buildings measured simultaneously in the BDG dataset.
For more details, go to old_files/RawFeatures_BDG.ipynb
"""
# Inclusive bounds of the BDG analysis window, as naive datetimes:
# 1 January 2015 at 00:00 through 30 November 2015 at 23:00.
BDG_STARTDATE = datetime(year=2015, month=1, day=1, hour=0, minute=0)
BDG_ENDDATE = datetime(year=2015, month=11, day=30, hour=23, minute=0)


In [3]:
"""
Function to extract a given context from a dataframe. The resulting context is saved in a csv file.
Current contexts (pass the lowercase string as the `context` argument):
- weekday
- weekend
- fullweek
"""
def getContext(datasetName, context, dataDir='../data/processed', maxMissing=7):
    """
    Extract a given context from a processed dataset and save it as a csv file.

    Reads '<dataDir>/<datasetName>_dataset.csv' (first column is used as the
    index and parsed as dates), keeps only the rows belonging to the requested
    context, removes empty rows/columns, writes the result to
    '<dataDir>/<datasetName>_<context>_dataset.csv' and returns it.

    Parameters:
    - datasetName: prefix of the csv file to load. When it is 'BDG', the data is
      first truncated to the period [BDG_STARTDATE, BDG_ENDDATE] (inclusive).
    - context: one of 'weekday' (Monday-Friday), 'weekend' (Saturday-Sunday)
      or 'fullweek' (all days). Matching is case-sensitive.
    - dataDir: folder holding the input file and receiving the output file.
    - maxMissing: a column is dropped when it has more than this many NaN values
      in the selected context.

    Returns the filtered dataframe.

    Raises ValueError when `context` is not a valid context; nothing is read or
    written in that case.
    """
    validContexts = ('weekday', 'weekend', 'fullweek')
    # fail fast with an exception instead of exit(), which would shut down a notebook kernel
    if context not in validContexts:
        raise ValueError("Please choose a valid context ({}), got {!r}".format(
            ", ".join(validContexts), context))

    # infer_datetime_format is deprecated in pandas 2.x (strict inference is the default), so it is omitted
    dataframe = pd.read_csv('{}/{}_dataset.csv'.format(dataDir, datasetName),
                            parse_dates=True, index_col=0)

    # truncate the dataframe based on a pre-calculated time period, if needed.
    # Other datasets keep their full range: bounding them by their first/last index
    # labels is a no-op for a sorted index and silently drops rows for an unsorted one.
    if datasetName == 'BDG':
        inPeriod = (dataframe.index >= BDG_STARTDATE) & (dataframe.index <= BDG_ENDDATE)
        dataframe_truncated = dataframe[inPeriod]
    else:
        dataframe_truncated = dataframe

    # select rows based on context (weekday numbers: Monday=0 ... Saturday=5, Sunday=6)
    if context == "fullweek":
        df_context = dataframe_truncated.copy()
    else:
        isWeekend = dataframe_truncated.index.weekday >= 5
        df_context = dataframe_truncated[isWeekend if context == "weekend" else ~isWeekend]

    # delete the dates where every value is 0 (NaN counts as non-zero here, so such rows are kept)
    df_context = df_context[(df_context != 0).any(axis=1)]
    # drop columns with all nan values
    df_context = df_context.dropna(axis=1, how='all')
    # drop columns with more than maxMissing nan values (7 seems to be a sweet spot);
    # with maxMissing rows or fewer the threshold is 0 and no further column is dropped
    df_context = df_context.dropna(thresh=max(len(df_context) - maxMissing, 0), axis=1)

    # save the file and return the dataframe
    df_context.to_csv("{}/{}_{}_dataset.csv".format(dataDir, datasetName, context))
    return df_context


In [4]:
# Build (and save to csv) the three contexts of the BDG dataset,
# in the order weekday -> weekend -> fullweek.
df_BDG_weekday, df_BDG_weekend, df_BDG_fullweek = [
    getContext('BDG', contextName) for contextName in ('weekday', 'weekend', 'fullweek')
]

# Same three contexts for the dataset stored under the 'DGS' file prefix
# (kept in the df_DC_* variables).
df_DC_weekday, df_DC_weekend, df_DC_fullweek = [
    getContext('DGS', contextName) for contextName in ('weekday', 'weekend', 'fullweek')
]
