In [1]:
# Built-in libraries
from datetime import datetime, timedelta

# NumPy, SciPy and Pandas
import pandas as pd
import numpy as np


In [2]:
# Bounds of the time period with the maximum number of buildings measured
# simultaneously in the BDG dataset.
# For more details, go to old_files/RawFeatures_BDG.ipynb
BDG_STARTDATE = datetime(year=2015, month=1, day=1, hour=0, minute=0)      # 01/01/15 00:00
BDG_ENDDATE = datetime(year=2015, month=11, day=30, hour=23, minute=0)     # 30/11/15 23:00


In [5]:
def _load_bdg():
    """Read the raw Building Data Genome csv, truncated to the BDG time window."""
    # NOTE(review): `infer_datetime_format` may be deprecated in recent pandas
    # releases -- confirm against the pandas version this notebook is pinned to.
    df = pd.read_csv('../data/raw/temp_open_utc_complete.csv', parse_dates=True,
                     infer_datetime_format=True, index_col=0)
    # truncate the dataframe based on a pre-calculated time period
    in_period = (df.index >= BDG_STARTDATE) & (df.index <= BDG_ENDDATE)
    return df[in_period]


def _load_dgs():
    """Read the raw Washington D.C. (DGS) 15-minute csv and sum it into 1-hour bins."""
    # NOTE(review): nested-list `parse_dates` (column merging) and
    # `infer_datetime_format` may be deprecated in recent pandas releases --
    # confirm against the pandas version this notebook is pinned to.
    df = pd.read_csv('../data/raw/DGS_322_Buildings-15m-By_Building-DST-gap-filled-3-2-18-508pm.csv',
                     parse_dates=[['Building ID', 'Unnamed: 1']], infer_datetime_format=True)
    # get rid of temperature column
    del df['Unnamed: 2']

    # update column names to match the row of building names
    df.columns = df.iloc[0, :]

    # get rid of rows with metadata and use the merged date/time column as index
    df = df.drop([0, 1, 2], axis=0)
    df = df.rename(columns={'Building nan': 'timestamp'})
    df.index = df['timestamp'].astype('datetime64[ns]')
    del df['timestamp']
    df = df.astype(float)

    # since the dataset is made from 15min interval readings, resample to 1 hr
    return df.resample('1H').sum()


def hourly_dataset(name):
    """Load one of the supported raw datasets and save a processed copy as csv.

    Parameters
    ----------
    name : str
        Dataset identifier: 'BDG' (Building Data Genome) or 'DGS'
        (Washington D.C.).

    Returns
    -------
    pandas.DataFrame
        The loaded dataframe. For 'BDG' it is restricted to rows whose index
        lies within [BDG_STARTDATE, BDG_ENDDATE]; for 'DGS' the 15-minute
        readings are summed into 1-hour bins.

    Raises
    ------
    ValueError
        If `name` is not one of the supported datasets. (Previously the
        function printed a message and called `exit()`, which inside a
        notebook kernel does not reliably stop execution and is not a
        catchable error for callers.)

    Side effects
    ------------
    Writes the result to '../data/processed/<name>_dataset.csv'.
    """
    # Building Data Genome dataset
    if name == 'BDG':
        df = _load_bdg()
    # Washington D.C. dataset
    elif name == 'DGS':
        df = _load_dgs()
    else:
        raise ValueError(
            "Please choose a valid dataset ('BDG' or 'DGS'), got {!r}".format(name))

    # save the file to csv before returning
    df.to_csv('../data/processed/{}_dataset.csv'.format(name))

    return df


In [None]:
# Building Data Genome dataset (BDG): load it and write the processed csv
df_BDG = hourly_dataset(name='BDG')

# Washington D.C. buildings dataset (DGS): load it and write the processed csv
df_DGS = hourly_dataset(name='DGS')
