In [95]:
import pandas as pd
from pathlib import Path
import datetime
import numpy as np
import glob
import pickle

## Starting with one building file
Starting with one of the building data from 69 facilities at Arizona State University, Tempe Campus to explore, analyse and scale up.
'Armstrong' is the name of the building with gross floor area of 105,232 Sq.ft

In [3]:
xls = pd.read_html('AllBuildingsEnergyData/armstrong.xls')
xls[0]

Unnamed: 0,0,1,2
0,EIS Charting Data Dump: Timeframe = Last Year ...,EIS Charting Data Dump: Timeframe = Last Year ...,EIS Charting Data Dump: Timeframe = Last Year ...
1,,Armstrong - kW Avg,
2,Tue Jan 1 00:00:00 GMT-0700 2019,61.44999980926514,0
3,Tue Jan 1 01:00:00 GMT-0700 2019,60.70832443237305,0
4,Tue Jan 1 02:00:00 GMT-0700 2019,64.32917594909668,0
...,...,...,...
8757,Tue Dec 31 19:00:00 GMT-0700 2019,55.76457500457764,0
8758,Tue Dec 31 20:00:00 GMT-0700 2019,54.13544940948486,0
8759,Tue Dec 31 21:00:00 GMT-0700 2019,53.32290077209473,0
8760,Tue Dec 31 22:00:00 GMT-0700 2019,52.19582557678223,0


### Extracting the name of the building

In [4]:
name = xls[0].loc[1,1].split('-')[0].strip()
name

'Armstrong'

### Extracting the Energy demand values

In [5]:
energy = xls[0][1].drop(index = [0,1]).reset_index(drop = True)
energy = energy.to_frame(name = 'Energy')

### Loading Buildings database
This buildings database contains the details of eac  building. The useful variables for our study includes:
1. Gross Flooe Area (GSF)
2. Building Name (To map details with the energy file)
3. Building Type (Type)
    a. Academic
    b. Support
    c. Housing
    d. Library etc

In [34]:
Buildings = pd.read_csv('BuildingDatabase.csv')
Buildings

Unnamed: 0,Building Name,Building \nNumber,Status,Type,Campus,Abbreviation,Address,GSF
0,GRADY GAMMAGE MEMORIAL AUDITORIUM,1,atleast one project,Academic,Tempe,GGMA,1200 S. FOREST AVENUE,150097.0
1,INTERDISCIPLINARY A,2,atleast one project,Academic,Tempe,INTDSA,1100 S. CADY MALL,28332.0
2,INTERDISCIPLINARY B,3,atleast one project,Academic,Tempe,INTDSB,1120 S. CADY MALL,62941.0
3,SCHOOL OF HUMAN EVOLUTION AND SOCIAL CHANGE,4,atleast one project,Academic,Tempe,SHESC,900 S. CADY MALL,49078.0
4,SOCIAL SCIENCES BLDG.,5,No projects so far,Academic,Tempe,SS,951 S. CADY MALL,87673.0
...,...,...,...,...,...,...,...,...
456,Physical Education Building West,PHW,atleast one project,,Polytechnic,PEBW,,
457,ALAMEDA BUILDING,ALA,atleast one project,,Tempe,ALA,734 West Alameda Dr,
458,CENTERPOINT,CNT,atleast one project,Academic,Tempe,CNT,660 S. Mill Ave.,
459,COMMUNITY SERVICES,CSB,atleast one project,Services,Tempe,CSB,"200 E Curry Rd, Tempe, AZ 85281",109322.0


### Extracting the building gross floor area and building type using the buidling name from Energy data file

In [7]:
gsf = Buildings.GSF[Buildings['Building Name'].str.contains(f'{name}',regex = False, case = False)].reset_index(drop = True)
Btype = Buildings.Type[Buildings['Building Name'].str.contains(f'{name}',regex = False, case = False)].reset_index(drop = True)
print(f'Building GSF:{gsf[0]}\nBuilding type:{Btype[0]}')

Building GSF:105232.0
Building type:Academic


Creating an empty dataframe and copying the GSF, Btype values to join it with Energy data Dataframe

In [8]:
df = pd.DataFrame(np.zeros((energy.shape[0],2)),columns = ['GSF','Btype'])
df['GSF'] = gsf[0]
df['Btype'] = Btype[0]
df

Unnamed: 0,GSF,Btype
0,105232.0,Academic
1,105232.0,Academic
2,105232.0,Academic
3,105232.0,Academic
4,105232.0,Academic
...,...,...
8755,105232.0,Academic
8756,105232.0,Academic
8757,105232.0,Academic
8758,105232.0,Academic


# Gathering all other data
Information like outside tempearure and Smester schedule is required in order to predict the energy demand more accurately. Temperature data is obtained from NOAA database and Semester information is obtained from Arizona State University's academic calendar

### Loading temperature Data

In [9]:
TempFile = pd.ExcelFile('CompleteTemperatureProfile.xlsx')
TempDF = TempFile.parse(0)
TempDF

Unnamed: 0,STATION,NAME,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,...,WDF2,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT07,WT08,WT10
0,USW00023183,"PHOENIX AIRPORT, AZ US",2018-01-01,3.36,,0.0,0.0,0.0,56.0,73,...,100.0,340.0,10.1,13.0,,,,,,
1,USW00023183,"PHOENIX AIRPORT, AZ US",2018-01-02,4.47,,0.0,0.0,0.0,60.0,75,...,50.0,40.0,10.1,13.0,,,,,,
2,USW00023183,"PHOENIX AIRPORT, AZ US",2018-01-03,5.14,,0.0,0.0,0.0,61.0,76,...,70.0,60.0,17.0,23.0,,,,,,
3,USW00023183,"PHOENIX AIRPORT, AZ US",2018-01-04,3.58,,0.0,0.0,0.0,66.0,79,...,90.0,60.0,12.1,19.0,,,,,,
4,USW00023183,"PHOENIX AIRPORT, AZ US",2018-01-05,4.25,,0.0,0.0,0.0,63.0,77,...,130.0,130.0,10.1,12.1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921,USW00023183,"PHOENIX AIRPORT, AZ US",2020-07-10,11.41,,0.0,0.0,0.0,101.0,111,...,280.0,280.0,23.0,28.0,,,,,,
922,USW00023183,"PHOENIX AIRPORT, AZ US",2020-07-11,10.07,,0.0,0.0,0.0,103.0,115,...,240.0,280.0,21.0,29.1,,,,,,
923,USW00023183,"PHOENIX AIRPORT, AZ US",2020-07-12,7.16,,0.0,0.0,,104.0,116,...,180.0,160.0,17.0,23.9,,,,,,
924,USW00023183,"PHOENIX AIRPORT, AZ US",2020-07-13,7.61,,0.0,0.0,0.0,105.0,114,...,270.0,240.0,17.0,23.0,,,,,,


In [10]:
TempDF.describe()

Unnamed: 0,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT07,WT08,WT10
count,925.0,34.0,926.0,867.0,817.0,920.0,926.0,926.0,925.0,924.0,925.0,924.0,51.0,1.0,52.0,14.0,39.0,1.0
mean,6.517838,1525.176471,0.02027,0.0,0.0,75.375,87.049676,63.739741,209.924324,220.898268,17.232973,22.418723,1.0,1.0,1.0,1.0,1.0,1.0
std,2.322185,510.836715,0.131509,0.0,0.0,15.850829,16.341217,15.198374,89.866993,96.442431,6.054289,7.677499,0.0,,0.0,0.0,0.0,
min,2.01,2.0,0.0,0.0,0.0,41.0,47.0,30.0,10.0,10.0,6.9,8.9,1.0,1.0,1.0,1.0,1.0,1.0
25%,4.7,1437.75,0.0,0.0,0.0,61.75,73.0,51.0,120.0,150.0,13.0,17.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,6.26,1644.0,0.0,0.0,0.0,74.0,86.0,62.0,260.0,260.0,16.1,21.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,7.83,1738.75,0.0,0.0,0.0,91.0,103.0,77.0,280.0,290.0,21.0,25.9,1.0,1.0,1.0,1.0,1.0,1.0
max,19.01,2328.0,2.36,0.0,0.0,105.0,116.0,94.0,360.0,360.0,52.1,70.9,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
TempDF['Year'] = TempDF['DATE'].apply(lambda x: x.year)

In [13]:
Temp = TempDF.loc[TempDF['Year'] == 2019,['DATE','TAVG']].reset_index(drop=True)
Temp['DATE'] = Temp['DATE'].astype(str)
Temp

Unnamed: 0,DATE,TAVG
0,2019-01-01,43.0
1,2019-01-02,41.0
2,2019-01-03,43.0
3,2019-01-04,48.0
4,2019-01-05,54.0
...,...,...
360,2019-12-27,49.0
361,2019-12-28,49.0
362,2019-12-29,44.0
363,2019-12-30,47.0


### Loading Semester Schedule

In [14]:
Sem = pd.read_csv('Semester_Schedule.csv')
Sem.rename(columns = {'Semester/Holiday': 'Sem'},inplace = True)
Sem

Unnamed: 0,Date,Sem,Details
0,1/1/19,0,Break
1,1/2/19,0,Break
2,1/3/19,0,Break
3,1/4/19,0,Break
4,1/5/19,0,Break
...,...,...,...
360,12/27/19,0,Break
361,12/28/19,0,Break
362,12/29/19,0,Break
363,12/30/19,0,Break


### Gathering Temperature and Semester daily values

In [15]:
DailyDF = Temp.join(Sem.drop(columns = ['Date','Details']))
DailyDF

Unnamed: 0,DATE,TAVG,Sem
0,2019-01-01,43.0,0
1,2019-01-02,41.0,0
2,2019-01-03,43.0,0
3,2019-01-04,48.0,0
4,2019-01-05,54.0,0
...,...,...,...
360,2019-12-27,49.0,0
361,2019-12-28,49.0,0
362,2019-12-29,44.0,0
363,2019-12-30,47.0,0


### Creating the Hourly Interval Dataframe along with weekday

The Temperature profile and Semester Schedule data is available on a daily basis. Therefore, this hourly interval dataframe is used to assign daily averages to hourly intervals

In [16]:
HourInterval = pd.date_range('2019-01-01',periods = 8760,freq='H').to_frame(name = 'DateTime').reset_index(drop =True)
HourInterval['WeekDay'] = HourInterval['DateTime'].apply(lambda x:x.weekday())
HourInterval

Unnamed: 0,DateTime,WeekDay
0,2019-01-01 00:00:00,1
1,2019-01-01 01:00:00,1
2,2019-01-01 02:00:00,1
3,2019-01-01 03:00:00,1
4,2019-01-01 04:00:00,1
...,...,...
8755,2019-12-31 19:00:00,1
8756,2019-12-31 20:00:00,1
8757,2019-12-31 21:00:00,1
8758,2019-12-31 22:00:00,1


In [17]:
HourInterval['WeekDay'] = HourInterval['WeekDay'].map({0:1,1:1,2:1,3:1,4:1,5:0,6:0})
HourInterval

Unnamed: 0,DateTime,WeekDay
0,2019-01-01 00:00:00,1
1,2019-01-01 01:00:00,1
2,2019-01-01 02:00:00,1
3,2019-01-01 03:00:00,1
4,2019-01-01 04:00:00,1
...,...,...
8755,2019-12-31 19:00:00,1
8756,2019-12-31 20:00:00,1
8757,2019-12-31 21:00:00,1
8758,2019-12-31 22:00:00,1


In [18]:
HourInterval['Date'] = HourInterval['DateTime'].astype(str).apply(lambda x: x.split(' ')[0])

In [19]:
dailyDF = HourInterval.merge(DailyDF,left_on = 'Date',right_on = 'DATE')

In [20]:
dailyDF.drop(columns = ['DateTime','Date','DATE'],inplace = True)
dailyDF

Unnamed: 0,WeekDay,TAVG,Sem
0,1,43.0,0
1,1,43.0,0
2,1,43.0,0
3,1,43.0,0
4,1,43.0,0
...,...,...,...
8755,1,54.0,0
8756,1,54.0,0
8757,1,54.0,0
8758,1,54.0,0


Checking by joining the energy dataframe with daily dataframe

In [21]:
df = df.join([energy,dailyDF])
df

Unnamed: 0,GSF,Btype,Energy,WeekDay,TAVG,Sem
0,105232.0,Academic,61.44999980926514,1,43.0,0
1,105232.0,Academic,60.70832443237305,1,43.0,0
2,105232.0,Academic,64.32917594909668,1,43.0,0
3,105232.0,Academic,61.05000019073486,1,43.0,0
4,105232.0,Academic,62.6583251953125,1,43.0,0
...,...,...,...,...,...,...
8755,105232.0,Academic,55.76457500457764,1,54.0,0
8756,105232.0,Academic,54.13544940948486,1,54.0,0
8757,105232.0,Academic,53.32290077209473,1,54.0,0
8758,105232.0,Academic,52.19582557678223,1,54.0,0


# Scaling the code for all 69 facilities

In [22]:
def extractxls (x, Buildings, dailyDF):
    xls = pd.read_html(x)
    name = xls[0].loc[1,1].split('-')[0].strip()
    energy = xls[0][1].drop(index = [0,1]).reset_index(drop = True)
    energy = energy.to_frame(name = 'Energy')
    gsf = Buildings.GSF[Buildings['Building Name'].str.contains(f'{name}',regex = False, case = False)].reset_index(drop = True)
    Btype = Buildings.Type[Buildings['Building Name'].str.contains(f'{name}',regex = False, case = False)].reset_index(drop = True)
    df = pd.DataFrame(np.zeros((energy.shape[0],2)),columns = ['GSF','Btype'])
    print(name)
    df.loc[:,'GSF'] = gsf[0]
    df.loc[:,'Btype'] = Btype[0] 
    return df.join([energy,dailyDF])

In [93]:
Masterdf = pd.DataFrame()
for xlsheet in glob.glob('Training/*.xls'):
    print(xlsheet)
    Masterdf = Masterdf.append(extractxls(xlsheet,Buildings,dailyDF)).reset_index(drop =True)

Training/Piper writing center.xls
Piper Writing Center
Training/Cowden family.xls
Cowden Family Resources
Training/Hayden hall.xls
Hayden Hall
Training/GLV community center.xls
GLV Community Center
Training/Gammage.xls
Gammage Aud
Training/Family studies.xls
Family Studies
Training/Social Sciences.xls
Social Sciences
Training/Moeur.xls
Moeur
Training/Mcclintock hall.xls
McClintock Hall
Training/Fulton parking.xls
Fulton Parking
Training/Schwada COB.xls
Schwada COB
Training/Student Pavilion.xls
Student Pavilion
Training/Music.xls
Music
Training/Dixie gammage.xls
Dixie Gammage Hall
Training/life scinces ABD.xls
Lifescience A_B_D
Training/PV West.xls
PV West
Training/Engineering Res Center.xls
Engineering Research Ctr
Training/Memorial Union.xls
MU
Training/Murdock hall.xls
Murdock Hall
Training/Matthews center.xls
Matthews Center
Training/Matthews hall.xls
Matthews Hall
Training/Old main.xls
Old Main
Training/Uni cen A wing.xls
University Center A Wing
Training/Bus Admin C.xls
Bus Admin 

Saving the dataframe object to a pickle file for further exploration and prediction

In [97]:
a_file = open("DF.pkl", "wb")
pickle.dump(Masterdf, a_file)
a_file.close()