In [1]:
import numpy as np
import pandas as pd
import tqdm
import os

In [2]:
# Set path for data source
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir("/content/gdrive/My Drive/NUS-PhD/Multi-dim autoencoder research/dataset/")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Import dataset

In [3]:
df_power_meter = pd.read_feather('GEPIII/train.feather')
df_weather = pd.read_feather('GEPIII/weather_train.feather')
df_meta = pd.read_feather('GEPIII/building_metadata.feather')

In [4]:
df_meta = df_meta.merge(df_power_meter[['building_id','meter']].drop_duplicates(), on='building_id')
df_meta['merged_id'] = df_meta['building_id'].astype('str') + '_' + df_meta['meter'].astype('str')
df_meta

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count,meter,merged_id
0,0,0,Education,7432,2008.0,,0,0_0
1,0,1,Education,2720,2004.0,,0,1_0
2,0,2,Education,5376,1991.0,,0,2_0
3,0,3,Education,23685,2002.0,,0,3_0
4,0,4,Education,116607,1975.0,,0,4_0
...,...,...,...,...,...,...,...,...
2375,15,1444,Entertainment/public assembly,19619,1914.0,,0,1444_0
2376,15,1445,Education,4298,,,0,1445_0
2377,15,1446,Entertainment/public assembly,11265,1997.0,,0,1446_0
2378,15,1447,Lodging/residential,29775,2001.0,,0,1447_0


In [5]:
bad_meter_readings = pd.read_csv('GEPIII/bad_meter_readings.csv')
bad_meter_readings

Unnamed: 0,is_bad_meter_reading
0,1
1,1
2,1
3,1
4,1
...,...
20216095,0
20216096,0
20216097,1
20216098,0


In [6]:
df_power_meter = df_power_meter.loc[bad_meter_readings['is_bad_meter_reading']==0]

In [7]:
df_power_meter = df_power_meter.pivot_table(index='timestamp', columns=['building_id','meter'], values='meter_reading')
df_power_meter.index = pd.to_datetime(df_power_meter.index)
df_power_meter.columns = df_power_meter.columns.get_level_values(0).astype('str')+'_'+df_power_meter.columns.get_level_values(1).astype('str')
df_power_meter

Unnamed: 0_level_0,0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,7_1,8_0,...,1440_0,1441_0,1442_0,1442_2,1443_0,1444_0,1445_0,1446_0,1447_0,1448_0
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,,,,,,,,,,,...,160.949997,248.524994,54.625000,57.796700,57.724998,5.425,4.800,,160.199997,2.325
2016-01-01 01:00:00,,,,,,,,,,,...,165.250000,259.375000,63.224998,69.938301,63.349998,5.550,4.825,,156.649994,2.225
2016-01-01 02:00:00,,,,,,,,,,,...,161.675003,250.475006,53.674999,50.276699,62.875000,5.900,5.125,,157.574997,2.275
2016-01-01 03:00:00,,,,,,,,,,,...,164.850006,259.625000,58.075001,52.163101,64.250000,5.525,5.225,,154.925003,2.575
2016-01-01 04:00:00,,,,,,,,,,,...,158.750000,247.399994,58.500000,66.411697,64.675003,5.725,5.250,,156.074997,3.075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,232.070999,68.255997,4.9144,107.162003,1150.800049,10.238400,264.970001,550.825989,1688.410034,555.604004,...,165.824997,245.675003,61.500000,63.456299,83.724998,8.525,5.750,,167.800003,3.400
2016-12-31 20:00:00,189.069000,52.420601,5.3240,107.162003,1197.890015,18.770399,265.378998,593.145020,1477.359985,577.445984,...,163.875000,250.125000,55.474998,52.957001,85.175003,8.950,5.850,,168.149994,2.700
2016-12-31 21:00:00,169.957993,53.512699,5.7335,106.820999,1147.380005,9.214600,263.740997,557.992981,1308.520020,540.245972,...,162.399994,247.875000,61.924999,70.205101,83.824997,8.700,5.725,,164.074997,3.050
2016-12-31 22:00:00,169.957993,54.331799,4.9144,104.089996,1130.319946,17.405300,258.826996,551.849976,1013.049988,513.968018,...,154.699997,247.574997,58.349998,63.618000,85.400002,8.950,5.775,,158.800003,3.125


In [8]:
missing_rates = (df_power_meter.fillna(method='ffill').fillna(method='bfill').resample('D').std()==0).sum()/366
missing_rates

0_0       0.382514
1_0       0.382514
2_0       0.382514
3_0       0.382514
4_0       0.382514
            ...   
1444_0    0.139344
1445_0    0.139344
1446_0    0.147541
1447_0    0.139344
1448_0    0.139344
Length: 2376, dtype: float64

In [9]:
df_power_meter_missing = df_power_meter.loc[:, missing_rates[missing_rates>=0.05].index]
df_power_meter_missing

Unnamed: 0_level_0,0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,7_1,8_0,...,1440_0,1441_0,1442_0,1442_2,1443_0,1444_0,1445_0,1446_0,1447_0,1448_0
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,,,,,,,,,,,...,160.949997,248.524994,54.625000,57.796700,57.724998,5.425,4.800,,160.199997,2.325
2016-01-01 01:00:00,,,,,,,,,,,...,165.250000,259.375000,63.224998,69.938301,63.349998,5.550,4.825,,156.649994,2.225
2016-01-01 02:00:00,,,,,,,,,,,...,161.675003,250.475006,53.674999,50.276699,62.875000,5.900,5.125,,157.574997,2.275
2016-01-01 03:00:00,,,,,,,,,,,...,164.850006,259.625000,58.075001,52.163101,64.250000,5.525,5.225,,154.925003,2.575
2016-01-01 04:00:00,,,,,,,,,,,...,158.750000,247.399994,58.500000,66.411697,64.675003,5.725,5.250,,156.074997,3.075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,232.070999,68.255997,4.9144,107.162003,1150.800049,10.238400,264.970001,550.825989,1688.410034,555.604004,...,165.824997,245.675003,61.500000,63.456299,83.724998,8.525,5.750,,167.800003,3.400
2016-12-31 20:00:00,189.069000,52.420601,5.3240,107.162003,1197.890015,18.770399,265.378998,593.145020,1477.359985,577.445984,...,163.875000,250.125000,55.474998,52.957001,85.175003,8.950,5.850,,168.149994,2.700
2016-12-31 21:00:00,169.957993,53.512699,5.7335,106.820999,1147.380005,9.214600,263.740997,557.992981,1308.520020,540.245972,...,162.399994,247.875000,61.924999,70.205101,83.824997,8.700,5.725,,164.074997,3.050
2016-12-31 22:00:00,169.957993,54.331799,4.9144,104.089996,1130.319946,17.405300,258.826996,551.849976,1013.049988,513.968018,...,154.699997,247.574997,58.349998,63.618000,85.400002,8.950,5.775,,158.800003,3.125


In [10]:
df_power_meter = df_power_meter.loc[:, missing_rates[missing_rates<0.05].index]
df_power_meter

Unnamed: 0_level_0,116_0,118_0,121_0,121_3,122_0,123_0,124_0,125_0,126_0,128_0,...,1318_3,1319_3,1320_0,1321_0,1321_3,1322_3,1323_0,1323_3,1324_0,1400_1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,,,,299.729004,,46.400002,9.300000,,46.799999,31.200001,...,169.291000,219.996002,89.814102,342.0,591.427979,1380.939941,177.0,1407.920044,109.0,3.4857
2016-01-01 01:00:00,137.300003,234.399994,449.5,593.002014,267.100006,93.000000,17.900000,151.100006,90.400002,64.000000,...,303.881989,394.898987,89.823997,321.0,496.996002,1495.680054,180.0,1330.880005,108.0,3.4089
2016-01-01 02:00:00,137.500000,234.800003,451.5,600.000000,268.700012,93.800003,18.299999,150.199997,92.000000,61.400002,...,298.700989,388.167999,81.471497,302.0,0.000000,1462.170044,177.0,1503.050049,108.0,0.0000
2016-01-01 03:00:00,137.699997,236.399994,445.0,600.000000,267.700012,92.900002,18.299999,148.199997,91.000000,61.599998,...,308.022003,400.279999,107.120003,310.0,0.000000,1393.040039,167.0,1451.959961,108.0,3.7246
2016-01-01 04:00:00,137.399994,239.800003,464.0,600.000000,267.799988,93.800003,16.600000,149.600006,91.800003,57.599998,...,295.656006,384.209991,70.371101,304.0,0.000000,1496.670044,138.0,1046.949951,109.0,3.7386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,149.399994,321.799988,441.5,600.551025,240.899994,104.500000,12.300000,162.100006,103.000000,80.000000,...,722.163025,938.461975,87.759102,1139.0,1018.250000,2378.909912,176.0,1492.079956,13.0,1.3462
2016-12-31 20:00:00,148.899994,299.000000,447.0,650.942017,238.199997,102.800003,12.200000,160.399994,101.000000,79.300003,...,716.072021,930.547974,88.565598,971.0,917.234985,2590.959961,170.0,1771.050049,12.0,1.6299
2016-12-31 21:00:00,149.600006,286.100006,420.0,600.000000,236.300003,102.800003,12.500000,163.800003,101.800003,78.699997,...,708.320984,920.474976,73.811401,896.0,1335.050049,2702.250000,179.0,1856.020020,13.0,1.8843
2016-12-31 22:00:00,148.199997,280.700012,445.0,648.379028,234.800003,103.300003,12.100000,162.000000,99.800003,69.800003,...,744.077026,966.940979,74.548698,930.0,1418.949951,2588.090088,167.0,1846.670044,12.0,1.5253


In [11]:
for site_id in df_weather['site_id'].unique():
    df_weather.loc[df_weather['site_id']==site_id] = df_weather.loc[df_weather['site_id']==site_id].fillna(method='ffill').fillna(method='bfill')

df_weather = df_weather.merge(df_meta[['site_id','building_id','meter']], on='site_id')

df_temperature = df_weather.pivot_table(index='timestamp', columns=['building_id','meter'], values='air_temperature')
df_temperature.index = pd.to_datetime(df_temperature.index)
df_temperature.columns = df_temperature.columns.get_level_values(0).astype('str')+'_'+df_temperature.columns.get_level_values(1).astype('str')

df_temperature = (df_temperature-df_temperature.mean())/df_temperature.std()
df_temperature = df_temperature.fillna(method='ffill').fillna(method='bfill')
df_temperature = df_temperature.loc[:, df_power_meter.columns]

df_temperature

Unnamed: 0_level_0,116_0,118_0,121_0,121_3,122_0,123_0,124_0,125_0,126_0,128_0,...,1318_3,1319_3,1320_0,1321_0,1321_3,1322_3,1323_0,1323_3,1324_0,1400_1
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,-1.288868,-1.288868,-1.288868,-1.288868,-1.288868,-1.288868,-1.288868,-1.288868,-1.288868,-1.288868,...,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.954904
2016-01-01 01:00:00,-1.305158,-1.305158,-1.305158,-1.305158,-1.305158,-1.305158,-1.305158,-1.305158,-1.305158,-1.305158,...,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.954904
2016-01-01 02:00:00,-1.484351,-1.484351,-1.484351,-1.484351,-1.484351,-1.484351,-1.484351,-1.484351,-1.484351,-1.484351,...,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.954904
2016-01-01 03:00:00,-1.582092,-1.582092,-1.582092,-1.582092,-1.582092,-1.582092,-1.582092,-1.582092,-1.582092,-1.582092,...,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.954904
2016-01-01 04:00:00,-1.533221,-1.533221,-1.533221,-1.533221,-1.533221,-1.533221,-1.533221,-1.533221,-1.533221,-1.533221,...,-0.805483,-0.805483,-0.805483,-0.805483,-0.805483,-0.805483,-0.805483,-0.805483,-0.805483,-0.954904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,-0.588390,-0.588390,-0.588390,-0.588390,-0.588390,-0.588390,-0.588390,-0.588390,-0.588390,-0.588390,...,-0.639707,-0.639707,-0.639707,-0.639707,-0.639707,-0.639707,-0.639707,-0.639707,-0.639707,-0.586131
2016-12-31 20:00:00,-0.735002,-0.735002,-0.735002,-0.735002,-0.735002,-0.735002,-0.735002,-0.735002,-0.735002,-0.735002,...,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.604569
2016-12-31 21:00:00,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,...,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.604569
2016-12-31 22:00:00,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,-0.783872,...,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.746974,-0.659885


In [12]:
df_temperature_missing = df_weather.pivot_table(index='timestamp', columns=['building_id','meter'], values='air_temperature')
df_temperature_missing.index = pd.to_datetime(df_temperature_missing.index)
df_temperature_missing.columns = df_temperature_missing.columns.get_level_values(0).astype('str')+'_'+df_temperature_missing.columns.get_level_values(1).astype('str')

df_temperature_missing = (df_temperature_missing-df_temperature_missing.mean())/df_temperature_missing.std()
df_temperature_missing = df_temperature_missing.fillna(method='ffill').fillna(method='bfill')
df_temperature_missing = df_temperature_missing.loc[:, df_power_meter_missing.columns]

df_temperature_missing

Unnamed: 0_level_0,0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,7_1,8_0,...,1440_0,1441_0,1442_0,1442_2,1443_0,1444_0,1445_0,1446_0,1447_0,1448_0
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,0.358911,0.358911,0.358911,0.358911,0.358911,0.358911,0.358911,0.358911,0.358911,0.358911,...,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904
2016-01-01 01:00:00,0.259571,0.259571,0.259571,0.259571,0.259571,0.259571,0.259571,0.259571,0.259571,0.259571,...,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904
2016-01-01 02:00:00,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,...,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904
2016-01-01 03:00:00,-0.286796,-0.286796,-0.286796,-0.286796,-0.286796,-0.286796,-0.286796,-0.286796,-0.286796,-0.286796,...,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904
2016-01-01 04:00:00,-0.468919,-0.468919,-0.468919,-0.468919,-0.468919,-0.468919,-0.468919,-0.468919,-0.468919,-0.468919,...,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904,-0.954904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,...,-0.586131,-0.586131,-0.586131,-0.586131,-0.586131,-0.586131,-0.586131,-0.586131,-0.586131,-0.586131
2016-12-31 20:00:00,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,...,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569
2016-12-31 21:00:00,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,0.077449,...,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569,-0.604569
2016-12-31 22:00:00,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,-0.005334,...,-0.659885,-0.659885,-0.659885,-0.659885,-0.659885,-0.659885,-0.659885,-0.659885,-0.659885,-0.659885


## Trian/Valid/Test split

In [13]:
df_meta

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count,meter,merged_id
0,0,0,Education,7432,2008.0,,0,0_0
1,0,1,Education,2720,2004.0,,0,1_0
2,0,2,Education,5376,1991.0,,0,2_0
3,0,3,Education,23685,2002.0,,0,3_0
4,0,4,Education,116607,1975.0,,0,4_0
...,...,...,...,...,...,...,...,...
2375,15,1444,Entertainment/public assembly,19619,1914.0,,0,1444_0
2376,15,1445,Education,4298,,,0,1445_0
2377,15,1446,Entertainment/public assembly,11265,1997.0,,0,1446_0
2378,15,1447,Lodging/residential,29775,2001.0,,0,1447_0


In [14]:
df_split = pd.DataFrame(data=df_power_meter.columns, columns=['merged_id']).copy()
df_split = df_split.merge(df_meta, on='merged_id')
#df_split.loc[df_split[df_split['usage'].isna()].sample(frac=0.5, random_state=42).index, 'usage'] = 'valid'
#df_split = df_split.fillna('test')
df_split

Unnamed: 0,merged_id,site_id,building_id,primary_use,square_feet,year_built,floor_count,meter
0,116_0,1,116,Education,37265,,5.0,0
1,118_0,1,118,Education,138316,1960.0,8.0,0
2,121_0,1,121,Education,150318,1906.0,9.0,0
3,121_3,1,121,Education,150318,1906.0,9.0,3
4,122_0,1,122,Education,83043,1991.0,6.0,0
...,...,...,...,...,...,...,...,...
1474,1322_3,14,1322,Entertainment/public assembly,166489,,,3
1475,1323_0,14,1323,Office,87200,,,0
1476,1323_3,14,1323,Office,87200,,,3
1477,1324_0,14,1324,Entertainment/public assembly,84688,,,0


In [15]:
df_split.groupby(['site_id'])[['building_id']].count()

Unnamed: 0_level_0,building_id
site_id,Unnamed: 1_level_1
1,24
2,222
3,260
4,79
5,82
6,44
7,16
9,281
10,24
11,9


In [16]:
df_split.loc[df_split['site_id'].isin([1,3,11,15]), 'cv_group']=0
df_split.loc[df_split['site_id'].isin([2,4]), 'cv_group']=1
df_split.loc[df_split['site_id'].isin([7,9]), 'cv_group']=2
df_split.loc[df_split['site_id'].isin([5,6,14]), 'cv_group']=3
df_split.loc[df_split['site_id'].isin([10,12,13]), 'cv_group']=4

In [17]:
df_split.groupby(['cv_group'])[['building_id']].count()

Unnamed: 0_level_0,building_id
cv_group,Unnamed: 1_level_1
0.0,294
1.0,301
2.0,297
3.0,288
4.0,299


In [18]:
df_split['cv_group'].value_counts().sort_index()/1479

0.0    0.198783
1.0    0.203516
2.0    0.200811
3.0    0.194726
4.0    0.202164
Name: cv_group, dtype: float64

In [19]:
df_split.to_pickle(('GEPIII/df_split.pickle'))

## Output dataset

In [20]:
df_meta.to_pickle('autoencoder_model/df_meta.pickle.gz',compression='gzip')

In [21]:
df_power_meter.to_pickle('autoencoder_model/df_power_meter.pickle.gz',compression='gzip')
df_temperature.to_pickle('autoencoder_model/df_temperature.pickle.gz',compression='gzip')

In [22]:
df_power_meter_missing.to_pickle('autoencoder_model/df_power_meter_missing.pickle.gz',compression='gzip')
df_temperature_missing.to_pickle('autoencoder_model/df_temperature_missing.pickle.gz',compression='gzip')

## Augmentation of data

In [23]:
df_power_meter_flip = df_power_meter.copy()
df_power_meter_flip = -df_power_meter_flip
df_power_meter_flip.columns = df_power_meter_flip.columns + '_flip'
df_power_meter_flip = pd.concat([df_power_meter, df_power_meter_flip],axis=1)
df_power_meter_flip.to_pickle('autoencoder_model/df_power_meter_flip.pickle.gz',compression='gzip')


df_temperature_flip = df_temperature.copy()
df_temperature_flip.columns = df_temperature_flip.columns + '_flip'
df_temperature_flip = pd.concat([df_temperature, df_temperature_flip],axis=1)
df_temperature_flip.to_pickle('autoencoder_model/df_temperature_flip.pickle.gz',compression='gzip')

df_power_meter_flip

Unnamed: 0_level_0,116_0,118_0,121_0,121_3,122_0,123_0,124_0,125_0,126_0,128_0,...,1318_3_flip,1319_3_flip,1320_0_flip,1321_0_flip,1321_3_flip,1322_3_flip,1323_0_flip,1323_3_flip,1324_0_flip,1400_1_flip
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,,,,299.729004,,46.400002,9.300000,,46.799999,31.200001,...,-169.291000,-219.996002,-89.814102,-342.0,-591.427979,-1380.939941,-177.0,-1407.920044,-109.0,-3.4857
2016-01-01 01:00:00,137.300003,234.399994,449.5,593.002014,267.100006,93.000000,17.900000,151.100006,90.400002,64.000000,...,-303.881989,-394.898987,-89.823997,-321.0,-496.996002,-1495.680054,-180.0,-1330.880005,-108.0,-3.4089
2016-01-01 02:00:00,137.500000,234.800003,451.5,600.000000,268.700012,93.800003,18.299999,150.199997,92.000000,61.400002,...,-298.700989,-388.167999,-81.471497,-302.0,-0.000000,-1462.170044,-177.0,-1503.050049,-108.0,-0.0000
2016-01-01 03:00:00,137.699997,236.399994,445.0,600.000000,267.700012,92.900002,18.299999,148.199997,91.000000,61.599998,...,-308.022003,-400.279999,-107.120003,-310.0,-0.000000,-1393.040039,-167.0,-1451.959961,-108.0,-3.7246
2016-01-01 04:00:00,137.399994,239.800003,464.0,600.000000,267.799988,93.800003,16.600000,149.600006,91.800003,57.599998,...,-295.656006,-384.209991,-70.371101,-304.0,-0.000000,-1496.670044,-138.0,-1046.949951,-109.0,-3.7386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,149.399994,321.799988,441.5,600.551025,240.899994,104.500000,12.300000,162.100006,103.000000,80.000000,...,-722.163025,-938.461975,-87.759102,-1139.0,-1018.250000,-2378.909912,-176.0,-1492.079956,-13.0,-1.3462
2016-12-31 20:00:00,148.899994,299.000000,447.0,650.942017,238.199997,102.800003,12.200000,160.399994,101.000000,79.300003,...,-716.072021,-930.547974,-88.565598,-971.0,-917.234985,-2590.959961,-170.0,-1771.050049,-12.0,-1.6299
2016-12-31 21:00:00,149.600006,286.100006,420.0,600.000000,236.300003,102.800003,12.500000,163.800003,101.800003,78.699997,...,-708.320984,-920.474976,-73.811401,-896.0,-1335.050049,-2702.250000,-179.0,-1856.020020,-13.0,-1.8843
2016-12-31 22:00:00,148.199997,280.700012,445.0,648.379028,234.800003,103.300003,12.100000,162.000000,99.800003,69.800003,...,-744.077026,-966.940979,-74.548698,-930.0,-1418.949951,-2588.090088,-167.0,-1846.670044,-12.0,-1.5253


In [24]:
df_power_meter_rolling = df_power_meter.copy()
df_temperature_rolling = df_temperature.copy()

random_state = 1
for col in tqdm.tqdm(df_power_meter.columns):
  np.random.seed(random_state)
  shift = np.random.randint(1,52)*7*24
  df_power_meter_rolling[col+ '_rolling' + str(int(shift))] = pd.concat([df_power_meter[col].iloc[shift:],
                                              df_power_meter[col].iloc[:shift]],
                                              axis=0).values
  df_temperature_rolling[col+ '_rolling' + str(int(shift))] = pd.concat([df_temperature[col].iloc[shift:],
                                              df_temperature[col].iloc[:shift]],
                                              axis=0).values
  random_state = random_state + 1
  

df_power_meter_rolling.to_pickle('autoencoder_model/df_power_meter_rolling.pickle.gz',compression='gzip')

df_temperature_rolling.to_pickle('autoencoder_model/df_temperature_rolling.pickle.gz',compression='gzip')

df_power_meter_rolling

  df_power_meter_rolling[col+ '_rolling' + str(int(shift))] = pd.concat([df_power_meter[col].iloc[shift:],
  df_temperature_rolling[col+ '_rolling' + str(int(shift))] = pd.concat([df_temperature[col].iloc[shift:],
100%|██████████| 1479/1479 [00:04<00:00, 310.19it/s]


Unnamed: 0_level_0,116_0,118_0,121_0,121_3,122_0,123_0,124_0,125_0,126_0,128_0,...,1318_3_rolling1176,1319_3_rolling6216,1320_0_rolling5544,1321_0_rolling2520,1321_3_rolling3864,1322_3_rolling3024,1323_0_rolling2352,1323_3_rolling6888,1324_0_rolling2016,1400_1_rolling4872
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,,,,299.729004,,46.400002,9.300000,,46.799999,31.200001,...,645.155029,0.000000,304.110992,444.0,327.394989,1188.979980,201.0,,8.0,9.412400
2016-01-01 01:00:00,137.300003,234.399994,449.5,593.002014,267.100006,93.000000,17.900000,151.100006,90.400002,64.000000,...,716.007019,0.000000,219.669998,434.0,242.748001,688.833984,185.0,,9.0,7.691100
2016-01-01 02:00:00,137.500000,234.800003,451.5,600.000000,268.700012,93.800003,18.299999,150.199997,92.000000,61.400002,...,714.231018,112.199997,185.259003,322.0,223.222000,889.719971,192.0,,8.0,10.579100
2016-01-01 03:00:00,137.699997,236.399994,445.0,600.000000,267.700012,92.900002,18.299999,148.199997,91.000000,61.599998,...,663.552979,0.000000,199.218994,324.0,221.632004,981.132996,151.0,,9.0,3.747700
2016-01-01 04:00:00,137.399994,239.800003,464.0,600.000000,267.799988,93.800003,16.600000,149.600006,91.800003,57.599998,...,299.880005,150.744003,136.606995,330.0,365.612000,957.616028,128.0,,9.0,3.779600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,149.399994,321.799988,441.5,600.551025,240.899994,104.500000,12.300000,162.100006,103.000000,80.000000,...,860.198975,140.136002,306.556000,687.0,185.296997,1685.530029,234.0,,8.0,27.095900
2016-12-31 20:00:00,148.899994,299.000000,447.0,650.942017,238.199997,102.800003,12.200000,160.399994,101.000000,79.300003,...,916.793030,125.647003,170.050995,631.0,188.582001,1375.489990,240.0,,8.0,38.395500
2016-12-31 21:00:00,149.600006,286.100006,420.0,600.000000,236.300003,102.800003,12.500000,163.800003,101.800003,78.699997,...,911.466980,0.000000,246.643997,538.0,191.789993,1293.319946,225.0,,8.0,25.579599
2016-12-31 22:00:00,148.199997,280.700012,445.0,648.379028,234.800003,103.300003,12.100000,162.000000,99.800003,69.800003,...,911.057007,0.000000,327.902008,519.0,210.136993,1300.619995,225.0,,8.0,20.779301


In [25]:
df_power_meter_rolling_flip = df_power_meter_rolling.copy()
df_power_meter_rolling_flip = -df_power_meter_rolling_flip
df_power_meter_rolling_flip.columns = df_power_meter_rolling_flip.columns + '_flip'
df_power_meter_rolling_flip = pd.concat([df_power_meter_rolling, df_power_meter_rolling_flip],axis=1)
df_power_meter_rolling_flip.to_pickle('autoencoder_model/df_power_meter_rolling_flip.pickle.gz',compression='gzip')

df_temperature_rolling_flip = df_temperature_rolling.copy()
df_temperature_rolling_flip.columns = df_temperature_rolling_flip.columns + '_flip'
df_temperature_rolling_flip = pd.concat([df_temperature_rolling, df_temperature_rolling_flip],axis=1)
df_temperature_rolling_flip.to_pickle('autoencoder_model/df_temperature_rolling_flip.pickle.gz',compression='gzip')

df_power_meter_rolling_flip

Unnamed: 0_level_0,116_0,118_0,121_0,121_3,122_0,123_0,124_0,125_0,126_0,128_0,...,1318_3_rolling1176_flip,1319_3_rolling6216_flip,1320_0_rolling5544_flip,1321_0_rolling2520_flip,1321_3_rolling3864_flip,1322_3_rolling3024_flip,1323_0_rolling2352_flip,1323_3_rolling6888_flip,1324_0_rolling2016_flip,1400_1_rolling4872_flip
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-01 00:00:00,,,,299.729004,,46.400002,9.300000,,46.799999,31.200001,...,-645.155029,-0.000000,-304.110992,-444.0,-327.394989,-1188.979980,-201.0,,-8.0,-9.412400
2016-01-01 01:00:00,137.300003,234.399994,449.5,593.002014,267.100006,93.000000,17.900000,151.100006,90.400002,64.000000,...,-716.007019,-0.000000,-219.669998,-434.0,-242.748001,-688.833984,-185.0,,-9.0,-7.691100
2016-01-01 02:00:00,137.500000,234.800003,451.5,600.000000,268.700012,93.800003,18.299999,150.199997,92.000000,61.400002,...,-714.231018,-112.199997,-185.259003,-322.0,-223.222000,-889.719971,-192.0,,-8.0,-10.579100
2016-01-01 03:00:00,137.699997,236.399994,445.0,600.000000,267.700012,92.900002,18.299999,148.199997,91.000000,61.599998,...,-663.552979,-0.000000,-199.218994,-324.0,-221.632004,-981.132996,-151.0,,-9.0,-3.747700
2016-01-01 04:00:00,137.399994,239.800003,464.0,600.000000,267.799988,93.800003,16.600000,149.600006,91.800003,57.599998,...,-299.880005,-150.744003,-136.606995,-330.0,-365.612000,-957.616028,-128.0,,-9.0,-3.779600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00,149.399994,321.799988,441.5,600.551025,240.899994,104.500000,12.300000,162.100006,103.000000,80.000000,...,-860.198975,-140.136002,-306.556000,-687.0,-185.296997,-1685.530029,-234.0,,-8.0,-27.095900
2016-12-31 20:00:00,148.899994,299.000000,447.0,650.942017,238.199997,102.800003,12.200000,160.399994,101.000000,79.300003,...,-916.793030,-125.647003,-170.050995,-631.0,-188.582001,-1375.489990,-240.0,,-8.0,-38.395500
2016-12-31 21:00:00,149.600006,286.100006,420.0,600.000000,236.300003,102.800003,12.500000,163.800003,101.800003,78.699997,...,-911.466980,-0.000000,-246.643997,-538.0,-191.789993,-1293.319946,-225.0,,-8.0,-25.579599
2016-12-31 22:00:00,148.199997,280.700012,445.0,648.379028,234.800003,103.300003,12.100000,162.000000,99.800003,69.800003,...,-911.057007,-0.000000,-327.902008,-519.0,-210.136993,-1300.619995,-225.0,,-8.0,-20.779301
