In [8]:
# load and clean-up data
import numpy as np
import pandas as pd
from keras.models import Model
from numpy import array
import matplotlib.pyplot as plt
 
# read the raw ';'-separated file; source columns 0 and 1 are parsed together
# into a single 'datetime' column, which is then used as the index
dataset = pd.read_csv(
    '../input/household_power_consumption.txt',
    sep=';',
    header=0,
    low_memory=False,
    infer_datetime_format=True,
    parse_dates={'datetime': [0, 1]},
    index_col=['datetime'],
)
print(dataset.shape)
dataset.head(10)

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
2006-12-16 17:29:00,3.52,0.522,235.02,15.0,0.0,2.0,17.0
2006-12-16 17:30:00,3.702,0.52,235.09,15.8,0.0,1.0,17.0
2006-12-16 17:31:00,3.7,0.52,235.22,15.8,0.0,1.0,17.0
2006-12-16 17:32:00,3.668,0.51,233.99,15.8,0.0,1.0,17.0
2006-12-16 17:33:00,3.662,0.51,233.86,15.8,0.0,2.0,16.0


In [9]:
# fill missing values with a value at the same time one day ago
def fill_missing_data(values, one_day=60 * 24):
    """Fill NaN entries of ``values`` in place with the entry ``one_day`` rows earlier.

    Parameters
    ----------
    values : numpy.ndarray
        2-D float array, modified in place. The default period assumes one
        row per minute (60 * 24 rows per day) -- confirm for other data.
    one_day : int, optional
        Number of rows that make up one day; must be positive.
        Defaults to ``60 * 24``.

    Returns
    -------
    None
        The array is updated in place.

    Notes
    -----
    Missing cells are visited in ascending row order, so a gap longer than one
    day is filled from rows that were themselves filled earlier in the pass.

    A NaN within the first ``one_day`` rows has no earlier observation and is
    left as NaN. Indexing with ``row - one_day`` there would be negative and
    would silently wrap around to copy a value from the end of the array.
    """
    # np.argwhere yields (row, col) pairs in row-major order, i.e. the same
    # order as a nested row/col loop, but only for cells that are missing.
    for row, col in np.argwhere(np.isnan(values)):
        if row >= one_day:
            values[row, col] = values[row - one_day, col]

In [10]:
# mark missing readings ('?') as NaN and make the dataset numeric
dataset = dataset.replace('?', np.nan).astype('float32')
# fill missing values on an explicit writable copy: DataFrame.values is not
# guaranteed to be a writable view of the frame, so filling it directly can
# either be lost or fail, depending on the pandas version and settings
values = dataset.values.copy()
fill_missing_data(values)
dataset = pd.DataFrame(values, index=dataset.index, columns=dataset.columns)
# add a column for the remainder of sub metering:
# column 0 is Global_active_power, columns 4-6 are Sub_metering_1..3
# (the * 1000 / 60 factor presumably converts kW to Wh per minute -- confirm
# against the dataset description)
dataset['sub_metering_4'] = (values[:,0] * 1000 / 60) - (values[:,4] + values[:,5] + values[:,6])
# save updated dataset
dataset.to_csv('household_power_consumption.csv')
dataset.head(10)

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,sub_metering_4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-16 17:24:00,4.216,0.418,234.839996,18.4,0.0,1.0,17.0,52.26667
2006-12-16 17:25:00,5.36,0.436,233.630005,23.0,0.0,1.0,16.0,72.333336
2006-12-16 17:26:00,5.374,0.498,233.289993,23.0,0.0,2.0,17.0,70.566666
2006-12-16 17:27:00,5.388,0.502,233.740005,23.0,0.0,1.0,17.0,71.800003
2006-12-16 17:28:00,3.666,0.528,235.679993,15.8,0.0,1.0,17.0,43.099998
2006-12-16 17:29:00,3.52,0.522,235.020004,15.0,0.0,2.0,17.0,39.666668
2006-12-16 17:30:00,3.702,0.52,235.089996,15.8,0.0,1.0,17.0,43.700001
2006-12-16 17:31:00,3.7,0.52,235.220001,15.8,0.0,1.0,17.0,43.666668
2006-12-16 17:32:00,3.668,0.51,233.990005,15.8,0.0,1.0,17.0,43.133335
2006-12-16 17:33:00,3.662,0.51,233.860001,15.8,0.0,2.0,16.0,43.033333


In [11]:
# read the cleaned file back in, parsing 'datetime' and using it as the index
dataset = pd.read_csv(
    'household_power_consumption.csv',
    header=0,
    infer_datetime_format=True,
    parse_dates=['datetime'],
    index_col=['datetime'],
)
# group the rows by calendar day and total every column within each day
daily_groups_data = dataset.resample('D')
daily_data = daily_groups_data.sum()
# report the size of the daily table
print(daily_data.shape)
# write the daily totals to disk
daily_data.to_csv('household_power_consumption_days.csv')
daily_data.head(10)

(1442, 8)


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,sub_metering_4
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-12-16,1209.176,34.922,93552.53,5180.8,0.0,546.0,4926.0,14680.933319
2006-12-17,3390.46,226.006,345725.32,14398.6,2033.0,4187.0,13341.0,36946.666732
2006-12-18,2203.826,161.792,347373.64,9247.2,1063.0,2621.0,14018.0,19028.433281
2006-12-19,1666.194,150.942,348479.01,7094.0,839.0,7602.0,6197.0,13131.900043
2006-12-20,2225.748,160.998,348923.61,9313.0,0.0,2648.0,14063.0,20384.800011
2006-12-21,1723.288,144.434,347096.41,7266.4,1765.0,2692.0,10456.0,13808.466697
2006-12-22,2341.338,186.906,347305.75,9897.0,3151.0,350.0,11131.0,24390.300043
2006-12-23,4773.386,221.47,345795.95,20200.4,2669.0,425.0,14726.0,61736.433386
2006-12-24,2550.012,149.9,348029.91,11002.2,1703.0,5082.0,6891.0,28824.199967
2006-12-25,2743.12,240.28,350495.9,11450.2,6620.0,1962.0,5795.0,31341.666629


In [16]:
# split the daily dataset into train/test sets made of standard weeks
# (every column is kept, so each day still carries all features)
DAYS_PER_WEEK = 7
data = daily_data.values
# skip the first row (2006-12-16 is a partial day: readings start at 17:24),
# hold out the last 328 days for testing and ignore the final 6 of those;
# with this dataset both slices are exact multiples of 7 days
# (presumably chosen so each window is a calendar week -- confirm)
train, test = data[1:-328], data[-328:-6]
# restructure into windows of weekly data: (weeks, days per week, columns);
# np.split expects an integer number of sections, hence the integer division,
# and it raises ValueError if a slice is not a whole number of weeks
train = array(np.split(train, len(train) // DAYS_PER_WEEK))
test = array(np.split(test, len(test) // DAYS_PER_WEEK))
print(train.shape)
print(test.shape)

(159, 7, 8)
(46, 7, 8)
