In [2]:
import pandas as pd
import sys
import numpy as np
from multiprocessing import cpu_count, Pool
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from keras.layers import Input, Dense, Dropout, LSTM, Reshape, Flatten
from keras import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Directory path taken from the first command-line argument.
# NOTE(review): raises IndexError when no argument is supplied, and inside a
# Jupyter kernel sys.argv is presumably filled in by the kernel launcher rather
# than by the user — confirm how the data directory is meant to be passed.
# dir_path is not referenced by any of the cells visible here.
dir_path = sys.argv[1]

# Column names used for the power-consumption data; `dtypes` below is keyed
# by exactly these names, in this order.
headers = [
    'Date',
    'Time',
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
]

# Date and Time are read as strings; every remaining column is read as float.
dtypes = {
    **dict.fromkeys(headers[:2], 'str'),
    **dict.fromkeys(headers[2:], 'float'),
}

# print(df.head)


def parallel_map(data, func):
    """
    Apply `func` to row-wise chunks of `data` in a process pool and
    concatenate the results.

    Arguments:
        data: The object to split; pandas objects are split by row position,
        anything else is handed to np.array_split.
        func: Callable applied to each chunk. It is sent to worker processes,
        so it must be picklable, and it must return something pd.concat accepts.

    Returns:
        The pd.concat of the per-chunk results, in chunk order.
    """
    n_cores = cpu_count()

    if hasattr(data, 'iloc'):
        # Slice pandas objects positionally instead of going through
        # np.array_split, which depends on the deprecated DataFrame.swapaxes.
        # Chunk sizes match np.array_split: the first `extra` chunks get one
        # additional row.
        base, extra = divmod(len(data), n_cores)
        data_split = []
        start = 0
        for i in range(n_cores):
            stop = start + base + (1 if i < extra else 0)
            data_split.append(data.iloc[start:stop])
            start = stop
    else:
        data_split = np.array_split(data, n_cores)

    pool = Pool(n_cores)
    try:
        results = pool.map(func, data_split)
    finally:
        # Always release the worker processes, even when func raises.
        pool.close()
        pool.join()
    return pd.concat(results)

def parse(row):
    """
    Convert the 'DateTime' column of `row` from text to timestamps.

    The text must follow day/month/year hour:minute:second. The column is
    replaced on the object that was passed in, and that same object is returned.
    """
    timestamp_format = '%d/%m/%Y %H:%M:%S'
    raw_stamps = row['DateTime']
    row['DateTime'] = pd.to_datetime(raw_stamps, format=timestamp_format)
    return row

def series_to_supervised(data, window_size, horizon, inputs, targets):
    """
    Frame a time series as a supervised learning dataset.

    Each output row holds the lagged input columns (t-window_size, ..., t-1),
    every column of `data` at time t, and the target columns at t+horizon.

    Arguments:
        data: A pandas DataFrame containing the time series, one row per
        time step (the notebook passes a frame indexed by DateTime).
        window_size: Number of lagged observations as input.
        horizon: Number of steps to forecast ahead. Only the single step
        t+horizon is added, not every step between t+1 and t+horizon.
        inputs: A list of the columns of the dataframe to be lagged,
        or the string 'all' to lag every column.
        targets: A list of the columns of the dataframe to be forecasted,
        or the string 'all' to forecast every column.

    Returns:
        Pandas DataFrame of series framed for supervised learning. Columns
        are named '<column>(t-i)', '<column>(t)' and '<column>(t+h)', and
        rows containing any NaN value are dropped.
    """
    # Compare against 'all' only for strings: comparing an array-like of
    # column names with == is element-wise and has no single truth value.
    if isinstance(targets, str) and targets == 'all':
        targets = data.columns
    if isinstance(inputs, str) and inputs == 'all':
        inputs = data.columns

    lagged = data[inputs]
    future = data[targets]

    # Collect every piece first and concatenate once; all pieces share
    # data's own index, so the result is aligned on it (not on a global frame).
    pieces = []
    names = []

    # input sequence (t-w, ..., t-1)
    for i in range(window_size, 0, -1):
        pieces.append(lagged.shift(i))
        names += [f'{column}(t-{i})' for column in lagged.columns]

    # the input not shifted (t)
    pieces.append(data)
    names += [f'{column}(t)' for column in data.columns]

    # forecast (t+h)
    pieces.append(future.shift(-horizon))
    names += [f'{column}(t+{horizon})' for column in future.columns]

    # put it all together
    result = pd.concat(pieces, axis=1)
    result.columns = names

    # drop rows with NaN values (the shifted edges and any missing readings)
    result.dropna(inplace=True)
    return result

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    """
    Split `df` into consecutive train, validation and test partitions.

    The split is positional (no shuffling): the first `train_percent` of the
    rows form the training set, the next `validate_percent` the validation
    set, and whatever remains the test set.

    Arguments:
        df: The DataFrame to split.
        train_percent: Fraction of rows assigned to the training set.
        validate_percent: Fraction of rows assigned to the validation set.
        seed: When given, seeds NumPy's global random generator. The split
        itself uses no randomness; with the default None the global
        generator is left untouched so an earlier seed is not overwritten.

    Returns:
        A (train, validate, test) tuple of DataFrames.
    """
    if seed is not None:
        np.random.seed(seed)
    m = len(df)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[:train_end]
    validate = df.iloc[train_end:validate_end]
    test = df.iloc[validate_end:]
    return train, validate, test

def mean_absolute_percentage_error(y_true, y_pred):
    """
    Return the mean absolute percentage error between two sequences.

    Both arguments are converted to NumPy arrays; the result is the mean of
    |(y_true - y_pred) / y_true| multiplied by 100. No special handling is
    applied where y_true is zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100




Using TensorFlow backend.


In [3]:
# Load the semicolon-separated readings; '?' entries are read as missing values.
df = pd.read_csv(
    "household_power_consumption.txt",
    sep=';',
    dtype=dtypes,
    na_values=['?'],
)

In [4]:
# Preprocessing
# Join the Date and Time text into one 'DateTime' column, then convert it to
# timestamps chunk-by-chunk across worker processes.
df['DateTime'] = df['Date'].str.cat(df['Time'], sep=' ')
df = parallel_map(df, parse)

In [5]:
# Drop the raw Date/Time text columns and index the frame by DateTime.
# Moving DateTime to the front first is unnecessary: set_index takes the column
# out of the frame, so the remaining columns keep their original order.
df = df.drop(columns=['Date', 'Time']).set_index('DateTime')

In [6]:
# Calendar features read from the DateTime index.
timestamps = df.index
df['hour'] = timestamps.hour
df['day'] = timestamps.day
df['month'] = timestamps.month
df['day_of_week'] = timestamps.dayofweek

# Active power scaled by 1000 / 60 minus the three sub-meter readings.
# NOTE(review): presumably converts kW to watt-hours per minute so it is
# comparable with the sub-metering columns — confirm against the dataset docs.
df['Rest_active_power'] = (
    df['Global_active_power'] * 1000 / 60
    - df['Sub_metering_1']
    - df['Sub_metering_2']
    - df['Sub_metering_3']
)

In [7]:
# Columns that are lagged to form the model inputs.
inputs = [
    'Global_active_power',
    'Global_reactive_power',
    'Voltage',
    'Global_intensity',
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3',
    'Rest_active_power',
]

# Column that is forecasted.
targets = [
    'Global_active_power',
]

In [8]:
# Build the supervised frame: 5 lagged steps of the input columns, every
# column at time t, and the target column one step ahead.
df_supervised = series_to_supervised(
    df,
    window_size=5,
    horizon=1,
    inputs=inputs,
    targets=targets,
)

In [9]:
# Preview the first rows to check the generated (t-i) / (t) / (t+h) column layout.
df_supervised.head()

Unnamed: 0_level_0,Global_active_power(t-5),Global_reactive_power(t-5),Voltage(t-5),Global_intensity(t-5),Sub_metering_1(t-5),Sub_metering_2(t-5),Sub_metering_3(t-5),Rest_active_power(t-5),Global_active_power(t-4),Global_reactive_power(t-4),...,Global_intensity(t),Sub_metering_1(t),Sub_metering_2(t),Sub_metering_3(t),hour(t),day(t),month(t),day_of_week(t),Rest_active_power(t),Global_active_power(t+1)
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-12-16 17:29:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0,52.266667,5.36,0.436,...,15.0,0.0,2.0,17.0,17,16,12,5,39.666667,3.702
2006-12-16 17:30:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0,72.333333,5.374,0.498,...,15.8,0.0,1.0,17.0,17,16,12,5,43.7,3.7
2006-12-16 17:31:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0,70.566667,5.388,0.502,...,15.8,0.0,1.0,17.0,17,16,12,5,43.666667,3.668
2006-12-16 17:32:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0,71.8,3.666,0.528,...,15.8,0.0,1.0,17.0,17,16,12,5,43.133333,3.662
2006-12-16 17:33:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0,43.1,3.52,0.522,...,15.8,0.0,2.0,16.0,17,16,12,5,43.033333,4.448


In [None]:
train, validate, test = train_validate_test_split(df_supervised)
print(type(train))
print(train.shape)


def _features_and_target(frame):
    """Return (X, y) arrays: every column but the last, and the last column."""
    # Convert the frame once; each separate `.values` access can materialize
    # another full array for a frame this wide.
    values = frame.values
    return values[:, :-1], values[:, -1]


# The last column of df_supervised is the forecast target; the rest are features.
X_train, y_train = _features_and_target(train)
X_validate, y_validate = _features_and_target(validate)
X_test, y_test = _features_and_target(test)

# Fit the scaler on the training features only, then apply the same
# transformation to the validation and test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validate = scaler.transform(X_validate)
X_test = scaler.transform(X_test)


<class 'pandas.core.frame.DataFrame'>
(1229311, 53)
