In [107]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import os 
import numpy as np 
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.preprocessing import MinMaxScaler

import seaborn as sns
sns.set_style('white', {"xtick.major.size": 2, "ytick.major.size": 2})
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71","#f4cae4"]
sns.set_palette(sns.color_palette(flatui,7))
import matplotlib.pyplot as plt

In [25]:
# Input file name; the CSV is expected inside <parent of cwd>/Dataset.
file_name = 'electricity_data_resampled.csv'

# The notebook's working directory sits one level below the project root.
base_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(base_dir, 'Dataset')

In [26]:
# Read the resampled readings from the Dataset directory and preview them.
csv_path = os.path.join(data_dir, file_name)
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,DateTime,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,2006-12-16,3.053475,0.088187,236.243763,13.082828,0.0,1.378788,12.439394
1,2006-12-17,2.354486,0.156949,240.087028,9.999028,1.411806,2.907639,9.264583
2,2006-12-18,1.530435,0.112356,241.231694,6.421667,0.738194,1.820139,9.734722
3,2006-12-19,1.157079,0.104821,241.999313,4.926389,0.582639,5.279167,4.303472
4,2006-12-20,1.545658,0.111804,242.308063,6.467361,0.0,1.838889,9.765972


## Problem Statement

Given one week of Global_reactive_power, Voltage, Global_intensity and sub-metering readings, predict the next week's average Global_active_power.

### Feature Engineering and Cleaning

In [28]:
# Convert the DateTime column from text to pandas datetimes.
data['DateTime'] = pd.to_datetime(data['DateTime'])
data.head(2)

Unnamed: 0,DateTime,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,2006-12-16,3.053475,0.088187,236.243763,13.082828,0.0,1.378788,12.439394
1,2006-12-17,2.354486,0.156949,240.087028,9.999028,1.411806,2.907639,9.264583


- Global_active_power is the total power supplied to the houses, and Sub_metering_i is the electricity consumed by different groups of appliances in the houses. The consumption that is not covered by any sub-meter is not given in the data, but it can be calculated using the formula below:

sub_metering_rem = (Global_active_power * 1000) / 60 - (sub_metering_1 + sub_metering_2 + sub_metering_3)

In [29]:
# Energy recorded by the three sub-meters combined.
metered_total = data['Sub_metering_1'] + data['Sub_metering_2'] + data['Sub_metering_3']

# Remainder: active power scaled by 1000 / 60, minus everything the sub-meters recorded.
data['sub_metering_rem'] = data['Global_active_power'] * 1000 / 60 - metered_total
data.head()

Unnamed: 0,DateTime,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,sub_metering_rem
0,2006-12-16,3.053475,0.088187,236.243763,13.082828,0.0,1.378788,12.439394,37.073064
1,2006-12-17,2.354486,0.156949,240.087028,9.999028,1.411806,2.907639,9.264583,25.657407
2,2006-12-18,1.530435,0.112356,241.231694,6.421667,0.738194,1.820139,9.734722,13.21419
3,2006-12-19,1.157079,0.104821,241.999313,4.926389,0.582639,5.279167,4.303472,9.119375
4,2006-12-20,1.545658,0.111804,242.308063,6.467361,0.0,1.838889,9.765972,14.156111


In [30]:
# Index the rows by their timestamp.
data = data.set_index('DateTime')

In [35]:
# Attach the weekday name of every row so week boundaries can be inspected.
weekday_names = data.index.day_name()
data['day'] = weekday_names
data.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,sub_metering_rem,day
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-12-16,3.053475,0.088187,236.243763,13.082828,0.0,1.378788,12.439394,37.073064,Saturday
2006-12-17,2.354486,0.156949,240.087028,9.999028,1.411806,2.907639,9.264583,25.657407,Sunday
2006-12-18,1.530435,0.112356,241.231694,6.421667,0.738194,1.820139,9.734722,13.21419,Monday
2006-12-19,1.157079,0.104821,241.999313,4.926389,0.582639,5.279167,4.303472,9.119375,Tuesday
2006-12-20,1.545658,0.111804,242.308063,6.467361,0.0,1.838889,9.765972,14.156111,Wednesday


In [46]:
# Balance the days in the weeks: a week runs from Monday to Sunday, so keep only
# the rows from the first Monday up to and including the last Sunday.
# For this dataset that drops the first 2 rows and the last 5 rows.
# The bounds are derived from the index instead of being hard-coded so that
# re-running this cell does not trim additional rows.
# NOTE: this yields whole weeks only if the index holds consecutive daily rows.
day_names = np.asarray(data.index.day_name())
monday_positions = np.flatnonzero(day_names == 'Monday')
sunday_positions = np.flatnonzero(day_names == 'Sunday')
if monday_positions.size == 0 or sunday_positions.size == 0:
    raise ValueError("data must contain at least one Monday and one Sunday")

data = data.iloc[monday_positions[0]:sunday_positions[-1] + 1, :]

data.head(2)

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,sub_metering_rem,day
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-12-18,1.530435,0.112356,241.231694,6.421667,0.738194,1.820139,9.734722,13.21419,Monday
2006-12-19,1.157079,0.104821,241.999313,4.926389,0.582639,5.279167,4.303472,9.119375,Tuesday


In [47]:
# Show the last two rows to check where the trimmed data ends.
data.iloc[-2:]

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,sub_metering_rem,day
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-11-20,1.525699,0.106783,240.608333,6.472361,3.032639,2.046528,7.939583,12.40956,Saturday
2010-11-21,0.625632,0.083072,241.180194,2.637917,0.0,0.351389,3.318056,6.757755,Sunday


In [50]:
# Drop the helper 'day' column.
# Reassign instead of using inplace=True (the frame was produced by iloc slicing),
# and ignore a missing column so the cell can be re-run without a KeyError.
data = data.drop(columns=['day'], errors='ignore')

### Train Test Split

In [64]:
# Number of 7-day weeks covered by the rows of `data`.
n_rows = data.shape[0]
total_weeks = n_rows / 7
print(f"Total Weeks in the data : {total_weeks}")

def split_data(data, train_pct = 0.7):
    """Split `data` chronologically into train and test parts on a week boundary.

    The number of rows is converted to weeks (7 rows per week), the training
    share of those weeks is truncated to a whole number, and the first
    ``train_weeks * 7`` rows become the training set. The remaining rows form
    the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in chronological order, one row per day.
    train_pct : float, default 0.7
        Fraction of the weeks assigned to the training set.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Positional slices of `data`; together they contain every row once.
    """
    days_per_week = 7
    total_weeks = data.shape[0] / days_per_week

    # Round away binary floating-point noise before truncating: a product such
    # as 100 * 0.29 evaluates to 28.999999999999996 and would otherwise lose a
    # whole week.
    train_weeks = int(round(total_weeks * train_pct, 9))
    train_index = train_weeks * days_per_week

    train_data = data.iloc[:train_index, :]
    test_data = data.iloc[train_index:, :]

    return train_data, test_data

Total Weeks in the data : 205.0


In [66]:
# Chronological split with the default training share.
train_data, test_data = split_data(data)
print(train_data.shape, test_data.shape)

# Sanity check: the two parts together account for every row.
n_train, n_test = train_data.shape[0], test_data.shape[0]
assert n_train + n_test == data.shape[0]

(1001, 8) (434, 8)


In [114]:
def scale_data(train_data, test_data, return_scaler=False):
    """Min-max scale both splits using statistics learned from the training split only.

    The scaler is fitted on `train_data` and then applied to both splits, so
    no information from the test split enters the scaling parameters.

    Parameters
    ----------
    train_data, test_data : array-like
        Data to scale; both must have the same columns.
    return_scaler : bool, default False
        When True, the fitted scaler is returned as a third element so that
        scaled values (e.g. predictions) can later be mapped back to the
        original units with ``scaler.inverse_transform``.

    Returns
    -------
    (scaled_train, scaled_test) or (scaled_train, scaled_test, scaler)
        The transformed splits, plus the fitted ``MinMaxScaler`` when
        `return_scaler` is True.
    """
    scaler = MinMaxScaler().fit(train_data)
    scaled_train = scaler.transform(train_data)
    scaled_test = scaler.transform(test_data)

    if return_scaler:
        return scaled_train, scaled_test, scaler
    return scaled_train, scaled_test

In [115]:
# Replace both splits with their scaled versions as returned by scale_data.
train_data, test_data = scale_data(train_data, test_data)

In [120]:
def prepare_data_for_training(data, n_input=7, n_output=7):
    """Turn a 2-D array of daily rows into sliding-window samples.

    For every start row ``s`` for which both windows fit inside `data`:

    * the features are rows ``s .. s + n_input - 1`` of every column except
      column 0, and
    * the target is the mean of column 0 over the following `n_output` rows.

    Windows advance one row at a time, so consecutive samples overlap.

    Parameters
    ----------
    data : array-like of shape (n_rows, n_columns)
        Column 0 holds the quantity to predict; the remaining columns are
        used as features.
    n_input : int, default 7
        Number of rows in each feature window.
    n_output : int, default 7
        Number of rows averaged to form each target.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, n_input, n_columns - 1)
    y : numpy.ndarray of shape (n_samples,)
        ``n_samples`` is ``n_rows - n_input - n_output + 1``; when `data` is
        too short for a single sample, empty arrays of matching shape are
        returned.
    """
    data = np.asarray(data)
    n_samples = data.shape[0] - n_input - n_output + 1

    if n_samples <= 0:
        # Too few rows for one feature window plus one target window: return
        # empty but correctly shaped arrays rather than failing inside np.mean.
        n_features = max(data.shape[1] - 1, 0)
        return np.empty((0, n_input, n_features)), np.empty((0,))

    X = np.array([data[start:start + n_input, 1:] for start in range(n_samples)])
    y_windows = np.array([
        data[start + n_input:start + n_input + n_output, 0]
        for start in range(n_samples)
    ])

    return X, np.mean(y_windows, axis=1)

In [121]:
# Build the windowed samples for each split.
X_train, y_train = prepare_data_for_training(train_data)
X_test, y_test = prepare_data_for_training(test_data)

# Report feature and target shapes, train first and test second.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(988, 7, 7) (988,)
(421, 7, 7) (421,)


In [122]:
# Persist the prepared arrays next to the source CSV.
arrays_to_save = (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
)
for npy_name, array in arrays_to_save:
    np.save(os.path.join(data_dir, npy_name), array)