#### Quick Introduction

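The purpose of this notebook is to provide a very simple example of multivariate time series forecasting using Python. It builds lagged copies of the target and a few predictor columns from an appliance energy-usage dataset, then fits a linear regression on those lags to predict the `Appliances` column.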

In [None]:
# Load libraries
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Load the raw dataset from the project-relative CSV file
df = pd.read_csv("../Data/Appliances Energy Usage Prediction/energydata_complete.csv")

# Convert the 'date' column from text to datetime values; the explicit format
# string documents the expected timestamp layout
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')

# Preview the first rows. Only a cell's last expression is rendered, so the
# preview is done once, after the conversion, rather than also mid-cell
df.head()

In [None]:
# Column to predict, and the raw predictor columns used alongside it
target = ["Appliances"]
features = ["lights", "T1", "T2", "T3"]

# Share of rows assigned to the training set and to the test set
train_split, test_split = 0.7, 0.3

# Working copy holding only the timestamp, the target and the predictors,
# so later column additions do not touch the originally loaded frame
master_df = df.loc[:, ["date", *target, *features]].copy()
master_df.head()

In [None]:
# Add lagged copies of the target and of every predictor to master_df.
# For a column "X" and a lag n the new column is named "X_n" and holds the
# value of "X" from n rows earlier; the first n rows of it are NaN.
num_lags = 6
lag_steps = range(1, num_lags + 1)

# Names of all generated lag columns, in creation order; these are the
# model inputs selected later
newFeatures = []
for column in target + features:
    for lag in lag_steps:
        lagged_name = "{}_{}".format(column, lag)
        master_df[lagged_name] = master_df[column].shift(lag)
        newFeatures.append(lagged_name)
master_df.head()

In [None]:
# Plot the target against its timestamps (rather than the default integer row
# index) and label the figure so it can be read without the surrounding code
ax = df.plot(x="date", y=target, title="{} over time".format(target[0]))
ax.set_xlabel("date")
ax.set_ylabel(target[0]);

In [5]:
# Show the target next to its first two lag columns to check the shift
# direction: each lag column repeats the target value from earlier rows
lag_preview_columns = ["date", "Appliances", "Appliances_1", "Appliances_2"]
master_df.loc[:, lag_preview_columns].head()

Unnamed: 0,date,Appliances,Appliances_1,Appliances_2
0,2016-01-11 17:00:00,60,,
1,2016-01-11 17:10:00,60,60.0,
2,2016-01-11 17:20:00,50,60.0,60.0
3,2016-01-11 17:30:00,50,50.0,60.0
4,2016-01-11 17:40:00,60,50.0,50.0


In [6]:
# Drop every row that contains at least one missing value. The lag columns
# built with shift() are NaN in the leading rows, so those rows are removed.
# The defaults (axis=0, how='any', not in place) are relied on deliberately:
# newer pandas versions raise a TypeError when `how` and `thresh` are both
# passed explicitly, even if `thresh` is None.
master_df = master_df.dropna()
master_df.head()

Unnamed: 0,date,Appliances,lights,T1,T2,T3,Appliances_1,Appliances_2,Appliances_3,Appliances_4,...,T2_3,T2_4,T2_5,T2_6,T3_1,T3_2,T3_3,T3_4,T3_5,T3_6
6,2016-01-11 18:00:00,60,50,19.89,19.2,19.79,50.0,60.0,50.0,50.0,...,19.2,19.2,19.2,19.2,19.79,19.79,19.79,19.79,19.79,19.79
7,2016-01-11 18:10:00,60,50,19.856667,19.2,19.73,60.0,50.0,60.0,50.0,...,19.2,19.2,19.2,19.2,19.79,19.79,19.79,19.79,19.79,19.79
8,2016-01-11 18:20:00,60,40,19.79,19.2,19.73,60.0,60.0,50.0,60.0,...,19.2,19.2,19.2,19.2,19.73,19.79,19.79,19.79,19.79,19.79
9,2016-01-11 18:30:00,70,40,19.856667,19.23,19.79,60.0,60.0,60.0,50.0,...,19.2,19.2,19.2,19.2,19.73,19.73,19.79,19.79,19.79,19.79
10,2016-01-11 18:40:00,230,70,19.926667,19.356667,19.79,70.0,60.0,60.0,60.0,...,19.2,19.2,19.2,19.2,19.79,19.73,19.73,19.79,19.79,19.79


In [7]:
# Split dataset into train/test sets by row position, without shuffling:
# the first `train_records` rows form the training set and all remaining
# rows form the test set
train_records = int(np.round(train_split * master_df.shape[0]))
test_records = int(master_df.shape[0] - train_records)

X_train = master_df.iloc[:train_records][newFeatures].copy()
y_train = master_df.iloc[:train_records][target].copy()

# Slice from `train_records` onward instead of using `[-test_records:]`:
# when test_records is 0 the slice `[-0:]` would return the whole frame and
# the "test" set would silently equal the full dataset
X_test = master_df.iloc[train_records:][newFeatures].copy()
y_test = master_df.iloc[train_records:][target].copy()

In [8]:
# Dimensions of the training feature matrix as (rows, columns); the columns
# are the generated lag features listed in newFeatures
X_train.shape

(13810, 30)

In [9]:
# Dimensions of the test target frame as (rows, columns); it holds only the
# column(s) named in target
y_test.shape

(5919, 1)

In [10]:
# Fit a linear regression that maps the lag features to the target
reg = LinearRegression()
reg.fit(X_train, y_train)

# Mean absolute error on the rows the model was fitted on
preds = reg.predict(X_train)
train_mae = mean_absolute_error(y_train, preds)
print("Performance on Training Set:", train_mae)

# Mean absolute error on the held-out rows the model has not seen
preds_test = reg.predict(X_test)
test_mae = mean_absolute_error(y_test, preds_test)
print("Performance on Test Set:", test_mae)

Performance on Training Set: 31.423182004977598
Performance on Test Set: 28.415453203833344
