### Import dependencies

In [47]:
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

### Import data

In [5]:
! kaggle datasets download -d srinuti/residential-power-usage-3years-data-timeseries 

Downloading residential-power-usage-3years-data-timeseries.zip to c:\Users\nilso\Documents\Coding\Data Science Class\M3-Assignments-Deep-Learning\Assignment 2




  0%|          | 0.00/227k [00:00<?, ?B/s]
100%|██████████| 227k/227k [00:00<00:00, 2.51MB/s]


In [60]:
df = pd.read_csv("data\power_usage_2016_to_2020.csv")

### Data cleaning

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   StartDate    35952 non-null  object 
 1   Value (kWh)  35952 non-null  float64
 2   day_of_week  35952 non-null  int64  
 3   notes        35952 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 1.1+ MB


In [62]:
df.head()

Unnamed: 0,StartDate,Value (kWh),day_of_week,notes
0,2016-01-06 00:00:00,1.057,2,weekday
1,2016-01-06 01:00:00,1.171,2,weekday
2,2016-01-06 02:00:00,0.56,2,weekday
3,2016-01-06 03:00:00,0.828,2,weekday
4,2016-01-06 04:00:00,0.932,2,weekday


In [63]:
df["StartDate"] = pd.to_datetime(df.StartDate)

In [64]:
start_date_series = df["StartDate"]

In [65]:
date_df = pd.DataFrame(start_date_series)
date_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   StartDate  35952 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 281.0 KB


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   StartDate    35952 non-null  datetime64[ns]
 1   Value (kWh)  35952 non-null  float64       
 2   day_of_week  35952 non-null  int64         
 3   notes        35952 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 1.1+ MB


In [67]:
df.set_index("StartDate", inplace=True)

In [68]:
df

Unnamed: 0_level_0,Value (kWh),day_of_week,notes
StartDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-01-06 00:00:00,1.057,2,weekday
2016-01-06 01:00:00,1.171,2,weekday
2016-01-06 02:00:00,0.560,2,weekday
2016-01-06 03:00:00,0.828,2,weekday
2016-01-06 04:00:00,0.932,2,weekday
...,...,...,...
2020-07-07 19:00:00,1.307,1,weekday
2020-07-07 20:00:00,2.872,1,weekday
2020-07-07 21:00:00,2.138,1,weekday
2020-07-07 22:00:00,2.199,1,weekday


In [69]:
dummies = pd.get_dummies(df.notes)

In [70]:
df = pd.concat([df[["Value (kWh)", "day_of_week"]], dummies], axis=1)

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35952 entries, 2016-01-06 00:00:00 to 2020-07-07 23:00:00
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Value (kWh)     35952 non-null  float64
 1   day_of_week     35952 non-null  int64  
 2   COVID_lockdown  35952 non-null  uint8  
 3   vacation        35952 non-null  uint8  
 4   weekday         35952 non-null  uint8  
 5   weekend         35952 non-null  uint8  
dtypes: float64(1), int64(1), uint8(4)
memory usage: 983.1 KB


In [72]:
scaler = MinMaxScaler() 
data_scaled = scaler.fit_transform(df[["Value (kWh)", "day_of_week"]])
data_s = pd.DataFrame(data_scaled, columns=["Value (kWh)", "day_of_week"])

In [73]:
data_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Value (kWh)  35952 non-null  float64
 1   day_of_week  35952 non-null  float64
dtypes: float64(2)
memory usage: 561.9 KB


In [74]:
scaled = pd.concat([data_s, date_df], axis=1)
scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35952 entries, 0 to 35951
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Value (kWh)  35952 non-null  float64       
 1   day_of_week  35952 non-null  float64       
 2   StartDate    35952 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(2)
memory usage: 842.8 KB


In [75]:
scaled.set_index("StartDate", inplace=True)
scaled

Unnamed: 0_level_0,Value (kWh),day_of_week
StartDate,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-01-06 00:00:00,0.155594,0.333333
2016-01-06 01:00:00,0.173457,0.333333
2016-01-06 02:00:00,0.077719,0.333333
2016-01-06 03:00:00,0.119712,0.333333
2016-01-06 04:00:00,0.136008,0.333333
...,...,...
2020-07-07 19:00:00,0.194767,0.166667
2020-07-07 20:00:00,0.439987,0.166667
2020-07-07 21:00:00,0.324976,0.166667
2020-07-07 22:00:00,0.334535,0.166667


In [76]:
ready_df = pd.concat([scaled, df[["weekend", "weekday", "vacation", "COVID_lockdown"]]], axis=1)

In [77]:
ready_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35952 entries, 2016-01-06 00:00:00 to 2020-07-07 23:00:00
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Value (kWh)     35952 non-null  float64
 1   day_of_week     35952 non-null  float64
 2   weekend         35952 non-null  uint8  
 3   weekday         35952 non-null  uint8  
 4   vacation        35952 non-null  uint8  
 5   COVID_lockdown  35952 non-null  uint8  
dtypes: float64(2), uint8(4)
memory usage: 983.1 KB


In [78]:
ready_df

Unnamed: 0_level_0,Value (kWh),day_of_week,weekend,weekday,vacation,COVID_lockdown
StartDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-06 00:00:00,0.155594,0.333333,0,1,0,0
2016-01-06 01:00:00,0.173457,0.333333,0,1,0,0
2016-01-06 02:00:00,0.077719,0.333333,0,1,0,0
2016-01-06 03:00:00,0.119712,0.333333,0,1,0,0
2016-01-06 04:00:00,0.136008,0.333333,0,1,0,0
...,...,...,...,...,...,...
2020-07-07 19:00:00,0.194767,0.166667,0,1,0,0
2020-07-07 20:00:00,0.439987,0.166667,0,1,0,0
2020-07-07 21:00:00,0.324976,0.166667,0,1,0,0
2020-07-07 22:00:00,0.334535,0.166667,0,1,0,0


Comment to myself: Day of weeks needs dummies

In [79]:
energy_data_all = ready_df[["Value (kWh)", "weekend", "weekday", "vacation", "COVID_lockdown"]].to_numpy()
energy_data = ready_df[["Value (kWh)"]].to_numpy()

### Data preparation

In [84]:
# Split the data into sequences for lstm input
sequence_length = 24*30 # To take a months data into consideration to predict future value 
sequence_data = []
sequence_labels = []
for i in range(len(energy_data) - sequence_length):
    sequence_data.append(energy_data[i:i+sequence_length])
    sequence_labels.append(energy_data[i+sequence_length])
sequence_data = np.array(sequence_data)
sequence_labels = np.array(sequence_labels)

training_ratio = int(.6*len(sequence_data))
print("Training length:", training_ratio)

train_data = sequence_data[:training_ratio]
train_labels = sequence_labels[:training_ratio]
test_data = sequence_data[training_ratio:]
test_labels = sequence_labels[training_ratio:]

Training length: 21139


In [85]:
# Split the data into training and testing sets
train_data = TensorDataset(torch.from_numpy(train_data), torch.from_numpy(train_labels))
test_data = TensorDataset(torch.from_numpy(test_data), torch.from_numpy(test_labels))

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

### Create Neural network

In [86]:
# Initialize the LSTM model
input_size = 1
hidden_size = 6
output_size = 1
learning_rate = 0.01

# 1. Creating an LSTM model
lstm = torch.nn.LSTM(input_size, hidden_size, batch_first=True)
fc = torch.nn.Linear(hidden_size, output_size)

criterion = torch.nn.MSELoss()
optimizer = optim.SGD(lstm.parameters(), lr=learning_rate)

### Training loop

In [89]:
# Train the lstm model
n_total_steps = len(train_loader)

num_epochs = 100
train_losses = []
test_losses = []

for epoch in range(num_epochs):
    train_loss = 0

    for i, (data, label) in enumerate(train_loader):
        optimizer.zero_grad()

        # Run the forward pass
        hidden = (torch.zeros(1, data.size(0), hidden_size),
                  torch.zeros(1, data.size(0), hidden_size))
        output, hidden = lstm(data.float(), hidden)
        output = fc(hidden[0][-1])

        # 2. Network Evaluation
        loss = criterion(output, label.float())

        # 3. Gradient Calculation
        loss.backward()

        # 4. Back Propagation
        optimizer.step()
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
        train_loss += loss.item()

    # Calculate the average training loss
    train_loss = train_loss / len(train_loader)
    train_losses.append(train_loss)

# Plot the MSE loss for each epoch
plt.plot(train_losses)
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.title('Training Loss over Epochs')
plt.show()

# Evaluate the lstm model
mse = 0

with torch.no_grad():
    for data, label in test_loader:
        hidden = (torch.zeros(1, data.size(0), hidden_size),
                  torch.zeros(1, data.size(0), hidden_size))
        output, hidden = lstm(data.float(), hidden)
        output = fc(hidden[0][-1])

        mse += ((output - label)**2).mean().item()

mse /= len(test_loader)
print('Test MSE: {}'.format(mse))

Epoch [1/100], Step [100/331], Loss: 0.0661
Epoch [1/100], Step [200/331], Loss: 0.0624
Epoch [1/100], Step [300/331], Loss: 0.0216
Epoch [2/100], Step [100/331], Loss: 0.0164
Epoch [2/100], Step [200/331], Loss: 0.0253
Epoch [2/100], Step [300/331], Loss: 0.0237
Epoch [3/100], Step [100/331], Loss: 0.0216
Epoch [3/100], Step [200/331], Loss: 0.0203
Epoch [3/100], Step [300/331], Loss: 0.0325
Epoch [4/100], Step [100/331], Loss: 0.0216
Epoch [4/100], Step [200/331], Loss: 0.0226
Epoch [4/100], Step [300/331], Loss: 0.0248
Epoch [5/100], Step [100/331], Loss: 0.0106
Epoch [5/100], Step [200/331], Loss: 0.0318
Epoch [5/100], Step [300/331], Loss: 0.0226
Epoch [6/100], Step [100/331], Loss: 0.0180
Epoch [6/100], Step [200/331], Loss: 0.0399
Epoch [6/100], Step [300/331], Loss: 0.0233
Epoch [7/100], Step [100/331], Loss: 0.0176
Epoch [7/100], Step [200/331], Loss: 0.0211
Epoch [7/100], Step [300/331], Loss: 0.0189
Epoch [8/100], Step [100/331], Loss: 0.0270
Epoch [8/100], Step [200/331], L

KeyboardInterrupt: 