In [31]:
import os
import random
import numpy as np
import pandas as pd
from math import sqrt

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
import torch.utils.data
import torch.optim as optim

In [32]:
# --- Model / training settings ------------------------------------------------
theta = 2             # width increment per autoencoder layer (read by Autoencoder)
num_epochs = 100      # upper bound on the number of training epochs
# NOTE(review): dropout_ratio is not referenced by the cells shown here; the
# Autoencoder's dropout layer is created with p=0. Confirm which one is intended.
dropout_ratio = 0.2

# data_path = 'gdrive/My Drive/MIDA/MIDA-pytorch-master/data/BostonHousing.csv'
# NOTE(review): mechanism, method, test_size and use_cuda are not read by any
# cell shown here; presumably left over from an earlier setup. Confirm before removing.
mechanism = 'mcar'
method = 'uniform'

test_size = 0.3
use_cuda = True
batch_size  = 288 # not in the paper

# --- Reproducibility ----------------------------------------------------------
# Weight initialisation and DataLoader shuffling are random; seed every RNG in
# use so that Restart & Run All reproduces the same training run.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [33]:
import pandas as pd
import torch
# Read the complete table and cut out the row windows used for training and
# for testing.
data = pd.read_csv('Data_Augmented.csv')
train_truth_df = data.iloc[:69120]
test_truth_df = data.iloc[86400:103680]

# Min-max scaling: the scaler learns each column's range from the training
# rows only (mapping them to [0, 1]); the test rows reuse that same mapping.
scaler = MinMaxScaler()
train_truth = scaler.fit_transform(train_truth_df)
test_truth = scaler.transform(test_truth_df)

In [18]:
def _mask_and_scale(frame, mask_frame, fitted_scaler):
    """Scale ``frame`` and zero out every entry whose mask value is False.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to corrupt.
    mask_frame : pandas.DataFrame
        Mask with the same shape as ``frame``; positions equal to ``False``
        are overwritten with 0 after scaling.
    fitted_scaler : object
        Already-fitted scaler exposing ``transform``.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        The mask as a tensor and the corrupted rows as a float32 tensor.
    """
    mask_tensor = torch.tensor(mask_frame.values)
    scaled = fitted_scaler.transform(frame)
    # `== False` (rather than `~`) also matches 0 in a 0/1-encoded mask.
    scaled[mask_tensor.numpy() == False] = 0
    return mask_tensor, torch.from_numpy(scaled).float()


mask = pd.read_csv('2hr_BM.csv')
corrupted = pd.read_csv('Data_Augmented.csv')

# Row windows for the two splits (same bounds as the ground-truth slices).
train_rows = slice(0, 69120)
test_rows = slice(86400, 103680)

mask_train, train_corrupted = _mask_and_scale(
    corrupted.iloc[train_rows, :], mask.iloc[train_rows, :], scaler)
mask_test, test_corrupted = _mask_and_scale(
    corrupted.iloc[test_rows, :], mask.iloc[test_rows, :], scaler)

# as_tensor accepts both the ndarray produced by the scaling cell and an
# already-converted tensor, so this cell can be re-run without a TypeError.
train_truth = torch.as_tensor(train_truth, dtype=torch.float32)

# Each sample pairs a complete row (target) with its corrupted copy (input).
train_data = torch.utils.data.TensorDataset(train_truth, train_corrupted)
train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                           batch_size=batch_size,
                                           shuffle=True)

In [19]:
# Display the shape of the training ground-truth tensor as a sanity check.
train_truth.shape

torch.Size([69120, 98])

In [20]:
# mask = pd.read_csv('20%MCAR.csv')
# corrupted = pd.read_csv('Pseudo_imputed.csv')


# mask_train = mask.iloc[0:69120,:]
# mask_train = torch.tensor(mask_train.values)
# train_corrupted = corrupted.iloc[0:69120,:]
# train_corrupted[mask_train.numpy()==False] = 0
# train_corrupted = torch.tensor(train_corrupted.values)
# train_corrupted.shape


# mask_test = mask.iloc[86400:103680,:]
# mask_test = torch.tensor(mask_test.values)
# test_corrupted = corrupted.iloc[86400:103680,:]
# test_corrupted[mask_test.numpy()==False] = 0
# test_corrupted = torch.tensor(test_corrupted.values)
# test_corrupted.shape

In [21]:
import torch.nn.functional as F

class Autoencoder(nn.Module):
    """Fully-connected autoencoder whose layers widen towards the bottleneck.

    The encoder grows the representation by ``step`` units per linear layer,
    from ``dim`` up to ``dim + 5*step``; the decoder mirrors it back down to
    ``dim``. A Tanh follows every linear layer except the last one of each
    half.

    Parameters
    ----------
    dim : int
        Number of input (and output) features.
    step : int, optional
        Width increment per layer. When omitted, the notebook-level ``theta``
        is used, which is the original behaviour.
    dropout : float, optional
        Probability of the dropout layer applied to the input. Defaults to 0
        (a no-op), which is the original behaviour.
    """

    def __init__(self, dim, step=None, dropout=0.0):
        super().__init__()
        self.dim = dim

        if step is None:
            step = theta  # fall back to the notebook-level setting

        self.drop_out = nn.Dropout(p=dropout)

        # Layer widths: dim, dim+step, ..., dim+5*step.
        widths = [dim + step * k for k in range(6)]
        # Encoder is built before the decoder so parameters are initialised in
        # the same order as the hand-written version.
        self.encoder = self._stack(widths)
        self.decoder = self._stack(widths[::-1])

    @staticmethod
    def _stack(widths):
        """Build Linear -> Tanh -> ... -> Linear through consecutive widths."""
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.Tanh())
        layers.pop()  # no activation after the final linear layer
        return nn.Sequential(*layers)

    def forward(self, x):
        """Reconstruct ``x``; returns a tensor of shape ``(-1, dim)``."""
        # reshape (unlike view) also accepts non-contiguous inputs.
        x = x.reshape(-1, self.dim)
        x_missed = self.drop_out(x)

        z = self.encoder(x_missed)
        out = self.decoder(z)

        return out.view(-1, self.dim)

In [22]:
# Size the network from the data (number of feature columns of the model
# input) instead of hard-coding the column count.
model = Autoencoder(dim=train_corrupted.shape[1])

In [23]:
# Reconstruction objective: mean squared error between output and target.
loss = nn.MSELoss()

# SGD with Nesterov momentum over all of the autoencoder's parameters.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.025,
    momentum=0.99,
    nesterov=True,
)

In [24]:
# Train the autoencoder to reconstruct complete rows from their corrupted copies.
cost_list = []        # per-batch training loss, in optimisation-step order
early_stop = False

# Number of mini-batches per epoch (loop-invariant, used for progress output).
total_batch = len(train_loader)
# Report twice per epoch; clamp to 1 so that a loader with fewer than two
# batches does not trigger a modulo by zero.
report_every = max(1, total_batch // 2)

# This cell may be re-run after the evaluation cell called model.eval().
model.train()

for epoch in range(num_epochs):

    # Batch-local names: unpacking into `train_truth` / `train_corrupted`
    # would overwrite the full notebook-level tensors with the last batch.
    for i, (batch_truth, batch_corrupted) in enumerate(train_loader):

        reconst_data = model(batch_corrupted)
        cost = loss(reconst_data, batch_truth)

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        batch_cost = cost.item()

        if (i+1) % report_every == 0:
            print('Epoch [%d/%d], lter [%d/%d], Loss: %.6f'
                 %(epoch+1, num_epochs, i+1, total_batch, batch_cost))

        # early stopping rule 1 : MSE < 1e-06
        if batch_cost < 1e-06 :
            early_stop = True
            break

#         early stopping rule 2 : simple moving average of length 5
#         sometimes it doesn't work well.
#         if len(cost_list) > 5 :
#            if cost.item() > np.mean(cost_list[-5:]):
#                early_stop = True
#                break

        cost_list.append(batch_cost)

    if early_stop :
        break

print("Learning Finished!")

Epoch [1/100], lter [120/240], Loss: 0.087414
Epoch [1/100], lter [240/240], Loss: 0.022934
Epoch [2/100], lter [120/240], Loss: 0.013766
Epoch [2/100], lter [240/240], Loss: 0.011588
Epoch [3/100], lter [120/240], Loss: 0.014395
Epoch [3/100], lter [240/240], Loss: 0.011927
Epoch [4/100], lter [120/240], Loss: 0.010589
Epoch [4/100], lter [240/240], Loss: 0.011541
Epoch [5/100], lter [120/240], Loss: 0.010661
Epoch [5/100], lter [240/240], Loss: 0.006488
Epoch [6/100], lter [120/240], Loss: 0.007448
Epoch [6/100], lter [240/240], Loss: 0.006693
Epoch [7/100], lter [120/240], Loss: 0.008091
Epoch [7/100], lter [240/240], Loss: 0.006387
Epoch [8/100], lter [120/240], Loss: 0.007815
Epoch [8/100], lter [240/240], Loss: 0.007069
Epoch [9/100], lter [120/240], Loss: 0.006672
Epoch [9/100], lter [240/240], Loss: 0.005528
Epoch [10/100], lter [120/240], Loss: 0.004396
Epoch [10/100], lter [240/240], Loss: 0.004754
Epoch [11/100], lter [120/240], Loss: 0.003480
Epoch [11/100], lter [240/240],

In [25]:
# Number of per-batch losses recorded by the training loop.
len(cost_list)

24000

In [26]:
# Reconstruct the corrupted test rows with the trained model.
model.eval()
# Inference only: skip autograd bookkeeping so no graph is kept for the
# whole test set.
with torch.no_grad():
    pred = model(test_corrupted.float())
pred = pred.cpu().numpy()

In [27]:
# Map predictions and ground truth from the scaler's range back to the
# original units.
# NOTE: both names are rebound in place, so run this cell exactly once per
# evaluation; running it again would apply the inverse transform twice.
pred, test_truth = (
    scaler.inverse_transform(scaled) for scaled in (pred, test_truth)
)

In [28]:
# Score only the positions whose mask value is False, i.e. the entries that
# were zeroed out in the corrupted test input.
held_out = mask_test.numpy() == False
MIDA_pred = pred[held_out]
ground_truth = test_truth[held_out]

mae = np.mean(abs(MIDA_pred - ground_truth))
rmse = sqrt(mean_squared_error(MIDA_pred, ground_truth))

print("Test MAE of MIDA is: " + str(mae))
print("Test RMSE of MIDA is: " + str(rmse))

Test MAE of MIDA is: 2.7102901905690207
Test RMSE of MIDA is: 5.881549388590342


In [478]:
import pandas as pd
# Attach 5-minute timestamps to the de-normalised predictions and save them.
dtr = pd.date_range(start='2016-06-27', end='2016-08-26', freq='5min')

pred = pd.DataFrame(pred)
# One timestamp per prediction row, instead of a hard-coded row count.
pred['tmst'] = dtr[0:len(pred)]

# to_csv does not create missing directories; make sure the target exists.
os.makedirs('./Model_results', exist_ok=True)
# NOTE(review): the file name says "12hrBM" while the mask loaded earlier is
# '2hr_BM.csv'. Confirm the name matches the mask used for this run.
pred.to_csv('./Model_results/12hrBM_MIDA_Imputed.csv', index=False)