In [1]:
import plotly.graph_objects as go
import pyreadr

import pandas as pd
import time
import torch
import torch.nn as nn
import torch.optim as optim

from dataset import Dataset
from model import Autoencoder
from torch.utils.data import DataLoader

In [2]:
# Load the TEP training frames from the local ``data`` directory.
# Forward-slash relative paths resolve on Windows as well as on POSIX systems
# (the previous ``.\\data\\...`` form only worked on Windows).
# Every 5th row of the fault-free frame is kept; the faulty frame is loaded in full.
normal = pyreadr.read_r('./data/TEP_FaultFree_Training.RData')['fault_free_training'][::5]
faulty = pyreadr.read_r('./data/TEP_Faulty_Training.RData')['faulty_training']

print(normal.shape)
print(faulty.shape)

normal.head(10)

(50000, 55)
(5000000, 55)


Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,xmeas_7,...,xmv_2,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11
0,0.0,1.0,1,0.25038,3674.0,4529.0,9.232,26.889,42.402,2704.3,...,53.744,24.657,62.544,22.137,39.935,42.323,47.757,47.51,41.258,18.447
5,0.0,1.0,6,0.29303,3691.7,4502.2,9.378,27.111,41.999,2703.8,...,53.595,28.898,59.107,22.291,38.673,39.89,47.571,47.43,41.308,19.134
10,0.0,1.0,11,0.2348,3677.4,4489.8,9.3199,26.695,42.014,2703.9,...,54.542,23.133,61.425,21.93,42.262,33.648,42.375,47.328,40.344,17.198
15,0.0,1.0,16,0.29176,3713.7,4501.2,9.3591,26.744,42.092,2706.8,...,53.998,28.45,60.255,22.152,38.096,34.711,45.437,47.058,41.889,14.375
20,0.0,1.0,21,0.27833,3649.7,4479.9,9.3486,26.387,42.564,2701.5,...,53.947,27.761,60.589,21.743,39.398,38.607,46.686,46.688,41.585,18.294
25,0.0,1.0,26,0.23546,3763.0,4542.1,9.3654,26.8,42.203,2700.0,...,53.072,23.022,60.996,22.045,39.542,33.833,45.547,47.723,40.676,20.1
30,0.0,1.0,31,0.22515,3689.6,4525.4,9.4095,27.133,42.395,2698.9,...,54.227,22.439,58.462,21.963,39.188,38.612,49.676,48.762,41.649,17.802
35,0.0,1.0,36,0.25645,3635.5,4408.3,9.2819,26.774,42.549,2696.6,...,53.693,24.89,60.471,21.63,41.046,38.444,44.951,49.639,40.576,19.258
40,0.0,1.0,41,0.25674,3683.3,4472.1,9.2549,26.829,42.854,2696.2,...,54.05,25.067,62.835,21.498,41.471,37.175,43.988,49.92,41.145,19.425
45,0.0,1.0,46,0.21873,3645.2,4548.2,9.2642,26.832,42.127,2694.7,...,53.761,21.891,62.665,21.494,39.822,37.228,44.141,50.434,39.663,19.696


In [3]:
def norm(x):
    """Standardize ``x`` by subtracting ``x.mean()`` and dividing by ``x.std()``.

    ``x`` may be any object exposing ``mean()`` and ``std()`` and supporting
    arithmetic with their results (e.g. a pandas Series/DataFrame or a tensor).
    """
    return (x - x.mean()) / x.std()


def denorm(x, mean=None, std=None):
    """Map standardized values back to the original scale: ``x * std + mean``.

    Args:
        x: standardized values.
        mean: mean of the ORIGINAL (un-normalized) data. To invert ``norm`` this
            must be the statistic captured before normalizing.
        std: standard deviation of the ORIGINAL data, same caveat as ``mean``.

    When ``mean``/``std`` are omitted the statistics of ``x`` itself are used,
    which keeps the historical single-argument call working. Note that this
    fallback does NOT undo ``norm``: a standardized ``x`` has mean ~0 and
    std ~1, so the result is (almost) ``x`` unchanged.
    """
    if mean is None:
        mean = x.mean()
    if std is None:
        std = x.std()
    return (x * std) + mean

def train(model, train_loader, criterion, optimizer, epochs=20):
    """Train ``model`` to reconstruct its own input and return the loss history.

    Args:
        model: module called as ``model(X)``; it is moved to the GPU when
            ``torch.cuda.is_available()`` and to the CPU otherwise.
        train_loader: iterable of batches, each a mapping whose ``'X'`` entry
            is the input tensor. It is iterated once per epoch and does not
            need to implement ``__len__``.
        criterion: loss callable invoked as ``criterion(output, X)``.
        optimizer: object whose ``zero_grad()`` and ``step()`` are called once
            per batch.
        epochs: number of passes over ``train_loader``.

    Returns:
        list[float]: mean batch loss of each epoch, in order.

    Raises:
        ValueError: if ``train_loader`` yields no batches in an epoch, since no
            mean loss can be computed.
    """
    print('starting model training...')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"training on {device.type}")
    model.to(device)
    history = []

    for epoch in range(epochs):
        start = time.time()
        model.train()
        training_loss, running_loss = 0.0, 0.0
        num_batches = 0

        # Iterate over the dataset
        for batch_i, data in enumerate(train_loader):
            X = data['X'].to(device)

            # Reset gradients accumulated by the previous step
            optimizer.zero_grad()

            # Forward pass: the input is also the reconstruction target
            output = model(X)
            loss = criterion(output, X)

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

            # Read the scalar once; each .item() call forces a host sync.
            batch_loss = loss.item()
            running_loss += batch_loss
            training_loss += batch_loss
            num_batches += 1

            # Progress report every 100 batches (windowed average)
            if batch_i % 100 == 99:
                print('Batch: {}, Avg. Loss: {}'.format(batch_i + 1, running_loss / 100))
                running_loss = 0.0

        if num_batches == 0:
            raise ValueError('train_loader produced no batches; cannot compute an epoch loss')

        # Epoch results
        training_loss /= num_batches
        print(f'[{round(time.time() - start, 3)} secs] Epoch: {epoch+1}/{epochs}', end='')
        print(f', Training loss: {training_loss}', end='\n\n')
        history.append(training_loss)

    print('training finished.')
    return history

In [4]:
# Batched, shuffled access to the fault-free training data.
train_dataset = Dataset(
    rdata_file='TEP_FaultFree_Training.RData',
    rdata_key='fault_free_training',
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128, num_workers=0)

# The first three columns of `normal` are skipped when sizing the network input.
model = Autoencoder(input_dim=len(normal.columns[3:]))
print(model)

# Reconstruction objective and the optimizer that minimizes it.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

Autoencoder(
  (encoder1): Linear(in_features=52, out_features=256, bias=True)
  (encoder2): Linear(in_features=256, out_features=128, bias=True)
  (encoder3): Linear(in_features=128, out_features=26, bias=True)
  (decoder1): Linear(in_features=26, out_features=128, bias=True)
  (decoder2): Linear(in_features=128, out_features=256, bias=True)
  (decoder3): Linear(in_features=256, out_features=52, bias=True)
)


In [5]:
# Sanity check: push one batch through the model on the CPU and confirm that
# the reconstruction has the same shape as the input.
loss = nn.MSELoss()
model.to('cpu')
with torch.no_grad():  # inspection only, so no autograd graph is needed
    for i, data in enumerate(train_loader):
        target = data['X']
        output = model(target)

        print(target.size())
        print(output.size())

        mse = loss(output, target)
        print(mse.item())

        # Only the first batch is inspected.
        break

torch.Size([128, 52])
torch.Size([128, 52])
0.937488853931427


In [10]:
# Fit the autoencoder; the returned list holds one mean loss value per epoch.
train_loss = train(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=20,
)

starting model training...
training on cuda
Batch: 100, Avg. Loss: 0.20919720247387885
Batch: 200, Avg. Loss: 0.20781473740935325
Batch: 300, Avg. Loss: 0.20669965848326682
Batch: 400, Avg. Loss: 0.2074257315695286
Batch: 500, Avg. Loss: 0.20614519402384757
Batch: 600, Avg. Loss: 0.20558804422616958
Batch: 700, Avg. Loss: 0.2063021893799305
Batch: 800, Avg. Loss: 0.2060234174132347
Batch: 900, Avg. Loss: 0.2050905130803585
Batch: 1000, Avg. Loss: 0.20507169038057327
Batch: 1100, Avg. Loss: 0.20491344794631006
Batch: 1200, Avg. Loss: 0.20522597312927246
Batch: 1300, Avg. Loss: 0.20530777618288995
Batch: 1400, Avg. Loss: 0.20445897668600083
Batch: 1500, Avg. Loss: 0.2055441352725029
Batch: 1600, Avg. Loss: 0.20404853239655496
Batch: 1700, Avg. Loss: 0.2039948982000351
Batch: 1800, Avg. Loss: 0.20297938093543053
Batch: 1900, Avg. Loss: 0.20319450050592422
[30.796 secs] Epoch: 1/20, Training loss: 0.20550741418479287

Batch: 100, Avg. Loss: 0.20317937046289444
Batch: 200, Avg. Loss: 0.2033

Batch: 600, Avg. Loss: 0.18187814801931382
Batch: 700, Avg. Loss: 0.18111098021268845
Batch: 800, Avg. Loss: 0.18278243854641915
Batch: 900, Avg. Loss: 0.18195960372686387
Batch: 1000, Avg. Loss: 0.18229833826422692
Batch: 1100, Avg. Loss: 0.18160767003893852
Batch: 1200, Avg. Loss: 0.181830033659935
Batch: 1300, Avg. Loss: 0.18262666031718255
Batch: 1400, Avg. Loss: 0.18162624225020407
Batch: 1500, Avg. Loss: 0.180937712341547
Batch: 1600, Avg. Loss: 0.1812059225142002
Batch: 1700, Avg. Loss: 0.18233041495084762
Batch: 1800, Avg. Loss: 0.18073254317045212
Batch: 1900, Avg. Loss: 0.18097605377435685
[32.417 secs] Epoch: 10/20, Training loss: 0.18161598959806144

Batch: 100, Avg. Loss: 0.17978702023625373
Batch: 200, Avg. Loss: 0.18101937919855118
Batch: 300, Avg. Loss: 0.18087986156344413
Batch: 400, Avg. Loss: 0.1800988546013832
Batch: 500, Avg. Loss: 0.18120553374290466
Batch: 600, Avg. Loss: 0.1806648376584053
Batch: 700, Avg. Loss: 0.18074526146054268
Batch: 800, Avg. Loss: 0.18203

Batch: 1200, Avg. Loss: 0.17635643735527992
Batch: 1300, Avg. Loss: 0.17602833047509192
Batch: 1400, Avg. Loss: 0.17576718032360078
Batch: 1500, Avg. Loss: 0.17570056959986688
Batch: 1600, Avg. Loss: 0.17481169432401658
Batch: 1700, Avg. Loss: 0.1765129755437374
Batch: 1800, Avg. Loss: 0.1764976716041565
Batch: 1900, Avg. Loss: 0.17605319902300834
[31.685 secs] Epoch: 19/20, Training loss: 0.17616173351982356

Batch: 100, Avg. Loss: 0.1747602951526642
Batch: 200, Avg. Loss: 0.1754518038034439
Batch: 300, Avg. Loss: 0.17615912735462189
Batch: 400, Avg. Loss: 0.1749882172048092
Batch: 500, Avg. Loss: 0.17741490349173547
Batch: 600, Avg. Loss: 0.17568191215395929
Batch: 700, Avg. Loss: 0.17563376814126969
Batch: 800, Avg. Loss: 0.17520580992102622
Batch: 900, Avg. Loss: 0.1754494023323059
Batch: 1000, Avg. Loss: 0.17493412882089615
Batch: 1100, Avg. Loss: 0.17503251537680625
Batch: 1200, Avg. Loss: 0.17494365379214286
Batch: 1300, Avg. Loss: 0.17396737813949584
Batch: 1400, Avg. Loss: 0.1

In [11]:
# Training curve: one point per epoch, taken from the history returned by train().
fig = go.Figure(data=go.Scatter(y=train_loss))
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
fig.update_layout(
    title='Autoencoder training loss',
    xaxis_title='Epoch (0-based index)',
    yaxis_title='Mean MSE loss',
)
fig.show()

In [16]:
# Per-sample reconstruction error of the model on the fault-free dataset.
mse = []
model.to('cpu')

points = 5000  # number of samples to score
with torch.no_grad():  # evaluation only, so no autograd graph is needed
    for index, data in enumerate(train_dataset):
        # Stop after exactly `points` samples. The previous
        # `index % points == points-1` test broke out one sample early.
        if index >= points:
            break
        output = model(data['X'])
        mse.append(criterion(output, data['X']).item())

In [17]:
# Reconstruction error of each scored fault-free sample, in dataset order.
fig = go.Figure(data=go.Scatter(y=mse, mode='lines+markers'))
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
fig.update_layout(
    title='Reconstruction error on fault-free data',
    xaxis_title='Sample index',
    yaxis_title='Reconstruction MSE',
)
fig.show()

In [35]:
# Per-sample reconstruction error of the model on the faulty dataset.
# NOTE(review): the saved output of this cell is a MemoryError raised while
# allocating a (52, 5000000) float64 array. The full `faulty` frame from the
# loading cell is presumably still resident at that point; confirm, and
# consider freeing or subsampling it before building this dataset.
train_faulty_dataset = Dataset(rdata_file='TEP_Faulty_Training.RData', rdata_key='faulty_training')

mse_faulty = []
model.to('cpu')

points = 5000  # number of samples to score
with torch.no_grad():  # evaluation only, so no autograd graph is needed
    for index, data in enumerate(train_faulty_dataset):
        # Stop after exactly `points` samples. The previous
        # `index % points == points-1` test broke out one sample early.
        if index >= points:
            break
        output = model(data['X'])
        mse_faulty.append(criterion(output, data['X']).item())

MemoryError: Unable to allocate 1.94 GiB for an array with shape (52, 5000000) and data type float64

In [19]:
# Reconstruction error of each scored faulty sample, in dataset order.
fig = go.Figure(data=go.Scatter(y=mse_faulty, mode='lines+markers'))
# Title and axis labels let the figure stand on its own when the notebook is skimmed.
fig.update_layout(
    title='Reconstruction error on faulty data',
    xaxis_title='Sample index',
    yaxis_title='Reconstruction MSE',
)
fig.show()

In [34]:
# (rows, columns) of the full faulty training frame read in the data-loading cell.
faulty.shape

(5000000, 55)