# Simple Autoencoder Function

## 1. Import modules and set paths

All libraries used in this notebook are imported here.

In [0]:
# Data management
import pandas as pd 
import numpy as np 

#Machine learning
import torch as t
from torch import nn, optim
from torch.utils import data as data_lib
import torch.nn.functional as F

#Utilities
import time
import os

## 2. Preprocess and create generators.
Generators will be used for training, validation and test.

Some specific features of PyTorch are used.

In [0]:
def prepareData(data, batch_size=10, num_workers=8, seed=None):
    """Split a labelled DataFrame into train / validation / test DataLoaders.

    Rows whose ``'label'`` value is truthy (``labels.astype(bool)``) are
    treated as flagged samples, all other rows as not flagged. The split is:

    * test: 90% of the flagged rows plus the same number of not flagged rows,
    * validation: 10% of the not flagged rows,
    * train: the remaining not flagged rows (flagged rows are never used
      for training or validation).

    Args:
        data: pandas DataFrame with a ``'label'`` column; every other column
            is used as a feature.
        batch_size: batch size of the three DataLoaders.
        num_workers: number of worker processes of the three DataLoaders.
        seed: optional seed for the shuffling of the row indices. With the
            default ``None`` NumPy draws fresh entropy from the OS, so every
            call produces a different split.

    Returns:
        Tuple ``(no_samples, no_features, train_gen, valid_gen, test_gen)``
        where ``no_features`` excludes the label column and the three
        generators are DataLoaders over one shared TensorDataset of
        ``(features, label)`` pairs.

    Raises:
        KeyError: if ``data`` has no ``'label'`` column.
        ValueError: if the test and validation splits would leave no
            not flagged row for training.
    """
    no_samples, no_features = data.shape
    no_features -= 1  # the 'label' column is not a feature

    labels = data['label']
    features = data.drop('label', axis=1)

    flagged = labels.astype(bool)
    f_indices = np.where(flagged)[0]      # positions of flagged rows
    no_f_indices = np.where(~flagged)[0]  # positions of not flagged rows

    # A local generator keeps the global NumPy RNG state untouched and, unlike
    # seeding with the wall clock, does not repeat the same split for two
    # calls made within the same second.
    rng = np.random.default_rng(seed)
    rng.shuffle(f_indices)
    rng.shuffle(no_f_indices)

    # 10% of the not flagged rows are used for validation.
    valid_split = int(np.floor(.1 * len(no_f_indices)))
    # 90% of the flagged rows are used for testing; the same number of
    # not flagged rows is added so both classes are equally represented.
    test_split = int(np.floor(.9 * len(f_indices)))

    if test_split + valid_split >= len(no_f_indices):
        raise ValueError(
            "Not enough not flagged samples to build a training set: "
            f"{len(no_f_indices)} available, {test_split} needed for test "
            f"and {valid_split} for validation."
        )

    # One dataset over all rows; the samplers below select the subsets.
    X = t.FloatTensor(features.to_numpy())  # features
    Y = t.IntTensor(labels.to_numpy())      # labels
    dataset = data_lib.TensorDataset(X, Y)

    # Not flagged rows are cut into [test | validation | train].
    test_indices_c, valid_indices, train_indices = np.split(
        no_f_indices, [test_split, test_split + valid_split]
    )
    # Flagged rows only ever appear in the test subset.
    test_indices = np.concatenate((test_indices_c, f_indices[:test_split]))

    params = {'batch_size': batch_size, 'num_workers': num_workers}

    train_gen = data_lib.DataLoader(
        dataset, **params, sampler=data_lib.SubsetRandomSampler(train_indices)
    )
    valid_gen = data_lib.DataLoader(
        dataset, **params, sampler=data_lib.SubsetRandomSampler(valid_indices)
    )
    test_gen = data_lib.DataLoader(
        dataset, **params, sampler=data_lib.SubsetRandomSampler(test_indices)
    )

    return (no_samples, no_features, train_gen, valid_gen, test_gen)

## 3. Define architectures of Neural Networks
For now implemented:

-Simple Autoencoder

### Simple Autoencoder

In [0]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder built from six linear layers.

    Layer widths, as fractions of ``no_features`` (rounded with ``round``):
    1.0 -> 0.9 -> 0.8 -> 0.7 -> 0.8 -> 0.9 -> 1.0.
    Every layer is followed by ReLU; all layers except the last one are
    additionally followed by dropout with ``p=0.25``.
    """

    def __init__(self, no_features):
        """Create the layers for inputs with ``no_features`` columns."""
        super().__init__()
        wide = round(0.9 * no_features)
        middle = round(0.8 * no_features)
        narrow = round(0.7 * no_features)

        # Narrowing half: input -> bottleneck.
        self.fc1 = nn.Linear(no_features, wide)
        self.fc2 = nn.Linear(wide, middle)
        self.fc3 = nn.Linear(middle, narrow)

        # Widening half: bottleneck -> reconstruction.
        self.fc4 = nn.Linear(narrow, middle)
        self.fc5 = nn.Linear(middle, wide)
        self.fc6 = nn.Linear(wide, no_features)

        self.dropout = nn.Dropout(p=0.25)

    def forward(self, x):
        """Return the reconstruction of ``x`` (non-negative due to the final ReLU)."""
        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5)
        for layer in hidden_layers:
            x = self.dropout(F.relu(layer(x)))

        # The output layer is activated but not subject to dropout.
        return F.relu(self.fc6(x))

## 4. Train the network function
Network is being trained only on **not flagged** data!

In [0]:
def trainModel(n_epochs, model, train_gen, valid_gen, model_path, gpu, optimizer, criterion):
    """Train ``model`` to reconstruct its input and checkpoint the best epoch.

    Every epoch performs one optimisation pass over ``train_gen`` followed by
    one evaluation pass over ``valid_gen``. The labels yielded by the loaders
    are discarded; the loss is ``criterion(model(features), features)``.
    Whenever the average validation loss of an epoch is less than or equal to
    the best value seen so far, ``model.state_dict()`` is saved to
    ``model_path``.

    Args:
        n_epochs: number of epochs to run.
        model: the ``nn.Module`` to train (modified in place).
        train_gen: loader yielding ``(features, label)`` training batches.
        valid_gen: loader yielding ``(features, label)`` validation batches.
        model_path: file the best ``state_dict`` is written to.
        gpu: if truthy, every batch is moved to CUDA before the forward pass.
        optimizer: optimizer already bound to ``model``'s parameters.
        criterion: callable ``criterion(output, target)`` returning a scalar
            loss tensor.

    Returns:
        None. The model is left in evaluation mode after the last epoch.

    Raises:
        ZeroDivisionError: if ``n_epochs >= 1`` and the sampler of
            ``train_gen`` or ``valid_gen`` is empty.
    """
    # float('inf') instead of np.Inf: the latter alias was removed in NumPy 2.0.
    valid_loss_min = float('inf')

    for _ in range(n_epochs):
        train_loss = 0.0
        valid_loss = 0.0

        # Training pass (train mode enables dropout).
        model.train()
        for features, _labels in train_gen:
            if gpu:
                features = features.cuda()
            optimizer.zero_grad()
            # Calling the module (not .forward) keeps registered hooks working.
            output = model(features)
            # The autoencoder target is the input itself.
            loss = criterion(output, features)
            loss.backward()
            optimizer.step()
            # Accumulate the training loss weighted by batch size, so the
            # division by the sampler length below gives a per-sample average
            # (assuming the criterion averages over the batch).
            train_loss += loss.item() * features.size(0)

        # Validation pass (eval mode disables dropout). No gradients are
        # needed here, so autograd bookkeeping is switched off.
        model.eval()
        with t.no_grad():
            for features, _labels in valid_gen:
                if gpu:
                    features = features.cuda()
                output = model(features)
                loss = criterion(output, features)
                valid_loss += loss.item() * features.size(0)

        train_loss = train_loss / len(train_gen.sampler)
        valid_loss = valid_loss / len(valid_gen.sampler)

        # Keep the parameters of the best (lowest validation loss) epoch.
        if valid_loss <= valid_loss_min:
            t.save(model.state_dict(), model_path)
            valid_loss_min = valid_loss

## 5. Test
Load model which had smallest validation loss in training. 

**Count losses**

Calculate the sum of losses separately for **outliers** and **correct samples** and divide each sum by half of the sampler length. The number of samples in both categories is the same, so this gives the average MSE for each type.



In [0]:
def testModel(test_gen, model, model_path, gpu, criterion):

  #Load model with best parameters
  model.load_state_dict(t.load(model_path))

  # track test loss
  outliers_loss = 0.0
  correct_loss = 0.0
  loss_lab_list = []

  # Set model to evaluation to use its full power
  model.eval()
  # iterate over test data
  for features, labels in test_gen:
      # move tensors to GPU if CUDA is available
      if gpu:
         features, labels = features.cuda(), labels.cuda()
      # forward pass: compute predicted outputs by passing inputs to the model
      output = model.forward(features)

      # Update test loss for score purposes
      for samp_no, lab in enumerate(labels):
          loss = criterion(output[samp_no], features[samp_no])
          # Create list of losses and labels for next step
          loss_lab_list.append((loss.item(), lab.item()))
          if lab.item(): # true if flagged
              outliers_loss += loss.item()
          else:
              correct_loss += loss.item()

  # Print average MSE
  avg_outlieres_loss = outliers_loss / (len(test_gen.sampler)/2)
  avg_correct_loss = correct_loss / (len(test_gen.sampler)/2)
  return avg_outlieres_loss, avg_correct_loss

## 6. Main functions

###Simple Autoencoder

In [0]:
def getSimpleAutoencoderLosses(data, no_epochs, model_path):
    """Train the simple autoencoder on ``data`` and report its test losses.

    The data is split with ``prepareData``, an ``Autoencoder`` is trained for
    ``no_epochs`` epochs with MSE loss and Adam (lr=0.003), the best
    checkpoint is written to ``model_path`` by ``trainModel`` and then
    evaluated by ``testModel``. CUDA is used when available.

    Returns:

    float - avg_outlieres_loss - Average MSE of test samples with outlier label
    float - avg_correct_loss - Average MSE of test samples without label
    """
    # The total sample count returned by prepareData is not needed here.
    _, feature_count, train_loader, valid_loader, test_loader = prepareData(data)

    net = Autoencoder(feature_count)
    mse = nn.MSELoss()
    adam = optim.Adam(net.parameters(), lr=0.003)

    # Place the network on the GPU when one is available, otherwise on the CPU.
    use_gpu = t.cuda.is_available()
    if use_gpu:
        net.cuda()
    else:
        net.cpu()

    trainModel(no_epochs, net, train_loader, valid_loader, model_path, use_gpu, adam, mse)
    outlier_mse, correct_mse = testModel(test_loader, net, model_path, use_gpu, mse)

    return outlier_mse, correct_mse

## 7. Test 
Should be commented out before the notebook is used.

In [0]:
# %%capture
# #Get data, small file so no if
# if not os.path.isfile('all_scaled0_1.csv'): 
#     !wget 'https://drive.google.com/uc?export=download&id=1-ET9vXPKudU92XuWeR0wIL67byS2llq-' -O all_scaled0_1.csv

In [0]:
# no_epochs = 10
# model_path = 'simple_ae_model.pt'
# data = pd.read_csv('all_scaled0_1.csv',index_col=0)
# data = data.drop(['chunkID','run','period'], axis = 1)
  
# avg_outlieres_loss, avg_correct_loss = getSimpleAutoencoderLosses(data, no_epochs, model_path)

# print(avg_outlieres_loss*1000)
# print(avg_correct_loss*1000)

41.66836613168319
3.538534586162617

Score:
38.129831545520574
