# Autoencoders

### Import the libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

### Data Aquisition

In [2]:
data = 'http://storage.googleapis.com/download.tensorflow.org/data/ecg.csv'

In [3]:
df = pd.read_csv(data, header=None)

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,131,132,133,134,135,136,137,138,139,140
0,-0.112522,-2.827204,-3.773897,-4.349751,-4.376041,-3.474986,-2.181408,-1.818287,-1.250522,-0.477492,...,0.792168,0.933541,0.796958,0.578621,0.25774,0.228077,0.123431,0.925286,0.193137,1.0
1,-1.100878,-3.99684,-4.285843,-4.506579,-4.022377,-3.234368,-1.566126,-0.992258,-0.75468,0.042321,...,0.538356,0.656881,0.78749,0.724046,0.555784,0.476333,0.77382,1.119621,-1.43625,1.0
2,-0.567088,-2.59345,-3.87423,-4.584095,-4.187449,-3.151462,-1.74294,-1.490658,-1.18358,-0.394229,...,0.886073,0.531452,0.311377,-0.021919,-0.713683,-0.532197,0.321097,0.904227,-0.421797,1.0
3,0.490473,-1.914407,-3.616364,-4.318823,-4.268016,-3.88111,-2.99328,-1.671131,-1.333884,-0.965629,...,0.350816,0.499111,0.600345,0.842069,0.952074,0.990133,1.086798,1.403011,-0.383564,1.0
4,0.800232,-0.874252,-2.384761,-3.973292,-4.338224,-3.802422,-2.53451,-1.783423,-1.59445,-0.753199,...,1.148884,0.958434,1.059025,1.371682,1.277392,0.960304,0.97102,1.614392,1.421456,1.0


In [5]:
df.shape

(4998, 141)

## Data Preprocessing

### Remove the last column of the dataframe which is the target

In [6]:
X = df.drop(140, axis=1)

In [7]:
y = df[140]

In [8]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
0,-0.112522,-2.827204,-3.773897,-4.349751,-4.376041,-3.474986,-2.181408,-1.818287,-1.250522,-0.477492,...,0.160348,0.792168,0.933541,0.796958,0.578621,0.25774,0.228077,0.123431,0.925286,0.193137
1,-1.100878,-3.99684,-4.285843,-4.506579,-4.022377,-3.234368,-1.566126,-0.992258,-0.75468,0.042321,...,0.560327,0.538356,0.656881,0.78749,0.724046,0.555784,0.476333,0.77382,1.119621,-1.43625
2,-0.567088,-2.59345,-3.87423,-4.584095,-4.187449,-3.151462,-1.74294,-1.490658,-1.18358,-0.394229,...,1.284825,0.886073,0.531452,0.311377,-0.021919,-0.713683,-0.532197,0.321097,0.904227,-0.421797
3,0.490473,-1.914407,-3.616364,-4.318823,-4.268016,-3.88111,-2.99328,-1.671131,-1.333884,-0.965629,...,0.491173,0.350816,0.499111,0.600345,0.842069,0.952074,0.990133,1.086798,1.403011,-0.383564
4,0.800232,-0.874252,-2.384761,-3.973292,-4.338224,-3.802422,-2.53451,-1.783423,-1.59445,-0.753199,...,0.966606,1.148884,0.958434,1.059025,1.371682,1.277392,0.960304,0.97102,1.614392,1.421456


In [9]:
y.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: 140, dtype: float64

### Split data into testing and training sets

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
train_index = y_train[y_train == 1].index

In [13]:
train_data = X_train.loc[train_index]

In [14]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,130,131,132,133,134,135,136,137,138,139
135,-0.724519,-3.153214,-4.184146,-4.269935,-3.579077,-2.647449,-1.686265,-1.476999,-1.152826,-0.446696,...,0.944162,1.151915,0.986375,1.164575,1.272097,1.064766,1.149777,1.371746,2.049515,1.194698
523,-1.465665,-3.747409,-4.139122,-4.055948,-3.590447,-2.476538,-1.610886,-1.509331,-0.911027,-0.406744,...,1.070128,1.002913,1.146862,1.231868,1.012807,0.850873,1.018095,1.532687,1.546711,-0.587728
1281,-2.655536,-3.124683,-2.891266,-4.22155,-3.82715,-3.371037,-1.984218,-0.411518,0.098434,0.818204,...,-0.975136,-1.077949,-0.933255,-0.958637,-0.356952,0.06351,0.29631,0.466348,0.102173,-0.689607
3108,-0.919318,-3.01494,-3.384893,-3.981646,-3.828617,-3.234646,-1.804082,-1.140253,-0.655269,0.352905,...,1.403169,1.211733,1.471334,1.036322,0.224713,-0.79102,-0.645683,-0.361866,-0.113474,-1.988195
91,1.10779,-2.569991,-1.559817,-2.261683,-1.930483,-2.080601,-1.901656,-1.587257,-1.518856,-1.191894,...,2.091798,2.00988,2.088132,1.381001,0.737655,0.273563,0.521182,0.138129,1.672358,0.474793


### Scale the values in between 0 and 1

In [15]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

In [16]:
X_train_scaled = scaler.fit_transform(train_data.copy())

In [17]:
X_test_scaled = scaler.fit_transform(X_test.copy())

## Create autoencoder model

### Import functions

In [18]:
from tensorflow.keras import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import MeanSquaredLogarithmicError

### Create an autoencoder model

In [19]:
class Autoencoder(Model):
    def __init__(self, output_units, code_size=8):
        super().__init__()
        self.encoder = Sequential([
            Dense(64, activation='relu'),
            Dropout(0.1),
            Dense(32, activation='relu'),
            Dropout(0.1),
            Dense(16, activation='relu'),
            Dropout(0.1),
            Dense(code_size, activation='relu')
        ])
        self.decoder = Sequential([
            Dense(16, activation='relu'),
            Dropout(0.1),
            Dense(32, activation='relu'),
            Dropout(0.1),
            Dense(64, activation='relu'),
            Dropout(0.1),
            Dense(output_units, activation='sigmoid')
        ])
    
    def call(self, inputs):
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

In [20]:
X_train_scaled.shape

(2317, 140)

### Initialise Model

In [21]:
model = Autoencoder(output_units = X_train_scaled.shape[1])

### Compile Model

In [22]:
model.compile(loss='msle', metrics=['mse', 'acc'], optimizer='adam')

### Fit the data to the model

In [23]:
model.fit(X_train_scaled, 
          X_train_scaled, 
          epochs = 20, 
          batch_size = 512, 
          validation_data = (X_test_scaled, X_test_scaled)
         )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f040416fc40>

In [28]:
def find_threshold(model, X_train_scaled):
  reconstructions = model.predict(X_train_scaled)
  # provides losses of individual instances
  reconstruction_errors = tf.keras.losses.msle(reconstructions, X_train_scaled)

  # threshold for anomaly scores
  threshold = np.mean(reconstruction_errors.numpy()) + np.std(reconstruction_errors.numpy())
  return threshold

def get_predictions(model, X_test_scaled, threshold):
  predictions = model.predict(X_test_scaled)
  # provides losses of individual instances
  errors = tf.keras.losses.msle(predictions, X_test_scaled)
  # 0 = anomaly, 1 = normal
  anomaly_mask = pd.Series(errors) > threshold
  preds = anomaly_mask.map(lambda x: 0.0 if x == True else 1.0)
  return preds

In [29]:
threshold = find_threshold(model, X_train_scaled)
print(f"Threshold method one: {threshold}")

Threshold method one: 0.009678983900157347


In [30]:
from sklearn.metrics import accuracy_score

In [31]:
preds = get_predictions(model, X_test_scaled, threshold)
accuracy_score(preds, y_test)

0.797