# Autoencoder and GAN

This data set is made available by UCI. It contains data about patients with and without heart problems. Each row represents a single patient. There two files: heart-normal (contains patients without any heart problems) and heart_anomaly (contains patients with heart problems). Anomaly detection task: build an autoencoder on normal patients to identify anomalous observations. 

## Goal

Use the data set **heart-normal.csv** data set to train an autoencoder on healthy (i.e., normal) patients. Then, use the observations in **heart-anomaly.csv** data set to check whether the autoencoder can successfully detect patients who have a heart anomaly. 

# Read and Prepare the Data

In [1]:
import numpy as np
import pandas as pd

random_state=42

In [4]:
# Read in normal dataset
heart_normal = pd.read_csv("heart_normal.csv")
heart_normal.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [5]:
# Read in anomalous dataset
heart_anomaly = pd.read_csv("heart_anomaly.csv")
heart_anomaly.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,67,1,0,160,286,0,0,108,1,1.5,1,3,2
1,67,1,0,120,229,0,0,129,1,2.6,1,2,3
2,62,0,0,140,268,0,0,160,0,3.6,0,2,2
3,63,1,0,130,254,0,0,147,0,1.4,1,1,3
4,53,1,0,140,203,1,0,155,1,3.1,0,0,3


# Standardize Numeric Values

In [6]:
#Standardize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

#fit transform normal
heart_normal_std = scaler.fit_transform(heart_normal)

#transform anomaly
heart_anomaly_std = scaler.transform(heart_anomaly)

In [7]:
heart_normal_std.shape, heart_anomaly_std.shape

((165, 13), (20, 13))

# Autoencoder

In [8]:
import tensorflow as tf
from tensorflow import keras

In [9]:
model = keras.models.Sequential()

#Encoder - progressively reduce neurons
model.add(keras.layers.Input(shape=13))
model.add(keras.layers.Dense(11, activation='selu'))
model.add(keras.layers.Dense(9, activation='selu'))

#Decoder - progressively increase neurons
model.add(keras.layers.Dense(9, activation='selu'))
model.add(keras.layers.Dense(11, activation='selu'))
model.add(keras.layers.Dense(13))     # no activation for ouput layer since inputs are continuous

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 11)                154       
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 108       
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 90        
_________________________________________________________________
dense_3 (Dense)              (None, 11)                110       
_________________________________________________________________
dense_4 (Dense)              (None, 13)                156       
Total params: 618
Trainable params: 618
Non-trainable params: 0
_________________________________________________________________


In [10]:
#define learning rate
lr = 0.001

#Available optimizers:
adagrad = keras.optimizers.Adagrad(lr=lr, epsilon=None, decay=0.0)
sgd = keras.optimizers.SGD(lr=lr, momentum=0.9, decay=0.0, nesterov=True)
rmsprop = keras.optimizers.RMSprop(lr=lr, rho=0.9, epsilon=None, decay=0.0)
adam = keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
nesterov_adam = keras.optimizers.Nadam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.004)

#Initializations:
xavier = keras.initializers.glorot_normal(seed=None)
he = keras.initializers.he_normal(seed=None)

# Activation functions
activation = 'elu' 
#activation = 'relu'
#activation = 'tanh'
#activation = 'sigmoid'

#Compile model using Nadam optimizer
model.compile(loss='mean_squared_error', optimizer=nesterov_adam, metrics=['mean_squared_error'])

In [11]:
#define early-stopping parameters
from tensorflow.keras.callbacks import EarlyStopping

earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')

callback = [earlystop]

In [21]:
#fit model using normal dataset
model.fit(heart_normal_std, heart_normal_std, 
          validation_data = (heart_normal_std, heart_normal_std),
          epochs=150, batch_size=100, callbacks=callback)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150


Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/15

<tensorflow.python.keras.callbacks.History at 0x20ba1e7cb50>

### Check the average MSE on the "normal" data

In [22]:
model.evaluate(heart_normal_std, heart_normal_std)



[0.3071291446685791, 0.3071291148662567]

### Check the average MSE on the "anomalous" data

In [23]:
model.evaluate(heart_anomaly_std, heart_anomaly_std)



[0.7215893864631653, 0.7215893864631653]

## Predict first 20 in normal data

In [27]:
from sklearn.metrics import mean_squared_error

for i in range(0,20):
    prediction = model.predict(heart_normal_std[i:i+1])
    print((mean_squared_error(heart_normal_std[i:i+1], prediction))*100)
    
#Error terms are multiplied by 100 to make sense of the numbers

62.20077400698803
108.0323876232637
16.86513164557531
17.491780917573305
38.39224302014403
31.4315651950609
7.917338086099719
9.353659691451433
31.69405746281771
25.151877607206348
11.760038508137091
19.226134584782592
10.216019723494002
47.83772195730014
18.381826425373422
14.321221372162949
20.27706038743175
35.22579933028892
34.755953388678606
40.10043202057958


## Predict all 20 in anomaly data

In [26]:
for i in range(0,20):
    prediction = model.predict(heart_anomaly_std[i:i+1])
    print(100*(mean_squared_error(heart_anomaly_std[i:i+1], prediction))*100)
    
#Error terms are multiplied by 100 to make sense of the numbers

9091.148114043328
9294.896658594946
15097.924898732083
5732.6237652421805
11221.010327301316
5808.499371932188
1458.4131929357118
2109.927403811218
10561.739586586216
8022.078200154128
9731.577971712724
7368.822528473503
3166.102369397873
9343.649677373847
3277.4492848985597
4976.412495243414
14531.92302781102
1442.4777641583091
4050.669814714267
8030.519503528285


# Build a GAN

Build a GAN that can generate patients with heart anomalies. Test the effectiveness of the GAN using the autoencoder.

In [56]:
#number of input variables
codings_size = 30   

#define the generator
generator = keras.models.Sequential([
    keras.layers.Input(shape=codings_size),
    keras.layers.Dense(25, activation="selu"),
    keras.layers.Dense(25, activation="selu"),
    keras.layers.Dense(13, activation=None) #no activation due to continous vars
])

generator.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_23 (Dense)             (None, 25)                775       
_________________________________________________________________
dense_24 (Dense)             (None, 25)                650       
_________________________________________________________________
dense_25 (Dense)             (None, 13)                338       
Total params: 1,763
Trainable params: 1,763
Non-trainable params: 0
_________________________________________________________________


In [57]:
#define the discriminator
discriminator = keras.models.Sequential([
    keras.layers.Input(shape=[13]),
    keras.layers.Dense(25, activation="selu"),
    keras.layers.Dense(25, activation="selu"),
    keras.layers.Dense(1, activation="sigmoid") #1 neuron output - binary classification (real/fake)
])

discriminator.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 25)                350       
_________________________________________________________________
dense_27 (Dense)             (None, 25)                650       
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 26        
Total params: 1,026
Trainable params: 1,026
Non-trainable params: 0
_________________________________________________________________


In [58]:
#define GAN model
gan = keras.models.Sequential([generator, discriminator])

#compile discriminator
discriminator.compile(loss="binary_crossentropy", optimizer="rmsprop")
discriminator.trainable = False

#compile GAN
gan.compile(loss="binary_crossentropy", optimizer="rmsprop")

In [59]:
#determine input dataset for generator
batch_size = 10
dataset = tf.data.Dataset.from_tensor_slices(heart_anomaly_std).shuffle(1000)
dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(1)

In [60]:
#create train_gan function - 15 epochs
def train_gan(gan, dataset, batch_size, codings_size, n_epochs=15):
    generator, discriminator = gan.layers
    for epoch in range(n_epochs):
        for X_batch in dataset:
            # phase 1 - training the discriminator
            noise = tf.random.normal(shape=[batch_size, codings_size])
            generated_data = tf.cast(generator(noise), tf.float64)
            X_fake_and_real = tf.concat([generated_data, X_batch], axis=0)
            y1 = tf.constant([[0.]] * batch_size + [[1.]] * batch_size)
            discriminator.trainable = True
            discriminator.train_on_batch(X_fake_and_real, y1)
            # phase 2 - training the generator
            noise = tf.random.normal(shape=[batch_size, codings_size])
            y2 = tf.constant([[1.]] * batch_size)
            discriminator.trainable = False
            gan.train_on_batch(noise, y2)
        print("Epoch: {}/{}".format(epoch, n_epochs))
        

In [61]:
train_gan(gan, dataset, batch_size, codings_size)

Epoch: 0/15
Epoch: 1/15
Epoch: 2/15
Epoch: 3/15
Epoch: 4/15
Epoch: 5/15
Epoch: 6/15
Epoch: 7/15
Epoch: 8/15
Epoch: 9/15
Epoch: 10/15
Epoch: 11/15
Epoch: 12/15
Epoch: 13/15
Epoch: 14/15


### Generate new data using trained generator

In [62]:
#generate 10 rows of fake data
noise = tf.random.normal(shape=[10, codings_size])
generated_data = tf.cast(generator(noise), tf.float64)

generated_data

<tf.Tensor: shape=(10, 13), dtype=float64, numpy=
array([[-1.51731205,  1.17190707,  1.31554747,  0.30161977,  2.1152215 ,
        -0.30834025, -0.08885502,  1.3466531 ,  0.08363234,  0.34198651,
        -0.03918867, -0.32027131,  0.737701  ],
       [ 0.31898195,  1.05178297, -0.35875118,  0.69852394, -0.39236924,
        -1.49213016,  1.36047769, -0.25507545,  0.8817628 , -0.65677702,
         0.34380084,  0.81756979, -1.34627461],
       [-0.3749606 , -0.72759658,  1.88149822, -0.13057487, -0.85450542,
        -0.05750478, -1.21586359,  0.00722514, -0.18779019,  2.53429961,
        -0.7722373 ,  0.43467569,  1.28557539],
       [ 1.18621111,  1.10002673,  1.05017281,  0.87861317,  1.11924279,
        -1.7135545 ,  1.00956678, -0.3426277 ,  1.25140548, -2.63345075,
        -0.58396989,  1.0251199 , -0.66922277],
       [ 0.3209154 , -0.14166456,  0.04485056,  0.15100434, -1.35214496,
        -0.52834725, -2.57674885, -3.69230771,  0.22196059,  0.15067512,
        -0.07310046, -0.2347

### Check data against Autoencoder

In [63]:
#print MSEs of the 10 rows of generated data
for i in range(0,9):
    prediction = model.predict(generated_data[i:i+1])
    print(100*(mean_squared_error(generated_data[i:i+1], prediction))*100)
    
#Error terms are multiplied by 100 to make sense of the numbers

3058.1423979961246
3057.2961043870528
2978.2070502151587
9331.885846453491
11534.433271586493
7559.766074981058
12888.898050971065
8104.466128051596
22774.111452012872
