### Outlier Detection Tutorial


In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from daml.datasets import DamlDataset
from daml.metrics.outlier_detection import AE, Threshold, ThresholdType

# Fix the random state so shuffling and training are reproducible.
# set_random_seed seeds Python's `random`, NumPy and TensorFlow together,
# so no separate tf.random.set_seed call is needed.
tf.keras.utils.set_random_seed(408)

#### Load the data
We will use the MNIST dataset from TensorFlow Datasets for this tutorial on outlier detection.

In [None]:
# Fetch the MNIST training split together with its dataset metadata
mnist_train, ds_info = tfds.load("mnist", split="train", with_info=True)  # type: ignore

# Display a few sample digits
tfds.visualization.show_examples(mnist_train, ds_info)

# Shuffle with a buffer as large as the split, then keep 3000 images as numpy arrays
mnist_shuffled = mnist_train.shuffle(mnist_train.cardinality())
images = [example["image"].numpy() for example in mnist_shuffled.take(3000)]
dataset = DamlDataset(np.array(images))

#### Initialize the model
Now, let's look at how to use DAML's outlier detection methods.  
We will focus on a simple autoencoder network from our Alibi Detect provider.

First, let's initialize our outlier detection model with the input image size of 28x28x1.

In [None]:
# Create the autoencoder-based outlier detector (alibi-detect backed) and
# size it from the shape of the first image in the dataset
metric = AE()
input_shape = dataset.images[0].shape
metric.initialize_detector(input_shape)
print("Outlier Detection Model:", metric.detector)

#### Train the model
Next we will train a model on the dataset.
For better results, the epochs can be increased.
We set the outlier threshold to detect the most extreme 1% of training data as outliers.

In [None]:
# Train the detector on the set of images.
# The tutorial's goal is to flag the most extreme 1% of training images as
# outliers. A percentage threshold of 100 would place the cut-off at the
# maximum training score and flag nothing, so 99 is used instead.
# NOTE(review): presumably this value is forwarded to alibi-detect's
# `threshold_perc` (percentage of data considered normal) — confirm.
metric.fit_dataset(
    dataset=dataset,
    epochs=12,
    threshold=Threshold(99, ThresholdType.PERCENTAGE),
    verbose=False,
)

#### Test for outliers
We have trained our detector on a dataset of digits.  
What happens when we give it corrupted images of digits (which we expect to be "outliers")?

In [None]:
# Fetch the "translate" variant of the corrupted MNIST training split with its metadata
corrupted_train, ds_info = tfds.load(
    "mnist_corrupted/translate", split="train", with_info=True
)  # type: ignore

# Display a few sample corrupted digits
tfds.visualization.show_examples(corrupted_train, ds_info)

# Shuffle with a buffer as large as the split, then keep 3000 images as numpy arrays
corrupted_shuffled = corrupted_train.shuffle(corrupted_train.cardinality())
corr_images = [example["image"].numpy() for example in corrupted_shuffled.take(3000)]
corr_dataset = DamlDataset(np.array(corr_images))

Now we evaluate the two datasets using the trained model.

In [None]:
# Score the original digits and report the share flagged as outliers
preds_in = metric.evaluate(dataset).is_outlier
pct_outliers_in = np.mean(preds_in) * 100
print(f"Original digits outliers: {pct_outliers_in}%")

In [None]:
# Score the corrupted digits and report the share flagged as outliers
preds_corr = metric.evaluate(corr_dataset).is_outlier
pct_outliers_corr = np.mean(preds_corr) * 100
print(f"Corrupted digits outliers: {pct_outliers_corr}%")

##### Results
We identify a significant number of the corrupted images as outliers!  
Training for additional epochs when fitting the dataset may further improve outlier detection performance.