# From Climatology Test to Anomaly Detection

### Objective:
Explain the concept of the Anomaly Detection approach to quality control


Create a synthetic conceptual case, with random normally distributed data in 3 dimensions. Each dimension is normal, so bad data cannot necessarily be seen in all dimensions, but it might be visible in a single dimension, and we can explore the corners.

In [None]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
import numpy as np
from scipy import stats

import cotede


In [None]:
# Configure Bokeh so that show() renders the figures inline in this notebook
output_notebook()

## Synthetic data
Let's create some synthetic data to illustrate some concepts.

In [None]:
# Sample size of the synthetic series
N = 3000
# Parameters of the underlying normal distribution: true mean and
# true standard deviation
mu = 0
sigma = 1
# Seed NumPy's global random generator so that every run of the notebook
# draws exactly the same samples
np.random.seed(42)

# Sample index, used as the horizontal axis of the time series plots
t = np.arange(N)
# N independent draws from a normal distribution with mean mu and
# standard deviation sigma
x = np.random.normal(loc=mu, scale=sigma, size=N)

In [None]:
# w = np.blackman(11)
# x = np.convolve(x, w, 'same')

### What does this dataset look like?

In [None]:
# A time series with the data.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300)
p.circle(t, x, size=8, line_color="orange", fill_color="orange", fill_alpha=0.5)
show(p)  # show the results

In [None]:
def plot_hist(hist, edges, x_label='x', y_label='Pr(x)', title=None):
    """Plot a histogram

    Create a Bokeh figure from the output of numpy.histogram().
    We will create several histograms in this notebook so let's save this as a
    function to reuse this code.

    Parameters
    ----------
    hist : array-like
        Height of each bar, as returned by numpy.histogram().
    edges : array-like
        Bin edges, as returned by numpy.histogram(); consecutive pairs give
        the left and right side of each bar.
    x_label : str, optional
        Label of the horizontal axis. Default is 'x'.
    y_label : str, optional
        Label of the vertical axis. Default is 'Pr(x)'.
    title : str, optional
        Title of the figure. By default no title is set.

    Returns
    -------
    The Bokeh figure. It is not shown here, so the caller can add more
    glyphs (such as a PDF line) before calling show().
    """
    # `width`/`height` are used instead of `plot_width`/`plot_height`, which
    # were deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
    options = dict(width=750, height=300,
                   tools='', background_fill_color="#fafafa")
    if title is not None:
        options['title'] = title
    p = figure(**options)

    # One rectangle per bin, from the horizontal axis up to the bin height
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color="navy", line_color="white", alpha=0.5)

    # Anchor the vertical axis at zero so the bars sit on the axis
    p.y_range.start = 0
    p.xaxis.axis_label = x_label
    p.yaxis.axis_label = y_label
    p.grid.grid_line_color = "white"
    return p


#### Data Distribution
Let's plot a histogram

In [None]:
# Histogram of x using 50 bins. With density=True the result is normalized
# as a probability density, so it can be compared with a PDF.
hist, edges = np.histogram(x, density=True, bins=50)

p = plot_hist(hist, edges)
show(p)

We know that this dataset has a normal distribution, so we can approximate it to a Gaussian.

In [None]:
# Estimate the mean (loc) and standard deviation (scale) of a normal
# distribution fitted to the samples
mu_estimated, sigma_estimated = stats.norm.fit(x)

print("Estimated mean: {:.3f}, and standard deviation: {:.3f}".format(mu_estimated, sigma_estimated))

In [None]:
# Regular grid of 1000 points spanning the observed range of x
x_ref = np.linspace(x.min(), x.max(), 1000)
# Probability density of the fitted normal distribution on that grid
pdf = stats.norm.pdf(x_ref, loc=mu_estimated, scale=sigma_estimated)
# sf = stats.norm.sf(x_ref, loc=mu_estimated, scale=sigma_estimated)

In [None]:
# Histogram of the data with the fitted PDF drawn on top of it
p = plot_hist(hist, edges)
p.line(x_ref, pdf, line_color="orange", line_width=8, alpha=0.7, legend_label="PDF")
# p.line(x_ref, sf, line_color="red", line_width=8, alpha=0.7, legend_label="SF")
show(p)

### Bad data
Let's add some bad measurements in random positions on our dataset

In [None]:
# Number of bad measurements to inject
N_bad = 5
# Choose N_bad distinct positions: the first N_bad entries of a random
# permutation of all the indices of x
idx = np.random.permutation(x.size)[:N_bad]
# Overwrite those positions with values drawn uniformly within
# 10 standard deviations around the mean
x[idx] = np.random.uniform(mu-10*sigma, mu+10*sigma, N_bad)

# The injected values, in ascending order
print(sorted(x[idx]))

In [None]:

# Boolean mask, True where the position is not one of the bad ones listed
# in idx. np.isin builds it in a single vectorized call instead of running
# a Python-level membership test for every sample.
idx_good = ~np.isin(t, idx)

# A time series with the data: good measurements as green circles, bad
# measurements as red triangles.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300, title="Some bad measurements")
p.circle(t[idx_good], x[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p.triangle(t[idx], x[idx], size=12, line_color="red", fill_color="red", fill_alpha=0.8)
# p.line([0, N], 2*[-6 * sigma], line_color="orange", line_width=3, alpha=0.7)
# p.line([0, N], 2*[6 * sigma], line_color="orange", line_width=3, alpha=0.7)
show(p)  # show the results

In [None]:
# Fit the normal distribution again, using the current content of x
mu_estimated, sigma_estimated = stats.norm.fit(x)

print("Estimated mean: {:.3f}, and standard deviation: {:.3f}".format(mu_estimated, sigma_estimated))

# Evaluate the new fit on a regular grid spanning the current range of x
x_ref = np.linspace(x.min(), x.max(), 1000)
pdf = stats.norm.pdf(x_ref, loc=mu_estimated, scale=sigma_estimated)

# NOTE(review): hist and edges are not recomputed in this cell, so the bars
# are whatever np.histogram() returned in an earlier cell -- confirm that
# showing that earlier histogram is intended.
p = plot_hist(hist, edges)
p.line(x_ref, pdf, line_color="orange", line_width=8, alpha=0.7, legend_label="PDF")
# p.line(x_ref, sf, line_color="red", line_width=8, alpha=0.7, legend_label="SF")
# Mark the values at positions idx, all drawn at a fixed height of 0.05
p.triangle(x[idx], 0.05, size=12, line_color="red", fill_color="red", fill_alpha=0.8, legend_label="Bad values")
show(p)

### Climatology Test

Note that if the number of bad measurements is small, it doesn't compromise the estimate of the mean and standard deviation.

This is the concept of the climatology test. Any value beyond 3 standard deviations is still possible, but improbable. As long as the data are actually normally distributed and there are enough observations to estimate the mean and standard deviation, we can model it and easily predict how improbable a measurement would be.

This is a good solution, more restrictive than the Global Range test, but it doesn't cover everything. Bad measurements can still occur within the range of feasible values.

## Different perspectives from different tests

Let's consider another case where the data has some periodicity.

In [None]:
# Add a sinusoid with amplitude 2 and a period of 1000 samples to x
x2 = x + 2 * np.sin(2 * np.pi * t/1000)
# Draw new bad values for the positions in idx, uniformly within
# 10 standard deviations around the mean
x2[idx] = np.random.uniform(mu-10*sigma, mu+10*sigma, N_bad)

# A time series with the data: good measurements as green circles, bad
# measurements as red triangles.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300)
p.circle(t[idx_good], x2[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p.triangle(t[idx], x2[idx], size=12, line_color="red", fill_color="red", fill_alpha=0.8)
show(p)

In [None]:
# Fit a normal distribution to the periodic series
mu_estimated, sigma_estimated = stats.norm.fit(x2)

print("Estimated mean: {:.3f}, and standard deviation: {:.3f}".format(mu_estimated, sigma_estimated))

# Evaluate the fitted PDF over the range of x2, the series that is fitted
# and histogrammed in this cell. The grid previously spanned the range of x,
# which does not match the histogram drawn below.
x_ref = np.linspace(x2.min(), x2.max(), 1000)
pdf = stats.norm.pdf(x_ref, loc=mu_estimated, scale=sigma_estimated)

# Normalized histogram of x2 with the fitted PDF drawn on top of it
hist, edges = np.histogram(x2, density=True, bins=50)
p = plot_hist(hist, edges)
p.line(x_ref, pdf, line_color="orange", line_width=8, alpha=0.7, legend_label="PDF")
# p.line(x_ref, sf, line_color="red", line_width=8, alpha=0.7, legend_label="SF")
# Mark the bad values, all drawn at a fixed height of 0.05
p.triangle(x2[idx], 0.05, size=12, line_color="red", fill_color="red", fill_alpha=0.8, legend_label="Bad values")
show(p)

Most of the bad data is clearly distinct from the good data pattern, but is inside the feasible range so the climatology can't do much to distinguish the good from bad data.

Let's try a different test, the gradient check.

In [None]:
import cotede.qctests
# Gradient feature of the periodic series, as computed by CoTeDe
y_gradient = cotede.qctests.gradient(x2)

# A time series with the gradient: good measurements as green circles, bad
# measurements as red triangles. The title names the feature being plotted
# (it used to read "Spike", copied from the spike plot).
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300, title="Gradient")
p.circle(t[idx_good], y_gradient[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p.triangle(t[idx], y_gradient[idx], size=12, line_color="red", fill_color="red", fill_alpha=0.8)
show(p)  # show the results

In [None]:
import cotede.qctests
# Spike feature: absolute value of the output of CoTeDe's tukey53H test
# applied to the periodic series
y_spike = np.abs(cotede.qctests.tukey53H(x2))

# A time series with the spike feature: good measurements as green circles,
# bad measurements as red triangles.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300, title="Spike")
p.circle(t[idx_good], y_spike[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p.triangle(t[idx], y_spike[idx], size=12, line_color="red", fill_color="red", fill_alpha=0.8)
show(p)  # show the results

The spike test projects the original data into a new space, and this projection is commonly called a "feature" in the Machine Learning world. Note that the spike feature allows us to better distinguish the good data from the bad data.

### Gronell & Wijffels, 2008
Beyond the climatology of actual measurements, let's do climatologies of features, such as gradient and spike.

In [None]:
# Fit a normal distribution to the gradient feature, using only its finite
# values (non-finite entries are dropped by the np.isfinite mask)
gradient_mu, gradient_sigma = stats.norm.fit(y_gradient[np.isfinite(y_gradient)])

# Display the estimated mean and standard deviation
gradient_mu, gradient_sigma

In [None]:
# Fit a normal distribution to the finite values of the gradient feature
gradient_mu, gradient_sigma = stats.norm.fit(y_gradient[np.isfinite(y_gradient)])

# Evaluate the fitted PDF on 50 evenly spaced points between the smallest and
# largest gradient values (the nan-aware min/max ignore NaN entries)
y_ref = np.linspace(np.nanmin(y_gradient), np.nanmax(y_gradient), 50)
gradient_pdf = stats.norm.pdf(y_ref, loc=gradient_mu, scale=gradient_sigma)

# Normalized histogram of the finite gradient values, with the fitted PDF
# and the bad values (drawn at a fixed height of 0.05) on top of it
gradient_hist, gradient_edges = np.histogram(y_gradient[np.isfinite(y_gradient)], density=True, bins=50)
p = plot_hist(gradient_hist, gradient_edges)
p.line(y_ref, gradient_pdf, line_color="orange", line_width=8, alpha=0.7, legend_label="PDF")
p.triangle(y_gradient[idx], 0.05, size=12, line_color="red", fill_color="red", fill_alpha=0.8, legend_label="Bad values")
show(p)

In [None]:
# Fit a normal distribution to the finite values of the spike feature
spike_mu, spike_sigma = stats.norm.fit(y_spike[np.isfinite(y_spike)])

# Evaluate the fitted PDF on 50 evenly spaced points between the smallest and
# largest spike values (the nan-aware min/max ignore NaN entries)
y_ref = np.linspace(np.nanmin(y_spike), np.nanmax(y_spike), 50)
spike_pdf = stats.norm.pdf(y_ref, loc=spike_mu, scale=spike_sigma)

# Normalized histogram of the finite spike values, with the fitted PDF
# and the bad values (drawn at a fixed height of 0.05) on top of it
spike_hist, spike_edges = np.histogram(y_spike[np.isfinite(y_spike)], density=True, bins=50)
p = plot_hist(spike_hist, spike_edges)
p.line(y_ref, spike_pdf, line_color="orange", line_width=8, alpha=0.7, legend_label="PDF")
p.triangle(y_spike[idx], 0.05, size=12, line_color="red", fill_color="red", fill_alpha=0.8, legend_label="Bad values")
show(p)

In [None]:
# Gradient feature of the periodic series, as computed by CoTeDe
y_gradient = cotede.qctests.gradient(x2)

# Scatter of one feature against the other: spike on the horizontal axis,
# gradient on the vertical axis. Good measurements are green circles, bad
# measurements are red triangles.
# NOTE(review): this cell used a variable `y` for the horizontal axis, but
# `y` is never assigned in this notebook (NameError on a fresh kernel).
# `y_spike` is assumed to be what was meant, given the "Spike" title --
# confirm.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300, title="Spike")
p.circle(y_spike[idx_good], y_gradient[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p.triangle(y_spike[idx], y_gradient[idx], size=12, line_color="red", fill_color="red", fill_alpha=0.8)
show(p)  # show the results

In [None]:
# x scaled down by a factor of 20, on top of a sinusoid with amplitude 2 and
# a period of 2000 samples
x3 = x/20 + 2 * np.sin(2 * np.pi * t/2000)
# x2[idx] = np.random.uniform(mu-10*sigma, mu+10*sigma, N_bad)

# A time series with the data: good positions as green circles, positions in
# idx as red triangles. The series plotted is x3, the one built in this cell;
# the cell used to plot x2 and left x3 unused.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300)
p.circle(t[idx_good], x3[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p.triangle(t[idx], x3[idx], size=12, line_color="red", fill_color="red", fill_alpha=0.8)
show(p)

In [None]:
# x scaled down by a factor of 20, on top of a cosine with amplitude 2 and
# a period of 6000 samples
x3 = x/20 + 2 * np.cos(2 * np.pi * t/6000)

# Add extra normally distributed noise (zero mean, standard deviation 0.2)
# to the 100 samples in positions 1150 to 1249
x3[1150:1250] += np.random.normal(0, .2, 100)

# A time series with the data; only the positions selected by idx_good are
# drawn.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300)
p.circle(t[idx_good], x3[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
# p.triangle(t[idx], x3[idx], size=12, line_color="red", fill_color="red", fill_alpha=0.8)
show(p)  # show the results

In [None]:
# Rate of change feature of x3, as computed by CoTeDe
y4 = cotede.qctests.rate_of_change(x3)

# A time series with the rate of change, for all samples.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300)
p.circle(t, y4, size=8, line_color="green", fill_color="green", fill_alpha=0.3)
# p.triangle(t[idx], x3[idx], size=12, line_color="red", fill_color="red", fill_alpha=0.8)
show(p)

In [None]:
# NOTE(review): `y` is not assigned anywhere in the visible notebook, so this
# raises NameError on a fresh kernel. Presumably `y` was a NumPy masked array
# (it is asked for its compressed() values) -- confirm which variable was meant.
y.compressed()

In [None]:
import matplotlib.pyplot as plt
# NOTE(review): `y` is not assigned anywhere in the visible notebook, so this
# raises NameError on a fresh kernel -- confirm which variable was meant.
plt.hist(y)

In [None]:
# Display the bin heights of the spike histogram
spike_hist

In [None]:
# Probability density that the most recently fitted normal distribution
# assigns to the values of x at the positions in idx
stats.norm.pdf(x[idx], loc=mu_estimated, scale=sigma_estimated)

In [None]:
# NOTE(review): this evaluates the cumulative distribution function
# (norm.cdf) and stores it under the name `pdf`, replacing the density
# computed earlier -- confirm that the name is intended.
pdf = stats.norm.cdf(x_ref, loc=mu_estimated, scale=sigma_estimated)

In [None]:
# Display the current content of `pdf`
pdf

In [None]:
from seabird import fCNV

In [None]:
!pip install seabird

In [None]:
# NOTE(review): absolute path on the author's machine; this cell only runs
# where that file exists. Point it to a local copy of the CNV file.
data = fCNV('/Users/castelao/work/science/articles/cotedepaper/data/dPIRX010.cnv')

# TEMP against PRES. The pressure is negated so that larger pressures are
# drawn lower in the figure.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=500, height=600)
p.circle(data['TEMP'], -data['PRES'], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
show(p)

In [None]:
# Histogram, with 50 bins, of the rate of change of the TEMP variable
plt.hist(cotede.qctests.rate_of_change(data['TEMP']), 50)

Climatology Test

Any value beyond 3 standard deviations is still possible, but improbable. This is the traditional climatology test. As long as the observations are actually normally distributed and there are enough observations to estimate the mean and standard deviation, we can model it and easily predict how improbable a measurement would be.

In [None]:
# Number of samples
N = 300
# Number of bad measurements
N_bad = 24
# True mean and standard deviation of this dataset
mu, sigma = 0, 0.1
# Let's fix the random seed so everyone gets the same result
np.random.seed(42)

In [None]:
# Sample index, used as the horizontal axis
t = np.arange(N)
# Normally distributed noise with mean mu and standard deviation sigma
noise = np.random.normal(mu, sigma, N)
# Sinusoid with amplitude 3, a period of 190 samples and a phase of 0.3 rad,
# plus the noise
x = 3 * np.sin(2 * np.pi * t / 190 + 0.3) + noise

# Perturb N_bad consecutive samples, starting at position 160, by adding
# values drawn uniformly within 10 standard deviations around the mean
chunk = np.random.uniform(mu-10*sigma, mu+10*sigma, N_bad)
x[160:160+chunk.size] += chunk


# A time series with the data. Every sample, including the perturbed ones,
# is drawn with the same glyph and legend entry.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=750, height=300)
p.circle(t, x, size=8, line_color="orange", fill_color="orange", fill_alpha=0.5, legend_label="Good values")
# p.triangle(data["epoch"][idx_bad], data["water_level"][idx_bad], size=12, line_color="red", fill_color="red", fill_alpha=0.8, legend_label="Bad values")
show(p)