# CTD profile QC

## Objective:
Illustrate some QC procedures using CoTeDe.

In [None]:
from bokeh.io import output_notebook, show
from bokeh.layouts import column, row
from bokeh.plotting import figure
import numpy as np
from scipy import stats

import cotede
from cotede.qc import ProfileQC
from cotede import qctests, datasets

In [None]:
# Send Bokeh output to the notebook so the show() calls below render inline.
output_notebook()

## Data
Let's start by loading a sample dataset.

In [None]:
# Load the sample CTD profile shipped with CoTeDe; it is indexed by
# variable name (e.g. "TEMP").
data = datasets.load_ctd()

# Summarize the profile: number of temperature samples and available variables.
n_obs = len(data["TEMP"])
variables = data.keys()
print("There is a total of {} observations.\n".format(n_obs))
print("The variables are: ", variables)

In [None]:
# Common appearance for both panels.
panel_size = dict(plot_width=420, plot_height=600)
marker_style = dict(size=8, line_color="seagreen", fill_color="mediumseagreen", fill_alpha=0.3)

# Pressure is negated to serve as the vertical coordinate of both panels.
depth = -data['PRES']

# Left panel: temperature profile.
p1 = figure(**panel_size)
p1.circle(data['TEMP'], depth, **marker_style)
p1.xaxis.axis_label = "Temperature [C]"
p1.yaxis.axis_label = "Depth [m]"

# Right panel: salinity profile, sharing the vertical range of the left
# panel so both stay aligned.
p2 = figure(**panel_size)
p2.y_range = p1.y_range
p2.circle(data['PSAL'], depth, **marker_style)
p2.xaxis.axis_label = "Salinity"
p2.yaxis.axis_label = "Depth [m]"

# Lay the panels side by side and render them.
p = row(p1, p2)
show(p)

## Global Range: Check for Feasible Values

In [None]:
# Global range check for temperature. The bounds are inclusive, matching the
# closed interval quoted in the panel title, so a value sitting exactly on a
# limit is accepted.
idx_valid = (data['TEMP'] >= -2) & (data['TEMP'] <= 40)

# Feasible values are drawn as green circles, everything else as red triangles.
p1 = figure(plot_width=420, plot_height=600, title="Global Range Check (-2 <= T <= 40)")
p1.circle(data['TEMP'][idx_valid], -data['PRES'][idx_valid], size=8, line_color="seagreen", fill_color="mediumseagreen", fill_alpha=0.3)
p1.triangle(data['TEMP'][~idx_valid], -data['PRES'][~idx_valid], size=8, line_color="red", fill_color="red", fill_alpha=0.3)
p1.xaxis.axis_label = "Temperature [C]"
p1.yaxis.axis_label = "Depth [m]"


# Same check for salinity; idx_valid is reused and refers to salinity from
# here on. Bounds are inclusive, as in the title.
idx_valid = (data['PSAL'] >= 0) & (data['PSAL'] <= 41)

p2 = figure(plot_width=420, plot_height=600, title="Global Range Check (0 <= S <= 41)")
# Share the vertical range with the temperature panel so both stay aligned.
p2.y_range = p1.y_range
p2.circle(data['PSAL'][idx_valid], -data['PRES'][idx_valid], size=8, line_color="seagreen", fill_color="mediumseagreen", fill_alpha=0.3)
p2.triangle(data['PSAL'][~idx_valid], -data['PRES'][~idx_valid], size=8, line_color="red", fill_color="red", fill_alpha=0.3)
p2.xaxis.axis_label = "Practical Salinity"
p2.yaxis.axis_label = "Depth [m]"

p = row(p1, p2)
show(p)

## GTSPP's Spike Check

In [None]:
def spike(x):
    """Spike check as defined by GTSPP.

    For every interior point ``i`` the spike magnitude is::

        |x[i] - (x[i-1] + x[i+1]) / 2| - |(x[i+1] - x[i-1]) / 2|

    Parameters
    ----------
    x : array-like
        Sequence of measurements. Lists and tuples are accepted as well as
        NumPy arrays; neighbours are taken along the first axis.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x``. The first and last elements are
        NaN because they lack one of the two neighbours the test requires.
    """
    # Coerce plain sequences to an array (np.nan * list would raise), while
    # letting ndarray subclasses pass through unchanged.
    x = np.asanyarray(x)
    # Multiplying by NaN gives a float array of the right shape, all NaN.
    y = np.nan * x
    y[1:-1] = np.abs(x[1:-1] - (x[:-2] + x[2:]) / 2.0) - np.abs((x[2:] - x[:-2]) / 2.0)
    return y


The spike check, like many other tests, is already implemented in CoTeDe, so let's use `cotede.qctests.spike()`.

GTSPP recommends a spike threshold equal to 2C for temperature and 0.3 for salinity.

In [None]:
# Spike thresholds recommended by GTSPP: 2 C for temperature and 0.3 for
# salinity. Each variable must be compared against its own threshold.
t_spike_threshold = 2
s_spike_threshold = 0.3

# --- Temperature ---
t_spike = qctests.spike(data["TEMP"])

# Split by spike magnitude. NaN spike values (if any) fail both comparisons
# and therefore appear in neither group.
idx_good = np.absolute(t_spike) <= t_spike_threshold
idx_bad = np.absolute(t_spike) > t_spike_threshold

# Temperature profile: good samples as green circles, spikes as red triangles.
p1 = figure(plot_width=420, plot_height=500)
p1.circle(data['TEMP'][idx_good], -data['PRES'][idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p1.triangle(data['TEMP'][idx_bad], -data['PRES'][idx_bad], size=8, line_color="red", fill_color="red", fill_alpha=0.3)
p1.xaxis.axis_label = "Temperature [C]"
p1.yaxis.axis_label = "Depth [m]"

# Spike magnitude of temperature, on the same vertical range.
p2 = figure(plot_width=420, plot_height=500)
p2.y_range = p1.y_range
p2.circle(t_spike[idx_good], -data['PRES'][idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p2.triangle(t_spike[idx_bad], -data['PRES'][idx_bad], size=8, line_color="red", fill_color="red", fill_alpha=0.3)
p2.xaxis.axis_label = "Spike(T)"
p2.yaxis.axis_label = "Depth [m]"


# --- Salinity ---
s_spike = qctests.spike(data["PSAL"])

# idx_good / idx_bad are reused and refer to salinity from here on.
idx_good = np.absolute(s_spike) <= s_spike_threshold
idx_bad = np.absolute(s_spike) > s_spike_threshold

# Salinity profile.
p3 = figure(plot_width=420, plot_height=500)
p3.y_range = p1.y_range
p3.circle(data['PSAL'][idx_good], -data['PRES'][idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p3.triangle(data['PSAL'][idx_bad], -data['PRES'][idx_bad], size=8, line_color="red", fill_color="red", fill_alpha=0.3)
p3.xaxis.axis_label = "Salinity"
p3.yaxis.axis_label = "Depth [m]"

# Spike magnitude of salinity.
p4 = figure(plot_width=420, plot_height=500)
p4.y_range = p1.y_range
p4.circle(s_spike[idx_good], -data['PRES'][idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p4.triangle(s_spike[idx_bad], -data['PRES'][idx_bad], size=8, line_color="red", fill_color="red", fill_alpha=0.3)
p4.xaxis.axis_label = "Spike(S)"
p4.yaxis.axis_label = "Depth [m]"

# 2x2 grid: temperature on the top row, salinity on the bottom row.
p = column(row(p1, p2), row(p3, p4))
show(p)

## Using CoTeDe QC framework
CoTeDe automates many procedures for QC. Let's start using the standard procedure.

In [None]:
# Run CoTeDe's QC on the whole profile; no configuration is passed here, so
# the library's default procedure is used.
pqc = cotede.ProfileQC(data)

That's it, the primary and secondary sensors were evaluated. First, note that the same variables from the input are available in the output object.

In [None]:
# Compare the variable names of the input dataset with those exposed by the
# QC result object.
print("Variables available in data: {}\n".format(data.keys()))
print("Variables available in pqc: {}\n".format(pqc.keys()))

In [None]:
# pqc.flags is keyed by variable name; list the flag names stored for
# temperature and for salinity.
print("Flags available for temperature {}\n".format(pqc.flags["TEMP"].keys()))
print("Flags available for salinity {}\n".format(pqc.flags["PSAL"].keys()))

The flags follow the IOC standard, thus 1 means good while 4 means bad.
0 is used when no QC was applied. For instance, the spike test is defined so that it depends on the previous and following measurements, thus the first and last data points of the array will always have a spike flag equal to 0.

Let's check the salinity with feasible values:

In [None]:
# Display the flags stored for salinity under the "global_range" test.
pqc.flags["PSAL"]["global_range"]

In [None]:
# Display the flags stored for salinity under the "spike" test.
pqc.flags["PSAL"]["spike"]

Let's check the salinity measurements that are bad or probably bad according to the Global Range check, i.e. unfeasible values of salinity.

In [None]:
# Boolean mask of samples whose global range flag is 3 or higher
# (probably bad or bad).
idx = pqc.flags["PSAL"]["global_range"] >= 3
# Display the salinity values selected by that mask.
pqc["PSAL"][idx]

The magnitudes of the tests are stored in features.

Let's check which features were saved for temperature,

In [None]:
# pqc.features is keyed by variable name; list the feature names stored for
# temperature.
print("Features for temperature: {}\n".format(pqc.features["TEMP"].keys()))

The flag "overall" is the maximum value among all other flags, as recommended by the IOC flagging system.
Therefore, if one measurement is flagged bad (flag=4) in a single test, it will get an overall flag of 4.
Likewise, a measurement with flag 1 means that none of the applied tests raised any suspicion of it being a bad measurement.

In [None]:
# Display the combined ("overall") flag stored for salinity.
pqc.flags["PSAL"]["overall"]

It's the same for salinity. Let's plot the salinity and its respective normalized bias with respect to the WOA.

In [None]:
# Split the samples by the overall salinity flag: <= 2 is treated as good,
# >= 3 as bad.
overall_flag = pqc.flags["PSAL"]["overall"]
idx_good = overall_flag <= 2
idx_bad = overall_flag >= 3

# Series to plot; pressure is negated to serve as the vertical coordinate.
woa_normbias = pqc.features["PSAL"]["woa_normbias"]
salinity = pqc["PSAL"]
pressure = -pqc["PRES"]

# Shared appearance: green circles for good samples, red triangles for bad.
panel_size = dict(plot_width=420, plot_height=500)
good_style = dict(size=8, line_color="green", fill_color="green", fill_alpha=0.3)
bad_style = dict(size=8, line_color="red", fill_color="red", fill_alpha=0.3)

# Left panel: salinity profile.
p1 = figure(**panel_size)
p1.circle(salinity[idx_good], pressure[idx_good], **good_style)
p1.triangle(salinity[idx_bad], pressure[idx_bad], **bad_style)
p1.xaxis.axis_label = "Salinity"
p1.yaxis.axis_label = "Depth [m]"

# Right panel: the "woa_normbias" feature, on the same vertical range.
p2 = figure(**panel_size)
p2.y_range = p1.y_range
p2.circle(woa_normbias[idx_good], pressure[idx_good], **good_style)
p2.triangle(woa_normbias[idx_bad], pressure[idx_bad], **bad_style)
p2.xaxis.axis_label = "WOA normalized bias"
p2.yaxis.axis_label = "Depth [m]"

# Lay the panels side by side and render them.
p = row(p1, p2)
show(p)

Let's look at the salinity with respect to the spike and the WOA normalized bias.
Near the bottom of the profile there are some bad salinity measurements, which are mostly identified by the spike test.
A few measurements aren't critically bad with respect to the spike or the climatology individually.
One of the goals of the Anomaly Detection is to combine multiple features into an overall decision.

In [None]:

# Split the samples by the salinity spike flag: <= 2 is treated as good,
# >= 3 as bad.
idx_good = pqc.flags["PSAL"]["spike"] <= 2
idx_bad = pqc.flags["PSAL"]["spike"] >= 3

# Every series is read from the QC object (rather than mixing pqc and data)
# so both panels are guaranteed to draw from the same source.
pressure = -pqc["PRES"]
salinity = pqc["PSAL"]
s_spike = pqc.features["PSAL"]["spike"]
woa_mean = pqc.features["PSAL"]["woa_mean"]
woa_std = pqc.features["PSAL"]["woa_std"]

# Left panel: spike magnitude of salinity.
p1 = figure(plot_width=500, plot_height=600)
p1.circle(s_spike[idx_good], pressure[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p1.triangle(s_spike[idx_bad], pressure[idx_bad], size=8, line_color="red", fill_color="red", fill_alpha=0.3)
p1.xaxis.axis_label = "Spike(S)"
p1.yaxis.axis_label = "Depth [m]"

# Right panel: salinity, with orange lines at the WOA mean +/- 6 standard
# deviations, on the same vertical range as the left panel.
p2 = figure(plot_width=500, plot_height=600)
p2.y_range = p1.y_range
p2.circle(salinity[idx_good], pressure[idx_good], size=8, line_color="green", fill_color="green", fill_alpha=0.3)
p2.line(woa_mean - 6 * woa_std, pressure, line_width=4, line_color="orange", alpha=0.4)
p2.line(woa_mean + 6 * woa_std, pressure, line_width=4, line_color="orange", alpha=0.4)
p2.triangle(salinity[idx_bad], pressure[idx_bad], size=8, line_color="red", fill_color="red", fill_alpha=0.3)
p2.xaxis.axis_label = "Salinity"
p2.yaxis.axis_label = "Depth [m]"

p = row(p1, p2)
show(p)