/
woa_normbias.py
216 lines (175 loc) · 7.42 KB
/
woa_normbias.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# -*- coding: utf-8 -*-
"""
Create a test to check if cfg['vars'] does exist in the climatology file,
to avoid error like have t_mn in one and t_an in the other.
An idea to improve the climatology test. Gridpoints estimated from few
measurements should be less trustable. Here I'm using a threshold of at
least 3 samples to be considered. Deep ocean is in general quite
stable, so few measurements should be sufficient. One possibility is to
estimate the standard error, which depends on the ammount of samples and
estimated standard deviation. This would be the 'uncertainty on the
estimated average climatology. Any measurement in that range would be
considered statistically identical to the climatology. Above that
difference, it would be normalized by the standard deviation. Therefore,
stable areas would be less tolerant to variability, even with few samples.
"""
from datetime import timedelta
import logging
import numpy as np
from numpy import ma
from oceansdb import WOA
from .qctests import QCCheckVar
from ..utils import extract_coordinates, extract_time, day_of_year, extract_depth
module_logger = logging.getLogger(__name__)
def woa_normbias(data, varname, attrs=None, use_standard_error=False):
"""
Notes
-----
- Include arguments to overwrite target variable (timename=None, latname=None, lonname=None)
"""
try:
doy = day_of_year(extract_time(data, attrs))
except LookupError as err:
module_logger.error("Missing time")
raise
try:
# Note that QCCheck fallback to self.data.attrs if attrs not given
lat, lon = extract_coordinates(data, attrs)
except LookupError:
module_logger.error("Missing geolocation (lat/lon)")
raise
kwargs = {"lat": lat, "lon": lon}
if (np.size(lat) > 1) | (np.size(lon) > 1):
dLmax = max(np.max(lat) - np.min(lat), np.max(lon) - np.min(lon))
if dLmax >= 0.01:
mode = "track"
# kwargs["alongtrack_axis"] = ['lat', 'lon']
else:
mode = "profile"
kwargs = {
"lat": np.mean(lat),
"lon": np.mean(lon),
}
module_logger.warning("Multiple lat/lon positions but too close to each other so it will be considered a single position for the WOA comparison. lat: {}, lon: {}".format(kwargs["lat"], kwargs["lon"]))
else:
mode = "profile"
depth = extract_depth(data)
db = WOA()
# This must go away. This was a trick to handle Seabird CTDs, but
# now that seabird is a different package it should be handled there.
if isinstance(varname, str) and (varname[-1] == "2"):
vtype = varname[:-1]
else:
vtype = varname
woa_vars = [
"mean",
"standard_deviation",
"standard_error",
"number_of_observations",
]
# Eventually the case of some invalid depth levels will be handled by
# OceansDB and the following steps will be simplified.
valid_depth = depth
if (np.size(depth) > 0):
idx = ~ma.getmaskarray(depth) & (np.array(depth) >= 0) & np.isfinite(depth)
if not idx.any():
module_logger.error("Invalid depth(s) for WOA comparison: {}".format(depth))
raise IndexError
elif not idx.all():
valid_depth = depth[idx]
if mode == "track":
woa = db[vtype].track(var=woa_vars, doy=doy, depth=valid_depth, **kwargs)
else:
woa = db[vtype].extract(var=woa_vars, doy=doy, depth=valid_depth, **kwargs)
if not np.all(depth == valid_depth):
for v in woa.keys():
tmp = ma.masked_all(depth.shape, dtype=woa[v].dtype)
tmp[idx] = woa[v]
woa[v] = tmp
features = {
"woa_mean": woa["mean"],
"woa_std": woa["standard_deviation"],
"woa_nsamples": woa["number_of_observations"],
"woa_se": woa["standard_error"],
}
features["woa_bias"] = data[varname] - features["woa_mean"]
for v in features:
idx = ma.getmaskarray(features[v])
if idx.any():
if v == "woa_nsamples":
missing_value = -1
else:
missing_value = np.nan
features[v][idx] = missing_value
features[v] = np.array(features[v])
# if use_standard_error = True, the comparison with the climatology
# considers the standard error, i.e. the bias will be only the
# ammount above the standard error range.
if use_standard_error is True:
standard_error = features["woa_std"]
idx = features["woa_nsamples"] > 0
standard_error[~idx] = np.nan
standard_error[idx] /= features["woa_nsamples"][idx] ** 0.5
idx = np.absolute(features["woa_bias"]) <= standard_error
features["woa_bias"][idx] = 0
idx = np.absolute(features["woa_bias"]) > standard_error
features["woa_bias"][idx] -= (
np.sign(features["woa_bias"][idx]) * standard_error[idx]
)
features["woa_normbias"] = features["woa_bias"] / features["woa_std"]
return features
class WOA_NormBias(QCCheckVar):
"""Compares measurements with WOA climatology
Notes
-----
* Although using standard error is a good idea, the default is to don't use
standard error to estimate the bias to follow the traaditional approach.
This can have a signifcant impact in the deep oceans and regions lacking
extensive sampling.
FIXME: Move this procedure into a class to conform with the new system
and include a limit in minimum ammount of samples to trust it. For
example, consider as masked all climatologic values estimated from
less than 5 samples.
"""
flag_bad = 3
use_standard_error = False
# 3 is the possible minimum to estimate the std, but I shold use higher.
min_samples = 3
def __init__(self, data, varname, cfg=None, autoflag=True):
try:
self.use_standard_error = cfg["use_standard_error"]
except (KeyError, TypeError):
module_logger.debug("use_standard_error undefined. Using default value")
try:
self.min_samples = cfg["min_samples"]
except (KeyError, TypeError):
module_logger.debug("min_samples undefined. Using default value")
super().__init__(data, varname, cfg, autoflag)
def set_features(self):
try:
self.features = woa_normbias(self.data, self.varname, self.attrs)
except LookupError:
self.features = {}
def test(self):
self.flags = {}
threshold = self.cfg["threshold"]
assert (np.size(threshold) == 1) and (threshold is not None)
flag = np.zeros(self.data[self.varname].shape, dtype="i1")
if "woa_normbias" not in self.features:
self.flags["woa_normbias"] = flag
return
normbias_abs = np.absolute(self.features["woa_normbias"])
ind = np.nonzero(
(self.features["woa_nsamples"] >= self.min_samples)
& np.array(normbias_abs <= threshold)
)
flag[ind] = self.flag_good
ind = np.nonzero(
(self.features["woa_nsamples"] >= self.min_samples)
& np.array(normbias_abs > threshold)
)
flag[ind] = self.flag_bad
# Flag as 9 any masked input value
x = self.data[self.varname]
flag[ma.getmaskarray(x) | ~np.isfinite(np.array(x))] = 9
self.flags["woa_normbias"] = flag