diff --git a/aodntools/timeseries_products/aggregated_timeseries.py b/aodntools/timeseries_products/aggregated_timeseries.py index e508850..2b63f6e 100644 --- a/aodntools/timeseries_products/aggregated_timeseries.py +++ b/aodntools/timeseries_products/aggregated_timeseries.py @@ -13,7 +13,7 @@ from pkg_resources import resource_filename from aodntools import __version__ -from aodntools.timeseries_products.common import NoInputFilesError, check_file +from aodntools.timeseries_products.common import NoInputFilesError, check_file, in_water TEMPLATE_JSON = resource_filename(__name__, 'aggregated_timeseries_template.json') @@ -71,18 +71,6 @@ def get_instrument_id(nc): return '; '.join([deployment_code, instrument, instrument_serial_number]) -def in_water(nc): - """ - cut data the entire dataset to in-water only timestamps, dropping the out-of-water records. - :param nc: xarray dataset - :return: xarray dataset - """ - time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1]) - time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1]) - TIME = nc['TIME'][:] - return nc.where((TIME >= time_deployment_start) & (TIME <= time_deployment_end), drop=True) - - def get_nominal_depth(nc): """ retunr nominal depth from NOMINAL_DEPTH variable or diff --git a/aodntools/timeseries_products/common.py b/aodntools/timeseries_products/common.py index 81c6f63..7a454af 100644 --- a/aodntools/timeseries_products/common.py +++ b/aodntools/timeseries_products/common.py @@ -1,4 +1,5 @@ """Code shared by all timeseries product generating code""" +import numpy as np class NoInputFilesError(Exception): @@ -66,7 +67,8 @@ def check_file(nc, site_code, variables_of_interest, * At least one variable of interest is present * All variables of interest have only the allowed dimensions (TIME, LATITUDE, LONGITUDE) * If LATITUDE or LONIGUTDE are dimension, they have length 1 - * Global attributes time_deployment_start and time_deployment_end exist + * Global attributes time_deployment_start and time_deployment_end exist, and there is at least one + value of the TIME variable that falls within the range they define. * QC flag variables use the IMOS flag conventions :param nc: open xarray dataset @@ -116,8 +118,14 @@ def check_file(nc, site_code, variables_of_interest, error_list.append('no NOMINAL_DEPTH') required_attributes = {'time_deployment_start', 'time_deployment_end'} + have_time_attributes = True for attr in required_attributes - attributes: error_list.append('no {} attribute'.format(attr)) + have_time_attributes = False + + # check for existence of in-water data + if have_time_attributes and not in_water_index(nc).any(): + error_list.append('no in-water data') # check qc flag conventions for VoI and depth/pressure error_list.extend(check_imos_flag_conventions(nc)) @@ -153,3 +161,26 @@ def check_velocity_file(nc, site_code, return check_file(nc, site_code, variables_of_interest=('UCUR', 'VCUR', 'WCUR'), required_variables=required_variables, allowed_dimensions=allowed_dimensions) + + +def in_water_index(nc): + """ + Return a boolean index of only the in-water data in a dataset. + In-water period is defined by the global attributes time_deployment_start and time_deployment_end. + + :param nc: xarray dataset + :return: numpy.ndarray boolean index array + """ + time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1]) + time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1]) + TIME = nc['TIME'][:] + return (TIME >= time_deployment_start) & (TIME <= time_deployment_end) + +def in_water(nc): + """ + cut data to in-water only timestamps, dropping resulting NaN. + + :param nc: xarray dataset + :return: xarray dataset + """ + return nc.where(in_water_index(nc), drop=True) \ No newline at end of file diff --git a/aodntools/timeseries_products/hourly_timeseries.py b/aodntools/timeseries_products/hourly_timeseries.py index 15c6e03..bee6f4e 100644 --- a/aodntools/timeseries_products/hourly_timeseries.py +++ b/aodntools/timeseries_products/hourly_timeseries.py @@ -14,8 +14,7 @@ from aodntools import __version__ from aodntools.timeseries_products import aggregated_timeseries as utils -from aodntools.timeseries_products.common import NoInputFilesError, check_file, get_qc_variable_names - +from aodntools.timeseries_products.common import NoInputFilesError, check_file, get_qc_variable_names, in_water TEMPLATE_JSON = resource_filename(__name__, 'hourly_timeseries_template.json') BINNING_METHOD_JSON = resource_filename(__name__, 'binning_method.json') @@ -66,19 +65,6 @@ def get_parameter_names(nc): return params -def in_water(nc): - """ - cut data to in-water only timestamps, dropping resulting NaN. - - :param nc: xarray dataset - :return: xarray dataset - """ - time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1]) - time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1]) - TIME = nc['TIME'][:] - return nc.where((TIME >= time_deployment_start) & (TIME <= time_deployment_end), drop=True) - - def good_data_only(nc, qcflags): """ mask all the variables with QC for QC value less or equal than the specified @@ -421,10 +407,9 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp 'NOMINAL_DEPTH': get_nominal_depth(nc)}, ignore_index=True) - df_temp = nc_clean.to_dataframe() + # reindex in case TIME had out-of-range values before cleaning + df_temp = nc_clean.reindex({'TIME': nc_clean.TIME.values}).to_dataframe() - ## keep TIME as the only index - df_temp = df_temp.reset_index().set_index('TIME') df_temp = df_temp[parameter_names] df_temp = PDresample_by_hour(df_temp, function_dict, function_stats) # do the magic diff --git a/test_aodntools/timeseries_products/IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220404.nc b/test_aodntools/timeseries_products/IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220404.nc new file mode 100644 index 0000000..d02738a Binary files /dev/null and b/test_aodntools/timeseries_products/IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220404.nc differ diff --git a/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_20200703T001500Z_PH100_FV01_PH100-2007-Aqualogger-520T-96_END-20200907T233000Z_C-20210112T044909Z.nc b/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_20200703T001500Z_PH100_FV01_PH100-2007-Aqualogger-520T-96_END-20200907T233000Z_C-20210112T044909Z.nc new file mode 100644 index 0000000..1fd003a Binary files /dev/null and b/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_20200703T001500Z_PH100_FV01_PH100-2007-Aqualogger-520T-96_END-20200907T233000Z_C-20210112T044909Z.nc differ diff --git a/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_PH100_ALL_FLAGGED_BAD.nc b/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_PH100_ALL_FLAGGED_BAD.nc new file mode 100644 index 0000000..b9015e4 Binary files /dev/null and b/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_PH100_ALL_FLAGGED_BAD.nc differ diff --git a/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_PH100_NO_INWATER_DATA.nc b/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_PH100_NO_INWATER_DATA.nc new file mode 100644 index 0000000..d215f8e Binary files /dev/null and b/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_PH100_NO_INWATER_DATA.nc differ diff --git a/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_SYD100_BAD_TIMESTAMPS.nc b/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_SYD100_BAD_TIMESTAMPS.nc new file mode 100644 index 0000000..de3efb8 Binary files /dev/null and b/test_aodntools/timeseries_products/IMOS_ANMN-NSW_TZ_SYD100_BAD_TIMESTAMPS.nc differ diff --git a/test_aodntools/timeseries_products/test_aggregated_timeseries.py b/test_aodntools/timeseries_products/test_aggregated_timeseries.py index 1a41b63..2055efd 100644 --- a/test_aodntools/timeseries_products/test_aggregated_timeseries.py +++ b/test_aodntools/timeseries_products/test_aggregated_timeseries.py @@ -87,7 +87,8 @@ def test_source_file_attributes(self): self.assertEqual(dataset['source_file'].opendap_url_prefix, 'http://test.opendap.url') def test_all_rejected(self): - self.assertRaises(NoInputFilesError, main_aggregator, [BAD_FILE], 'TEMP', 'NRSROT', input_dir=TEST_ROOT) + self.assertRaises(NoInputFilesError, main_aggregator, [BAD_FILE], 'TEMP', 'NRSROT', + input_dir=TEST_ROOT, output_dir='/tmp') if __name__ == '__main__': diff --git a/test_aodntools/timeseries_products/test_common.py b/test_aodntools/timeseries_products/test_common.py index becf692..382d553 100644 --- a/test_aodntools/timeseries_products/test_common.py +++ b/test_aodntools/timeseries_products/test_common.py @@ -6,7 +6,7 @@ import xarray as xr from aodntools.timeseries_products.common import (check_file, check_velocity_file, get_qc_variable_names, - check_imos_flag_conventions) + check_imos_flag_conventions, in_water_index, in_water) TEST_ROOT = os.path.dirname(__file__) GOOD_TZ_FILE = os.path.join( @@ -80,7 +80,8 @@ def test_bad_velocity_file(self): error_list = check_velocity_file(nc, 'NWSROW') self.assertEqual(set(error_list), {'VCUR variable missing', 'DEPTH variable missing', - "dimension(s) {'DIST_ALONG_BEAMS'} not allowed for UCUR" + "dimension(s) {'DIST_ALONG_BEAMS'} not allowed for UCUR", + 'no in-water data' } ) @@ -94,5 +95,29 @@ def test_am_file(self): } ) + +class TestInWater(unittest.TestCase): + def test_in_water_index_ok(self): + with xr.open_dataset(BAD_TZ_FILE) as nc: + nc.attrs['time_deployment_start'] = '2018-12-13T08:00:00Z' + nc.attrs['time_deployment_end'] = '2018-12-14T00:30:00Z' + index = in_water_index(nc) + self.assertTrue(all(index[:-2])) + self.assertFalse(any(index[-2:])) + + def test_in_water_index_bad(self): + with xr.open_dataset(BAD_V_FILE) as nc: + index = in_water_index(nc) + self.assertFalse(all(index)) + + def test_in_water_ok(self): + with xr.open_dataset(BAD_TZ_FILE) as nc: + nc.attrs['time_deployment_start'] = '2018-12-13T08:00:00Z' + nc.attrs['time_deployment_end'] = '2018-12-14T00:30:00Z' + nc_in = in_water(nc) + + self.assertEqual(len(nc_in.TIME), len(nc.TIME) - 2) + self.assertTrue(all(nc_in.TIME.values == nc.TIME[:-2].values)) + if __name__ == '__main__': unittest.main() diff --git a/test_aodntools/timeseries_products/test_hourly_timeseries.py b/test_aodntools/timeseries_products/test_hourly_timeseries.py index d262cdf..3534494 100644 --- a/test_aodntools/timeseries_products/test_hourly_timeseries.py +++ b/test_aodntools/timeseries_products/test_hourly_timeseries.py @@ -20,6 +20,9 @@ BAD_FILE ] INPUT_PATHS = [os.path.join(TEST_ROOT, f) for f in INPUT_FILES] +EXPECTED_OUTPUT_FILE = os.path.join( + TEST_ROOT, 'IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220404.nc' +) INST_VARIABLES = {'instrument_id', 'source_file', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH'} OBS_VARIABLES = {'instrument_index', 'TIME'} @@ -33,6 +36,17 @@ for s in function_stats: OBS_VARIABLES.add(v + s) +NO_INWATER_DATA_FILE = 'IMOS_ANMN-NSW_TZ_PH100_NO_INWATER_DATA.nc' +PH100_FILES = [ + 'IMOS_ANMN-NSW_TZ_20200703T001500Z_PH100_FV01_PH100-2007-Aqualogger-520T-96_END-20200907T233000Z_C-20210112T044909Z.nc', + 'IMOS_ANMN-NSW_TZ_PH100_ALL_FLAGGED_BAD.nc', + NO_INWATER_DATA_FILE +] + +SYD100_FILES = [ + 'IMOS_ANMN-NSW_TZ_SYD100_BAD_TIMESTAMPS.nc', +] + class TestHourlyTimeseries(BaseTestCase): def test_hourly_aggregator(self): @@ -73,6 +87,15 @@ def test_hourly_aggregator(self): self.assertIn('hourly_timeseries.py', dataset.lineage) self.assertIn(BAD_FILE, dataset.rejected_files) + # check variable values + expected = Dataset(EXPECTED_OUTPUT_FILE) + compare_vars = ('TIME', 'NOMINAL_DEPTH', 'instrument_index', + 'TEMP', 'TEMP_count', 'TEMP_min', 'TEMP_max') + non_match_vars = [var for var in compare_vars + if not all(dataset[var][:] == expected[var][:]) + ] + self.assertEqual(non_match_vars, []) + def test_hourly_aggregator_with_nonqc(self): output_file, bad_files = hourly_aggregator(files_to_aggregate=INPUT_FILES, site_code='NRSROT', @@ -96,6 +119,27 @@ def test_hourly_aggregator_with_nonqc(self): def test_all_rejected(self): self.assertRaises(NoInputFilesError, hourly_aggregator, [BAD_FILE], 'NRSROT', (1, 2), input_dir=TEST_ROOT) + def test_some_files_without_good_data(self): + output_file, bad_files = hourly_aggregator(files_to_aggregate=PH100_FILES, + site_code='PH100', + qcflags=(1, 2), + input_dir=TEST_ROOT, + output_dir='/tmp' + ) + self.assertEqual(1, len(bad_files)) + for path, errors in bad_files.items(): + self.assertEqual(NO_INWATER_DATA_FILE, path) + self.assertIn('no in-water data', errors) + + def test_bad_timestamps(self): + output_file, bad_files = hourly_aggregator(files_to_aggregate=SYD100_FILES, + site_code='SYD100', + qcflags=(1, 2), + input_dir=TEST_ROOT, + output_dir='/tmp' + ) + self.assertEqual(0, len(bad_files)) + if __name__ == '__main__': unittest.main() diff --git a/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py b/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py index 12c1ecc..ac064b0 100644 --- a/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py +++ b/test_aodntools/timeseries_products/test_velocity_aggregated_timeseries.py @@ -66,7 +66,8 @@ def test_main_aggregator(self): self.assertEqual(non_match_vars, []) def test_all_rejected(self): - self.assertRaises(NoInputFilesError, velocity_aggregated, [BAD_FILE], 'NRSROT', input_dir=TEST_ROOT) + self.assertRaises(NoInputFilesError, velocity_aggregated, [BAD_FILE], 'NRSROT', + input_dir=TEST_ROOT, output_dir='/tmp') if __name__ == '__main__': diff --git a/test_requirements.txt b/test_requirements.txt index 37c5457..67aeac6 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,3 +1,3 @@ --index-url https://pypi.python.org/simple/ -r requirements.txt --e .[testing] \ No newline at end of file +-e .[testing]