Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 1 addition & 13 deletions aodntools/timeseries_products/aggregated_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pkg_resources import resource_filename

from aodntools import __version__
from aodntools.timeseries_products.common import NoInputFilesError, check_file
from aodntools.timeseries_products.common import NoInputFilesError, check_file, in_water

TEMPLATE_JSON = resource_filename(__name__, 'aggregated_timeseries_template.json')

Expand Down Expand Up @@ -71,18 +71,6 @@ def get_instrument_id(nc):
return '; '.join([deployment_code, instrument, instrument_serial_number])


def in_water(nc):
    """
    Cut the dataset down to in-water timestamps only, dropping the out-of-water records.
    The in-water period is defined by the global attributes time_deployment_start and
    time_deployment_end.

    :param nc: xarray dataset
    :return: xarray dataset restricted to the in-water period
    """
    # [:-1] strips the trailing "Z" (UTC designator) so np.datetime64 accepts the string
    time_deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
    time_deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
    TIME = nc['TIME'][:]
    return nc.where((TIME >= time_deployment_start) & (TIME <= time_deployment_end), drop=True)


def get_nominal_depth(nc):
"""
return nominal depth from the NOMINAL_DEPTH variable, or
Expand Down
33 changes: 32 additions & 1 deletion aodntools/timeseries_products/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Code shared by all timeseries product generating code"""
import numpy as np


class NoInputFilesError(Exception):
Expand Down Expand Up @@ -66,7 +67,8 @@ def check_file(nc, site_code, variables_of_interest,
* At least one variable of interest is present
* All variables of interest have only the allowed dimensions (TIME, LATITUDE, LONGITUDE)
* If LATITUDE or LONGITUDE are dimensions, they have length 1
* Global attributes time_deployment_start and time_deployment_end exist
* Global attributes time_deployment_start and time_deployment_end exist, and there is at least one
value of the TIME variable that falls within the range they define.
* QC flag variables use the IMOS flag conventions

:param nc: open xarray dataset
Expand Down Expand Up @@ -116,8 +118,14 @@ def check_file(nc, site_code, variables_of_interest,
error_list.append('no NOMINAL_DEPTH')

required_attributes = {'time_deployment_start', 'time_deployment_end'}
have_time_attributes = True
for attr in required_attributes - attributes:
error_list.append('no {} attribute'.format(attr))
have_time_attributes = False

# check for existence of in-water data
if have_time_attributes and not in_water_index(nc).any():
error_list.append('no in-water data')

# check qc flag conventions for VoI and depth/pressure
error_list.extend(check_imos_flag_conventions(nc))
Expand Down Expand Up @@ -153,3 +161,26 @@ def check_velocity_file(nc, site_code,

return check_file(nc, site_code, variables_of_interest=('UCUR', 'VCUR', 'WCUR'),
required_variables=required_variables, allowed_dimensions=allowed_dimensions)


def in_water_index(nc):
    """
    Return a boolean index of only the in-water data in a dataset.
    The in-water period is defined by the global attributes
    time_deployment_start and time_deployment_end (inclusive at both ends).

    :param nc: xarray dataset
    :return: boolean index array aligned with the TIME variable
    """
    # Strip the trailing "Z" (UTC designator) — np.datetime64 rejects it.
    deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
    deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
    timestamps = nc['TIME'][:]
    return (deployment_start <= timestamps) & (timestamps <= deployment_end)

def in_water(nc):
    """
    Cut data to in-water only timestamps, dropping the resulting NaN records.

    :param nc: xarray dataset
    :return: xarray dataset restricted to the in-water period
    """
    mask = in_water_index(nc)
    return nc.where(mask, drop=True)
21 changes: 3 additions & 18 deletions aodntools/timeseries_products/hourly_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@

from aodntools import __version__
from aodntools.timeseries_products import aggregated_timeseries as utils
from aodntools.timeseries_products.common import NoInputFilesError, check_file, get_qc_variable_names

from aodntools.timeseries_products.common import NoInputFilesError, check_file, get_qc_variable_names, in_water

TEMPLATE_JSON = resource_filename(__name__, 'hourly_timeseries_template.json')
BINNING_METHOD_JSON = resource_filename(__name__, 'binning_method.json')
Expand Down Expand Up @@ -66,19 +65,6 @@ def get_parameter_names(nc):
return params


def in_water(nc):
    """
    Restrict the dataset to in-water timestamps only, dropping records
    outside the deployment period (and any resulting NaN).

    :param nc: xarray dataset
    :return: xarray dataset
    """
    # Strip the trailing "Z" (UTC designator) so the strings parse as np.datetime64.
    deployment_start = np.datetime64(nc.attrs['time_deployment_start'][:-1])
    deployment_end = np.datetime64(nc.attrs['time_deployment_end'][:-1])
    timestamps = nc['TIME'][:]
    in_range = (deployment_start <= timestamps) & (timestamps <= deployment_end)
    return nc.where(in_range, drop=True)


def good_data_only(nc, qcflags):
"""
mask all the variables with QC for QC value less or equal than the specified
Expand Down Expand Up @@ -421,10 +407,9 @@ def hourly_aggregator(files_to_aggregate, site_code, qcflags, input_dir='', outp
'NOMINAL_DEPTH': get_nominal_depth(nc)},
ignore_index=True)

df_temp = nc_clean.to_dataframe()
# reindex in case TIME had out-of-range values before cleaning
df_temp = nc_clean.reindex({'TIME': nc_clean.TIME.values}).to_dataframe()

## keep TIME as the only index
df_temp = df_temp.reset_index().set_index('TIME')
df_temp = df_temp[parameter_names]

df_temp = PDresample_by_hour(df_temp, function_dict, function_stats) # do the magic
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ def test_source_file_attributes(self):
self.assertEqual(dataset['source_file'].opendap_url_prefix, 'http://test.opendap.url')

def test_all_rejected(self):
self.assertRaises(NoInputFilesError, main_aggregator, [BAD_FILE], 'TEMP', 'NRSROT', input_dir=TEST_ROOT)
self.assertRaises(NoInputFilesError, main_aggregator, [BAD_FILE], 'TEMP', 'NRSROT',
input_dir=TEST_ROOT, output_dir='/tmp')


if __name__ == '__main__':
Expand Down
29 changes: 27 additions & 2 deletions test_aodntools/timeseries_products/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import xarray as xr

from aodntools.timeseries_products.common import (check_file, check_velocity_file, get_qc_variable_names,
check_imos_flag_conventions)
check_imos_flag_conventions, in_water_index, in_water)

TEST_ROOT = os.path.dirname(__file__)
GOOD_TZ_FILE = os.path.join(
Expand Down Expand Up @@ -80,7 +80,8 @@ def test_bad_velocity_file(self):
error_list = check_velocity_file(nc, 'NWSROW')
self.assertEqual(set(error_list), {'VCUR variable missing',
'DEPTH variable missing',
"dimension(s) {'DIST_ALONG_BEAMS'} not allowed for UCUR"
"dimension(s) {'DIST_ALONG_BEAMS'} not allowed for UCUR",
'no in-water data'
}
)

Expand All @@ -94,5 +95,29 @@ def test_am_file(self):
}
)


class TestInWater(unittest.TestCase):
    """Tests for the in_water_index() and in_water() helpers in common.py."""

    def test_in_water_index_ok(self):
        with xr.open_dataset(BAD_TZ_FILE) as nc:
            # override the deployment period so that the last two timestamps
            # in the file fall after time_deployment_end
            nc.attrs['time_deployment_start'] = '2018-12-13T08:00:00Z'
            nc.attrs['time_deployment_end'] = '2018-12-14T00:30:00Z'
            index = in_water_index(nc)
            self.assertTrue(all(index[:-2]))
            self.assertFalse(any(index[-2:]))

    def test_in_water_index_bad(self):
        # file whose deployment period excludes some of its timestamps
        with xr.open_dataset(BAD_V_FILE) as nc:
            index = in_water_index(nc)
            self.assertFalse(all(index))

    def test_in_water_ok(self):
        with xr.open_dataset(BAD_TZ_FILE) as nc:
            # same deployment period as above: the last two records are
            # out of the water and should be dropped by in_water()
            nc.attrs['time_deployment_start'] = '2018-12-13T08:00:00Z'
            nc.attrs['time_deployment_end'] = '2018-12-14T00:30:00Z'
            nc_in = in_water(nc)

            self.assertEqual(len(nc_in.TIME), len(nc.TIME) - 2)
            self.assertTrue(all(nc_in.TIME.values == nc.TIME[:-2].values))

if __name__ == '__main__':
unittest.main()
44 changes: 44 additions & 0 deletions test_aodntools/timeseries_products/test_hourly_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
BAD_FILE
]
INPUT_PATHS = [os.path.join(TEST_ROOT, f) for f in INPUT_FILES]
EXPECTED_OUTPUT_FILE = os.path.join(
TEST_ROOT, 'IMOS_ANMN-NRS_STZ_20181213_NRSROT_FV02_hourly-timeseries_END-20190523_C-20220404.nc'
)

INST_VARIABLES = {'instrument_id', 'source_file', 'LONGITUDE', 'LATITUDE', 'NOMINAL_DEPTH'}
OBS_VARIABLES = {'instrument_index', 'TIME'}
Expand All @@ -33,6 +36,17 @@
for s in function_stats:
OBS_VARIABLES.add(v + s)

NO_INWATER_DATA_FILE = 'IMOS_ANMN-NSW_TZ_PH100_NO_INWATER_DATA.nc'
PH100_FILES = [
'IMOS_ANMN-NSW_TZ_20200703T001500Z_PH100_FV01_PH100-2007-Aqualogger-520T-96_END-20200907T233000Z_C-20210112T044909Z.nc',
'IMOS_ANMN-NSW_TZ_PH100_ALL_FLAGGED_BAD.nc',
NO_INWATER_DATA_FILE
]

SYD100_FILES = [
'IMOS_ANMN-NSW_TZ_SYD100_BAD_TIMESTAMPS.nc',
]


class TestHourlyTimeseries(BaseTestCase):
def test_hourly_aggregator(self):
Expand Down Expand Up @@ -73,6 +87,15 @@ def test_hourly_aggregator(self):
self.assertIn('hourly_timeseries.py', dataset.lineage)
self.assertIn(BAD_FILE, dataset.rejected_files)

# check variable values
expected = Dataset(EXPECTED_OUTPUT_FILE)
compare_vars = ('TIME', 'NOMINAL_DEPTH', 'instrument_index',
'TEMP', 'TEMP_count', 'TEMP_min', 'TEMP_max')
non_match_vars = [var for var in compare_vars
if not all(dataset[var][:] == expected[var][:])
]
self.assertEqual(non_match_vars, [])

def test_hourly_aggregator_with_nonqc(self):
output_file, bad_files = hourly_aggregator(files_to_aggregate=INPUT_FILES,
site_code='NRSROT',
Expand All @@ -96,6 +119,27 @@ def test_hourly_aggregator_with_nonqc(self):
    def test_all_rejected(self):
        # when every input file fails the compatibility checks, the
        # aggregator has nothing to aggregate and must raise
        self.assertRaises(NoInputFilesError, hourly_aggregator, [BAD_FILE], 'NRSROT', (1, 2), input_dir=TEST_ROOT)

    def test_some_files_without_good_data(self):
        # One of the PH100 inputs has no timestamps within its deployment
        # period; it should be the only rejected file, flagged with the
        # 'no in-water data' error, while the other files aggregate fine.
        output_file, bad_files = hourly_aggregator(files_to_aggregate=PH100_FILES,
                                                   site_code='PH100',
                                                   qcflags=(1, 2),
                                                   input_dir=TEST_ROOT,
                                                   output_dir='/tmp'
                                                   )
        self.assertEqual(1, len(bad_files))
        for path, errors in bad_files.items():
            self.assertEqual(NO_INWATER_DATA_FILE, path)
            self.assertIn('no in-water data', errors)

    def test_bad_timestamps(self):
        # A file containing out-of-range timestamps should still be accepted
        # (presumably its out-of-water records are dropped during cleaning —
        # see the TIME reindex in hourly_aggregator), so no files are rejected.
        output_file, bad_files = hourly_aggregator(files_to_aggregate=SYD100_FILES,
                                                   site_code='SYD100',
                                                   qcflags=(1, 2),
                                                   input_dir=TEST_ROOT,
                                                   output_dir='/tmp'
                                                   )
        self.assertEqual(0, len(bad_files))


if __name__ == '__main__':
unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ def test_main_aggregator(self):
self.assertEqual(non_match_vars, [])

def test_all_rejected(self):
self.assertRaises(NoInputFilesError, velocity_aggregated, [BAD_FILE], 'NRSROT', input_dir=TEST_ROOT)
self.assertRaises(NoInputFilesError, velocity_aggregated, [BAD_FILE], 'NRSROT',
input_dir=TEST_ROOT, output_dir='/tmp')


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion test_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
--index-url https://pypi.python.org/simple/
-r requirements.txt
-e .[testing]
-e .[testing]