Merge d0e12cd into b605b17

astropy · Jul 27, 2019 · ab5a6a8 · ab5a6a8
2 parents b605b17 + d0e12cd
commit ab5a6a8
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 4 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -10,6 +10,9 @@ New Features
 - Allow identification of FITS files in ``ImageFileCollection`` based on content
   of the files instead of file name extension. [#620, #680]
 
+- Add option to use regular expression matching when filtering items in
+  ``ImageFileCollection``. [#480, #595, #682]
+
 Other Changes and Additions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/ccdproc/image_collection.py b/ccdproc/image_collection.py
@@ -3,6 +3,8 @@
 from collections import OrderedDict
 import fnmatch
 from os import listdir, path
+import re
+
 import logging
 
 import numpy as np
@@ -326,6 +328,12 @@ def files_filtered(self, **kwd):
             contains not just the filename, but the full path to each file.
             Default is ``False``.
 
+        regex_match : bool, keyword-only
+            If ``True``, then string values in the ``**kwd`` dictionary are
+            treated as regular expression patterns and matching is done by
+            regular expression search. The search is always
+            **case insensitive**.
+
         **kwd :
             ``**kwd`` is dict of keywords and values the files must have.
             The value '*' represents any value.
@@ -355,7 +363,8 @@ def files_filtered(self, **kwd):
 
         Notes
         -----
-        Value comparison is case *insensitive* for strings.
+        Value comparison is case *insensitive* for strings, whether matching
+        exactly or matching with regular expressions.
         """
         # force a copy by explicitly converting to a list
         current_file_mask = self.summary['file'].mask.tolist()
@@ -614,6 +623,16 @@ def _find_keywords_by_values(self, **kwd):
         """
         Find files whose keywords have given values.
 
+        Parameters
+        ----------
+
+        regex_match : bool, optional
+            If ``True`` match string values by using a regular expression
+            search instead of equality. Default value is ``False``.
+
+        The remaining arguments are keyword/value pairs specifying the
+        values to match.
+
         `**kwd` is list of keywords and values the files must have.
 
         The value '*' represents any value.
@@ -625,9 +644,11 @@ def _find_keywords_by_values(self, **kwd):
             >>> collection = ImageFileCollection('test/data', keywords=keys)
             >>> collection.files_filtered(imagetyp='LIGHT', filter='R')
             >>> collection.files_filtered(imagetyp='*', filter='')
+            >>> collection.files_filtered(imagetyp='bias|light', regex_match=True)
 
         NOTE: Value comparison is case *insensitive* for strings.
         """
+        regex_match = kwd.pop('regex_match', False)
         keywords = kwd.keys()
         values = kwd.values()
 
@@ -652,15 +673,25 @@ def _find_keywords_by_values(self, **kwd):
                     # need to loop explicitly over array rather than using
                     # where to correctly do string comparison.
                     have_this_value = np.zeros(len(use_info), dtype=bool)
+
+                    # We are going to do a regex match no matter what.
+                    if regex_match:
+                        pattern = re.compile(value,
+                                             flags=re.IGNORECASE)
+                    else:
+                        # Anchored to mimic equality (value is not escaped).
+                        pattern = re.compile('^' + value + '$',
+                                             flags=re.IGNORECASE)
+
                     for idx, file_key_value in enumerate(use_info[key].tolist()):
                         if value_not_missing[idx]:
                             try:
                                 value_matches = (
-                                    file_key_value.lower() == value.lower())
-                            except AttributeError:
+                                    pattern.search(file_key_value) is not None)
+                            except TypeError:
                                 # In case we're dealing with an object column
                                 # there could be values other than strings in it
-                                # so it could fail with an AttributeError.
+                                # so it could fail with a TypeError.
                                 value_matches = False
                         else:
                             value_matches = False
@@ -793,6 +824,13 @@ def _generator(self, return_type,
             See `~astropy.nddata.fits_ccddata_reader` for a complete list of
             parameters that can be passed through ``ccd_kwargs``.
 
+
+        regex_match : bool, keyword-only
+            If ``True``, then string values in the ``**kwd`` dictionary are
+            treated as regular expression patterns and matching is done by
+            regular expression search. The search is always
+            **case insensitive**.
+
         **kwd :
             Any additional keywords are used to filter the items returned; see
             `files_filtered` examples for details.

diff --git a/ccdproc/tests/test_image_collection.py b/ccdproc/tests/test_image_collection.py
@@ -1018,3 +1018,55 @@ def test_type_of_empty_collection(self, triage_setup):
         ic = ImageFileCollection(triage_setup.test_dir)
         assert ic.summary is None
         assert ic.keywords == []
+
+    def test_regex_match_for_search(self, triage_setup):
+        # Test regex matching in searches
+
+        ic = ImageFileCollection(triage_setup.test_dir)
+
+        files = ic.files_filtered(regex_match=True, imagetyp='b.*s')
+        assert len(files) == triage_setup.n_test['bias']
+
+        # This should return all of the files in the test set
+        all_files = ic.files_filtered(regex_match=True, imagetyp='bias|light')
+        assert len(all_files) == triage_setup.n_test['files']
+
+        # Add a column with more interesting content and see whether we
+        # match that.
+        ic.summary['match_me'] = [
+            'hello',
+            'goodbye',
+            'bye',
+            'byte',
+            'good bye hello',
+            'dog'
+        ]
+
+        hello_anywhere = ic.files_filtered(regex_match=True,
+                                           match_me='hello')
+        assert len(hello_anywhere) == 2
+
+        hello_start = ic.files_filtered(regex_match=True,
+                                        match_me='^hello')
+        assert len(hello_start) == 1
+
+        # Is it really a case-insensitive match?
+        hello_start = ic.files_filtered(regex_match=True,
+                                        match_me='^HeLlo')
+        assert len(hello_start) == 1
+
+        any_bye = ic.files_filtered(regex_match=True,
+                                    match_me='by.*e')
+        assert len(any_bye) == 4
+
+    def test_generator_with_regex(self, triage_setup):
+        ic = ImageFileCollection(triage_setup.test_dir)
+
+        n_light = 0
+
+        for h in ic.headers(regex_match=True, imagetyp='li.*t'):
+            assert h['imagetyp'].lower() == 'light'
+            n_light += 1
+
+        assert n_light == triage_setup.n_test['light']
+
diff --git a/docs/ccdproc/image_management.rst b/docs/ccdproc/image_management.rst
@@ -83,8 +83,26 @@ seconds, there is a convenience method ``.files_filtered``::
 The optional arguments to ``files_filtered`` are used to filter the list of
 files.
 
+Python regular expression patterns can also be used as the value if the
+``regex_match`` flag is set. For example, to find all of the images whose
+object is in the Kelt exoplanet survey, you might do::
+
+    >>> my_files = ic1.files_filtered(regex_match=True, object='kelt.*')
+
+To get all of the images that have image type ``BIAS`` or ``LIGHT`` you
+can also use a regular expression pattern::
+
+    >>> my_files = ic1.files_filtered(regex_match=True,
+    ...                               imagetyp='bias|light')
+
+Note that regular expression matching is different from, and much more
+flexible than, file name matching (or "globbing") at the command line. The
+`Python documentation on the re module <https://docs.python.org/3.7/library/re.html#module-re>`_
+is useful for learning about regular expressions.
+
 Sorting files
 -------------
+
 Sometimes it is useful to bring the files into a specific order, e.g. if you
 make a plot for each object you probably want all images of the same object
 next to each other. To do this, the images in a collection can be sorted with