diff --git a/CHANGES.rst b/CHANGES.rst index b4ff0252..6eed4982 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -10,6 +10,9 @@ New Features - Allow identification of FITS files in ``ImageFileCollection`` based on content of the files instead of file name extension. [#620, #680] +- Add option to use regular expression matching when filtering items in + ``ImageFileCollection``. [#480, #595, #682] + Other Changes and Additions ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/ccdproc/image_collection.py b/ccdproc/image_collection.py index 26f817c4..516c9fcb 100644 --- a/ccdproc/image_collection.py +++ b/ccdproc/image_collection.py @@ -3,6 +3,8 @@ from collections import OrderedDict import fnmatch from os import listdir, path +import re + import logging import numpy as np @@ -326,6 +328,12 @@ def files_filtered(self, **kwd): contains not just the filename, but the full path to each file. Default is ``False``. + regex_match : bool, keyword-only + If ``True``, then string values in the ``**kwd`` dictionary are + treated as regular expression patterns and matching is done by + regular expression search. The search is always + **case insensitive**. + **kwd : ``**kwd`` is dict of keywords and values the files must have. The value '*' represents any value. @@ -355,7 +363,8 @@ def files_filtered(self, **kwd): Notes ----- - Value comparison is case *insensitive* for strings. + Value comparison is case *insensitive* for strings, whether matching + exactly or matching with regular expressions. """ # force a copy by explicitly converting to a list current_file_mask = self.summary['file'].mask.tolist() @@ -614,6 +623,16 @@ def _find_keywords_by_values(self, **kwd): """ Find files whose keywords have given values. + Parameters + ---------- + + regex_match : bool, optional + If ``True`` match string values by using a regular expression + search instead of equality. Default value is ``False``. + + The remaining arguments are keyword/value pairs specifying the + values to match.
+ `**kwd` is list of keywords and values the files must have. The value '*' represents any value. @@ -625,9 +644,11 @@ def _find_keywords_by_values(self, **kwd): >>> collection = ImageFileCollection('test/data', keywords=keys) >>> collection.files_filtered(imagetyp='LIGHT', filter='R') >>> collection.files_filtered(imagetyp='*', filter='') + >>> collection.files_filtered(imagetyp='bias|filter', regex_match=True) NOTE: Value comparison is case *insensitive* for strings. """ + regex_match = kwd.pop('regex_match', False) keywords = kwd.keys() values = kwd.values() @@ -652,15 +673,25 @@ def _find_keywords_by_values(self, **kwd): # need to loop explicitly over array rather than using # where to correctly do string comparison. have_this_value = np.zeros(len(use_info), dtype=bool) + + # We are going to do a regex match no matter what. + if regex_match: + pattern = re.compile(value, + flags=re.IGNORECASE) + else: + # This pattern matches the prior behavior. + pattern = re.compile('^' + value + '$', + flags=re.IGNORECASE) + for idx, file_key_value in enumerate(use_info[key].tolist()): if value_not_missing[idx]: try: value_matches = ( - file_key_value.lower() == value.lower()) - except AttributeError: + pattern.search(file_key_value) is not None) + except TypeError: # In case we're dealing with an object column # there could be values other than strings in it - # so it could fail with an AttributeError. + # so it could fail with a TypeError. value_matches = False else: value_matches = False @@ -793,6 +824,13 @@ def _generator(self, return_type, See `~astropy.nddata.fits_ccddata_reader` for a complete list of parameters that can be passed through ``ccd_kwargs``. + + regex_match : bool, keyword-only + If ``True``, then string values in the ``**kwd`` dictionary are + treated as regular expression patterns and matching is done by + regular expression search. The search is always + **case insensitive**.
+ **kwd : Any additional keywords are used to filter the items returned; see `files_filtered` examples for details. diff --git a/ccdproc/tests/test_image_collection.py b/ccdproc/tests/test_image_collection.py index 583807a8..98a3b73c 100644 --- a/ccdproc/tests/test_image_collection.py +++ b/ccdproc/tests/test_image_collection.py @@ -1018,3 +1018,54 @@ def test_type_of_empty_collection(self, triage_setup): ic = ImageFileCollection(triage_setup.test_dir) assert ic.summary is None assert ic.keywords == [] + + def test_regex_match_for_search(self, triage_setup): + # Test regex matching in searches + + ic = ImageFileCollection(triage_setup.test_dir) + + files = ic.files_filtered(regex_match=True, imagetyp='b.*s') + assert len(files) == triage_setup.n_test['bias'] + + # This should return all of the files in the test set + all_files = ic.files_filtered(regex_match=True, imagetyp='bias|light') + assert len(all_files) == triage_setup.n_test['files'] + + # Add a column with more interesting content and see whether we + # match that. + ic.summary['match_me'] = [ + 'hello', + 'goodbye', + 'bye', + 'byte', + 'good bye hello', + 'dog' + ] + + hello_anywhere = ic.files_filtered(regex_match=True, + match_me='hello') + assert len(hello_anywhere) == 2 + + hello_start = ic.files_filtered(regex_match=True, + match_me='^hello') + assert len(hello_start) == 1 + + # Is it really a case-insensitive match? 
+ hello_start = ic.files_filtered(regex_match=True, + match_me='^HeLlo') + assert len(hello_start) == 1 + + any_bye = ic.files_filtered(regex_match=True, + match_me='by.*e') + assert len(any_bye) == 4 + + def test_generator_with_regex(self, triage_setup): + ic = ImageFileCollection(triage_setup.test_dir) + + n_light = 0 + + for h in ic.headers(regex_match=True, imagetyp='li.*t'): + assert h['imagetyp'].lower() == 'light' + n_light += 1 + + assert n_light == triage_setup.n_test['light'] diff --git a/docs/ccdproc/image_management.rst b/docs/ccdproc/image_management.rst index f97e58c9..df755be7 100644 --- a/docs/ccdproc/image_management.rst +++ b/docs/ccdproc/image_management.rst @@ -83,8 +83,26 @@ seconds, there is a convenience method ``.files_filtered``:: The optional arguments to ``files_filtered`` are used to filter the list of files. +Python regular expression patterns can also be used as the value if the +``regex_match`` flag is set. For example, to find all of the images whose +object is in the Kelt exoplanet survey, you might do:: + + >>> my_files = ic1.files_filtered(regex_match=True, object='kelt.*') + +To get all of the images that have image type ``BIAS`` or ``LIGHT`` you +can also use a regular expression pattern:: + + >>> my_files = ic1.files_filtered(regex_match=True, + ... imagetyp='bias|light') + +Note that regular expression matching is different from, and much more flexible +than, file name matching (or "globbing") at the command line. The +`Python documentation on the re module <https://docs.python.org/3/library/re.html>`_ +is useful for learning about regular expressions. + Sorting files ------------- + Sometimes it is useful to bring the files into a specific order, e.g. if you make a plot for each object you probably want all images of the same object next to each other. To do this, the images in a collection can be sorted with