Skip to content

Commit

Permalink
Merge d0e12cd into b605b17
Browse files Browse the repository at this point in the history
  • Loading branch information
mwcraig committed Jul 27, 2019
2 parents b605b17 + d0e12cd commit ab5a6a8
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
Expand Up @@ -10,6 +10,9 @@ New Features
- Allow identification of FITS files in ``ImageFileCollection`` based on content
of the files instead of file name extension. [#620, #680]

- Add option to use regular expression matching when filtering items in
``ImageFileCollection``. [#480, #595, #682]

Other Changes and Additions
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
46 changes: 42 additions & 4 deletions ccdproc/image_collection.py
Expand Up @@ -3,6 +3,8 @@
from collections import OrderedDict
import fnmatch
from os import listdir, path
import re

import logging

import numpy as np
Expand Down Expand Up @@ -326,6 +328,12 @@ def files_filtered(self, **kwd):
contains not just the filename, but the full path to each file.
Default is ``False``.
regex_match : bool, keyword-only
If ``True``, then string values in the ``**kwd`` dictionary are
treated as regular expression patterns and matching is done by
regular expression search. The search is always
**case insensitive**.
**kwd :
``**kwd`` is a dict of keywords and values the files must have.
The value '*' represents any value.
Expand Down Expand Up @@ -355,7 +363,8 @@ def files_filtered(self, **kwd):
Notes
-----
Value comparison is case *insensitive* for strings.
Value comparison is case *insensitive* for strings, whether matching
exactly or matching with regular expressions.
"""
# force a copy by explicitly converting to a list
current_file_mask = self.summary['file'].mask.tolist()
Expand Down Expand Up @@ -614,6 +623,16 @@ def _find_keywords_by_values(self, **kwd):
"""
Find files whose keywords have given values.
Parameters
----------
regex_match : bool, optional
If ``True`` match string values by using a regular expression
search instead of equality. Default value is ``False``.
The remaining arguments are keyword/value pairs specifying the
values to match.
``**kwd`` is a dict of keywords and values the files must have.
The value '*' represents any value.
Expand All @@ -625,9 +644,11 @@ def _find_keywords_by_values(self, **kwd):
>>> collection = ImageFileCollection('test/data', keywords=keys)
>>> collection.files_filtered(imagetyp='LIGHT', filter='R')
>>> collection.files_filtered(imagetyp='*', filter='')
>>> collection.files_filtered(imagetyp='bias|filter', regex_match=True)
NOTE: Value comparison is case *insensitive* for strings.
"""
regex_match = kwd.pop('regex_match', False)
keywords = kwd.keys()
values = kwd.values()

Expand All @@ -652,15 +673,25 @@ def _find_keywords_by_values(self, **kwd):
# need to loop explicitly over array rather than using
# where to correctly do string comparison.
have_this_value = np.zeros(len(use_info), dtype=bool)

# We are going to do a regex match no matter what.
if regex_match:
pattern = re.compile(value,
flags=re.IGNORECASE)
else:
# This pattern matches the prior behavior.
pattern = re.compile('^' + value + '$',
flags=re.IGNORECASE)

for idx, file_key_value in enumerate(use_info[key].tolist()):
if value_not_missing[idx]:
try:
value_matches = (
file_key_value.lower() == value.lower())
except AttributeError:
pattern.search(file_key_value) is not None)
except TypeError:
# In case we're dealing with an object column
# there could be values other than strings in it
# so it could fail with an AttributeError.
# so it could fail with a TypeError.
value_matches = False
else:
value_matches = False
Expand Down Expand Up @@ -793,6 +824,13 @@ def _generator(self, return_type,
See `~astropy.nddata.fits_ccddata_reader` for a complete list of
parameters that can be passed through ``ccd_kwargs``.
regex_match : bool, keyword-only
If ``True``, then string values in the ``**kwd`` dictionary are
treated as regular expression patterns and matching is done by
regular expression search. The search is always
**case insensitive**.
**kwd :
Any additional keywords are used to filter the items returned; see
`files_filtered` examples for details.
Expand Down
52 changes: 52 additions & 0 deletions ccdproc/tests/test_image_collection.py
Expand Up @@ -1018,3 +1018,55 @@ def test_type_of_empty_collection(self, triage_setup):
ic = ImageFileCollection(triage_setup.test_dir)
assert ic.summary is None
assert ic.keywords == []

def test_regex_match_for_search(self, triage_setup):
    """Exercise ``files_filtered`` with ``regex_match=True``."""
    collection = ImageFileCollection(triage_setup.test_dir)
    expected = triage_setup.n_test

    bias_files = collection.files_filtered(regex_match=True,
                                           imagetyp='b.*s')
    assert len(bias_files) == expected['bias']

    # An alternation covering both image types should pick up every file
    # in the test set.
    everything = collection.files_filtered(regex_match=True,
                                           imagetyp='bias|light')
    assert len(everything) == expected['files']

    # Attach a column with more varied text so that anchoring, case
    # folding and wildcards can each be checked against known counts.
    collection.summary['match_me'] = [
        'hello',
        'goodbye',
        'bye',
        'byte',
        'good bye hello',
        'dog',
    ]

    # (pattern, number of rows expected to match), checked in order.
    pattern_counts = (
        ('hello', 2),     # found anywhere in the value
        ('^hello', 1),    # anchored at the start of the value
        ('^HeLlo', 1),    # is it really a case-insensitive match?
        ('by.*e', 4),     # wildcard in the middle of the pattern
    )
    for pattern, n_expected in pattern_counts:
        matched = collection.files_filtered(regex_match=True,
                                            match_me=pattern)
        assert len(matched) == n_expected

def test_generator_with_regex(self, triage_setup):
    """Check that the ``headers`` generator accepts ``regex_match``."""
    collection = ImageFileCollection(triage_setup.test_dir)

    # Stays at zero if the generator yields nothing at all.
    light_count = 0
    matching_headers = collection.headers(regex_match=True,
                                          imagetyp='li.*t')
    for light_count, header in enumerate(matching_headers, start=1):
        # Every header produced by the filter must be a light frame.
        assert header['imagetyp'].lower() == 'light'

    assert light_count == triage_setup.n_test['light']

18 changes: 18 additions & 0 deletions docs/ccdproc/image_management.rst
Expand Up @@ -83,8 +83,26 @@ seconds, there is a convenience method ``.files_filtered``::
The optional arguments to ``files_filtered`` are used to filter the list of
files.

Python regular expression patterns can also be used as the value if the
``regex_match`` flag is set. For example, to find all of the images whose
object is in the Kelt exoplanet survey, you might do::

>>> my_files = ic1.files_filtered(regex_match=True, object='kelt.*')

To get all of the images that have image type ``BIAS`` or ``LIGHT`` you
can also use a regular expression pattern::

>>> my_files = ic1.files_filtered(regex_match=True,
... imagetyp='bias|light')

Note that regular expression matching is different from, and much more
flexible than, file name matching (or "globbing") at the command line. The
`Python documentation on the re module <https://docs.python.org/3.7/library/re.html#module-re>`_
is useful for learning about regular expressions.

Sorting files
-------------

Sometimes it is useful to bring the files into a specific order, e.g. if you
make a plot for each object you probably want all images of the same object
next to each other. To do this, the images in a collection can be sorted with
Expand Down

0 comments on commit ab5a6a8

Please sign in to comment.