Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ New Features
- Allow identification of FITS files in ``ImageFileCollection`` based on content
of the files instead of file name extension. [#620, #680]

- Add option to use regular expression matching when filtering items in
``ImageFileCollection``. [#480, #595, #682]

Other Changes and Additions
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
46 changes: 42 additions & 4 deletions ccdproc/image_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from collections import OrderedDict
import fnmatch
from os import listdir, path
import re

import logging

import numpy as np
Expand Down Expand Up @@ -326,6 +328,12 @@ def files_filtered(self, **kwd):
contains not just the filename, but the full path to each file.
Default is ``False``.

regex_match : bool, keyword-only
If ``True``, then string values in the ``**kwd`` dictionary are
treated as regular expression patterns and matching is done by
regular expression search. The search is always
**case insensitive**.

**kwd :
``**kwd`` is dict of keywords and values the files must have.
The value '*' represents any value.
Expand Down Expand Up @@ -355,7 +363,8 @@ def files_filtered(self, **kwd):

Notes
-----
Value comparison is case *insensitive* for strings.
Value comparison is case *insensitive* for strings, whether matching
exactly or matching with regular expressions.
"""
# force a copy by explicitly converting to a list
current_file_mask = self.summary['file'].mask.tolist()
Expand Down Expand Up @@ -614,6 +623,16 @@ def _find_keywords_by_values(self, **kwd):
"""
Find files whose keywords have given values.

Parameters
----------

regex_match : bool, optional
If ``True`` match string values by using a regular expression
search instead of equality. Default value is ``False``.

The remaining arguments are keyword/value pairs specifying the
values to match.

`**kwd` is list of keywords and values the files must have.

The value '*' represents any value.
Expand All @@ -625,9 +644,11 @@ def _find_keywords_by_values(self, **kwd):
>>> collection = ImageFileCollection('test/data', keywords=keys)
>>> collection.files_filtered(imagetyp='LIGHT', filter='R')
>>> collection.files_filtered(imagetyp='*', filter='')
>>> collection.files_filtered(imagetyp='bias|filter', regex_match=True)

NOTE: Value comparison is case *insensitive* for strings.
"""
regex_match = kwd.pop('regex_match', False)
keywords = kwd.keys()
values = kwd.values()

Expand All @@ -652,15 +673,25 @@ def _find_keywords_by_values(self, **kwd):
# need to loop explicitly over array rather than using
# where to correctly do string comparison.
have_this_value = np.zeros(len(use_info), dtype=bool)

# We are going to do a regex match no matter what.
if regex_match:
pattern = re.compile(value,
flags=re.IGNORECASE)
else:
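# Anchor the pattern at both ends so the whole value has to match.
# NOTE: ``value`` is not escaped, so this reproduces the prior
# equality-based behavior only when ``value`` contains no regular
# expression metacharacters.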
pattern = re.compile('^' + value + '$',
flags=re.IGNORECASE)

for idx, file_key_value in enumerate(use_info[key].tolist()):
if value_not_missing[idx]:
try:
value_matches = (
file_key_value.lower() == value.lower())
except AttributeError:
pattern.search(file_key_value) is not None)
except TypeError:
# In case we're dealing with an object column
# there could be values other than strings in it
# so it could fail with an AttributeError.
# so it could fail with a TypeError.
value_matches = False
else:
value_matches = False
Expand Down Expand Up @@ -793,6 +824,13 @@ def _generator(self, return_type,
See `~astropy.nddata.fits_ccddata_reader` for a complete list of
parameters that can be passed through ``ccd_kwargs``.


regex_match : bool, keyword-only
If ``True``, then string values in the ``**kwd`` dictionary are
treated as regular expression patterns and matching is done by
regular expression search. The search is always
**case insensitive**.

**kwd :
Any additional keywords are used to filter the items returned; see
`files_filtered` examples for details.
Expand Down
51 changes: 51 additions & 0 deletions ccdproc/tests/test_image_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,3 +1018,54 @@ def test_type_of_empty_collection(self, triage_setup):
ic = ImageFileCollection(triage_setup.test_dir)
assert ic.summary is None
assert ic.keywords == []

def test_regex_match_for_search(self, triage_setup):
    # Filtering with regex_match=True: string values are used as
    # regular expression patterns rather than compared for equality.
    collection = ImageFileCollection(triage_setup.test_dir)

    bias_only = collection.files_filtered(regex_match=True,
                                          imagetyp='b.*s')
    assert len(bias_only) == triage_setup.n_test['bias']

    # An alternation of both image types should return all of the
    # files in the test set.
    everything = collection.files_filtered(regex_match=True,
                                           imagetyp='bias|light')
    assert len(everything) == triage_setup.n_test['files']

    # Attach a column with more varied content so the patterns below
    # have something interesting to match against.
    collection.summary['match_me'] = [
        'hello',
        'goodbye',
        'bye',
        'byte',
        'good bye hello',
        'dog',
    ]

    # Each entry is (pattern, number of files expected to match);
    # the patterns are checked in the order listed.
    expectations = (
        # Unanchored, so it matches anywhere in the value.
        ('hello', 2),
        # Anchored to the start of the value.
        ('^hello', 1),
        # Same anchor with mixed case: the match is case-insensitive.
        ('^HeLlo', 1),
        # Wildcard in the middle of the pattern.
        ('by.*e', 4),
    )
    for pattern, n_expected in expectations:
        matched = collection.files_filtered(regex_match=True,
                                            match_me=pattern)
        assert len(matched) == n_expected

def test_generator_with_regex(self, triage_setup):
    # The header generator should accept regex_match and yield only
    # the headers whose imagetyp matches the pattern.
    collection = ImageFileCollection(triage_setup.test_dir)
    light_headers = collection.headers(regex_match=True, imagetyp='li.*t')

    # Stays at zero if the generator yields nothing; otherwise it ends
    # up holding the number of headers seen.
    n_seen = 0
    for n_seen, header in enumerate(light_headers, start=1):
        assert header['imagetyp'].lower() == 'light'

    assert n_seen == triage_setup.n_test['light']
18 changes: 18 additions & 0 deletions docs/ccdproc/image_management.rst
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,26 @@ seconds, there is a convenience method ``.files_filtered``::
The optional arguments to ``files_filtered`` are used to filter the list of
files.

Python regular expression patterns can also be used as the value if the
``regex_match`` flag is set. For example, to find all of the images whose
object is in the Kelt exoplanet survey, you might do::

>>> my_files = ic1.files_filtered(regex_match=True, object='kelt.*')

To get all of the images that have image type ``BIAS`` or ``LIGHT`` you
can also use a regular expression pattern::

>>> my_files = ic1.files_filtered(regex_match=True,
... imagetyp='bias|light')

Note that regular expression matching is different from, and much more flexible than,
file name matching (or "globbing") at the command line. The
`Python documentation on the re module <https://docs.python.org/3.7/library/re.html#module-re>`_
is useful for learning about regular expressions.

Sorting files
-------------

Sometimes it is useful to bring the files into a specific order, e.g. if you
make a plot for each object you probably want all images of the same object
next to each other. To do this, the images in a collection can be sorted with
Expand Down