forked from pytroll/satpy
/
__init__.py
510 lines (424 loc) · 21.6 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2015-2018 Satpy developers
#
# This file is part of satpy.
#
# satpy is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# satpy is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# satpy. If not, see <http://www.gnu.org/licenses/>.
"""Shared objects of the various reader classes."""
import logging
import os
import warnings
from datetime import datetime, timedelta
import yaml
try:
from yaml import UnsafeLoader
except ImportError:
from yaml import Loader as UnsafeLoader
from satpy.config import (config_search_paths, get_environ_config_dir,
glob_config)
from .yaml_reader import (AbstractYAMLReader,
load_yaml_configs as load_yaml_reader_configs)
LOG = logging.getLogger(__name__)
# Old Name -> New Name
OLD_READER_NAMES = {
}
def group_files(files_to_sort, reader=None, time_threshold=10,
                group_keys=None, ppp_config_dir=None, reader_kwargs=None):
    """Group series of files by file pattern information.

    By default files are grouped by the ``start_time`` extracted from their
    filenames (when the reader's file pattern provides one). Each dictionary
    produced by this function can be passed as ``filenames`` to a `Scene`,
    making it easy to build a series of `Scene` objects.

    .. versionadded:: 0.12

    Args:
        files_to_sort (iterable): File paths to sort in to group
        reader (str or Collection[str]): Reader or readers whose file patterns
            should be used to sort files. If not given, try all readers (slow,
            adding a list of readers is strongly recommended).
        time_threshold (int): Number of seconds used to consider time elements
            in a group as being equal. For example, if the 'start_time' item
            is used to group files then any time within `time_threshold`
            seconds of the first file's 'start_time' will be seen as occurring
            at the same time.
        group_keys (list or tuple): File pattern information to use to group
            files. Keys are sorted in order and only the first key is used when
            comparing datetime elements with `time_threshold` (see above). This
            means it is recommended that datetime values should only come from
            the first key in ``group_keys``. Otherwise, there is a good chance
            that files will not be grouped properly (datetimes being barely
            unequal). Defaults to a reader's ``group_keys`` configuration (set
            in YAML), otherwise ``('start_time',)``. When passing multiple
            readers, passing group_keys is strongly recommended as the
            behaviour without doing so is undefined.
        ppp_config_dir (str): Root user configuration directory for Satpy.
            This will be deprecated in the future, but is here for consistency
            with other Satpy features.
        reader_kwargs (dict): Additional keyword arguments to pass to reader
            creation.

    Returns:
        List of dictionaries mapping 'reader' to a list of filenames.
        Each of these dictionaries can be passed as ``filenames`` to
        a `Scene` object.
    """
    # normalize a single reader name into a one-element list
    if reader is not None and not isinstance(reader, (list, tuple)):
        reader = [reader]
    reader_files = _assign_files_to_readers(
        files_to_sort, reader, ppp_config_dir, reader_kwargs or {})
    if reader is None:
        # no readers requested explicitly: use whichever ones matched files
        reader = reader_files.keys()
    file_keys = _get_file_keys_for_reader_files(reader_files,
                                                group_keys=group_keys)
    sorted_groups = _get_sorted_file_groups(file_keys, time_threshold)
    # one output dict per time group, with an entry for every requested reader
    grouped = []
    for group in sorted_groups.values():
        grouped.append({name: group.get(name, []) for name in reader})
    return grouped
def _assign_files_to_readers(files_to_sort, reader_names, ppp_config_dir,
                             reader_kwargs):
    """Assign files to readers.

    Given a list of file names (paths), match those to reader instances.

    Internal helper for group_files.

    Args:
        files_to_sort (Collection[str]): Files to assign to readers.
        reader_names (Collection[str]): Readers to consider
        ppp_config_dir (str): Root configuration directory for Satpy.
        reader_kwargs (Mapping): Extra keyword arguments for reader creation.

    Returns:
        Mapping[str, Tuple[reader, Set[str]]]
        Mapping where the keys are reader names and the values are tuples of
        (reader_configs, filenames).

    Raises:
        ValueError: if any file cannot be matched to a reader.
    """
    remaining = set(files_to_sort)
    assignments = {}
    for reader_configs in configs_for_reader(reader_names, ppp_config_dir):
        try:
            reader = load_reader(reader_configs, **reader_kwargs)
        except yaml.constructor.ConstructorError:
            LOG.exception(
                f"ConstructorError loading {reader_configs!s}, "
                "probably a missing dependency, skipping "
                "corresponding reader (if you did not explicitly "
                "specify the reader, Satpy tries all; performance "
                "will improve if you pass readers explicitly).")
            continue
        # claim the files this reader's patterns recognize; each file is
        # matched by at most one reader because we remove claimed files
        matched = set(reader.filter_selected_filenames(remaining))
        remaining -= matched
        # record readers that matched something, or every reader the user
        # explicitly asked for (even when it matched nothing)
        if matched or reader_names is not None:
            assignments[reader.info["name"]] = (reader, matched)
    if remaining:
        raise ValueError("No matching readers found for these files: " +
                         ", ".join(remaining))
    return assignments
def _get_file_keys_for_reader_files(reader_files, group_keys=None):
"""From a mapping from _assign_files_to_readers, get file keys.
Given a mapping where each key is a reader name and each value is a
tuple of reader instance (typically FileYAMLReader) and a collection
of files, return a mapping with the same keys, but where the values are
lists of tuples of (keys, filename), where keys are extracted from the filenames
according to group_keys and filenames are the names those keys were
extracted from.
Internal helper for group_files.
Returns:
Mapping[str, List[Tuple[Tuple, str]]], as described.
"""
file_keys = {}
for (reader_name, (reader_instance, files_to_sort)) in reader_files.items():
if group_keys is None:
group_keys = reader_instance.info.get('group_keys', ('start_time',))
file_keys[reader_name] = []
# make a copy because filename_items_for_filetype will modify inplace
files_to_sort = set(files_to_sort)
for _, filetype_info in reader_instance.sorted_filetype_items():
for f, file_info in reader_instance.filename_items_for_filetype(files_to_sort, filetype_info):
group_key = tuple(file_info.get(k) for k in group_keys)
if all(g is None for g in group_key):
warnings.warn(
f"Found matching file {f:s} for reader "
"{reader_name:s}, but none of group keys found. "
"Group keys requested: " + ", ".join(group_keys),
UserWarning)
file_keys[reader_name].append((group_key, f))
return file_keys
def _get_sorted_file_groups(all_file_keys, time_threshold):
"""Get sorted file groups.
Get a list of dictionaries, where each list item consists of a dictionary
mapping a tuple of keys to a mapping of reader names to files. The files
listed in each list item are considered to be grouped within the same time.
Args:
all_file_keys, as returned by _get_file_keys_for_reader_files
time_threshold: temporal threshold
Returns:
List[Mapping[Tuple, Mapping[str, List[str]]]], as described
Internal helper for group_files.
"""
# flatten to get an overall sorting; put the name in the middle in the
# interest of sorting
flat_keys = ((v[0], rn, v[1]) for (rn, vL) in all_file_keys.items() for v in vL)
prev_key = None
threshold = timedelta(seconds=time_threshold)
# file_groups is sorted, because dictionaries are sorted by insertion
# order in Python 3.7+
file_groups = {}
for gk, rn, f in sorted(flat_keys):
# use first element of key as time identifier (if datetime type)
if prev_key is None:
is_new_group = True
prev_key = gk
elif isinstance(gk[0], datetime):
# datetimes within threshold difference are "the same time"
is_new_group = (gk[0] - prev_key[0]) > threshold
else:
is_new_group = gk[0] != prev_key[0]
# compare keys for those that are found for both the key and
# this is a generator and is not computed until the if statement below
# when we know that `prev_key` is not None
vals_not_equal = (this_val != prev_val for this_val, prev_val in zip(gk[1:], prev_key[1:])
if this_val is not None and prev_val is not None)
# if this is a new group based on the first element
if is_new_group or any(vals_not_equal):
file_groups[gk] = {rn: [f]}
prev_key = gk
else:
if rn not in file_groups[prev_key]:
file_groups[prev_key][rn] = [f]
else:
file_groups[prev_key][rn].append(f)
return file_groups
def read_reader_config(config_files, loader=UnsafeLoader):
    """Read the reader `config_files` and return the extracted reader metadata."""
    # load_yaml_reader_configs merges all given YAML files; only the
    # top-level 'reader' section is of interest here
    full_config = load_yaml_reader_configs(*config_files, loader=loader)
    return full_config['reader']
def load_reader(reader_configs, **reader_kwargs):
    """Import and set up the reader described by *reader_configs*.

    Args:
        reader_configs (Sequence[str]): Paths to the YAML configuration
            files describing a single reader.
        **reader_kwargs: Extra keyword arguments forwarded to the reader's
            constructor.

    Returns:
        The reader instance created by
        ``AbstractYAMLReader.from_config_files``.
    """
    return AbstractYAMLReader.from_config_files(*reader_configs, **reader_kwargs)
def configs_for_reader(reader=None, ppp_config_dir=None):
    """Generate reader configuration files for one or more readers.

    Args:
        reader (Optional[str or list]): Yield configs only for these readers.
            Entries may be reader names or YAML config filenames. When None,
            configs for every discoverable reader are yielded.
        ppp_config_dir (Optional[str]): Additional configuration directory
            to search for reader configuration files.

    Returns: Generator of lists of configuration files

    Raises:
        ValueError: for a deprecated reader name or an unknown reader.
    """
    search_paths = (ppp_config_dir,) if ppp_config_dir else tuple()
    if reader is None:
        # no reader requested: offer every reader config we can find
        config_files = set(glob_config(os.path.join('readers', '*.yaml'),
                                       *search_paths))
    else:
        if not isinstance(reader, (list, tuple)):
            reader = [reader]
        # reject old (renamed) reader names before building config filenames
        accepted = []
        for reader_name in reader:
            if not reader_name.endswith('.yaml') and reader_name in OLD_READER_NAMES:
                new_name = OLD_READER_NAMES[reader_name]
                # Satpy 0.11 only displays a warning
                # Satpy 0.13 will raise an exception
                # Satpy 0.15 or 1.0, remove exception and mapping
                raise ValueError("Reader name '{}' has been deprecated, use '{}' instead.".format(reader_name, new_name))
            accepted.append(reader_name)
        # each entry may be a config filename or a bare reader name
        config_files = [r if r.endswith('.yaml') else r + '.yaml' for r in accepted]
    for config_file in config_files:
        config_basename = os.path.basename(config_file)
        reader_name = os.path.splitext(config_basename)[0]
        reader_configs = config_search_paths(
            os.path.join("readers", config_basename), *search_paths)
        if not reader_configs:
            # either the reader they asked for does not exist
            # or satpy is improperly configured and can't find its own readers
            raise ValueError("No reader named: {}".format(reader_name))
        yield reader_configs
def available_readers(as_dict=False):
    """Available readers based on current configuration.

    Args:
        as_dict (bool): Optionally return reader information as a dictionary.
                        Default: False

    Returns: List of available reader names. If `as_dict` is `True` then
             a list of dictionaries including additionally reader information
             is returned.
    """
    found = []
    for reader_configs in configs_for_reader():
        try:
            info = read_reader_config(reader_configs)
        except (KeyError, IOError, yaml.YAMLError):
            # a reader config that cannot be parsed is simply unavailable
            LOG.debug("Could not import reader config from: %s", reader_configs)
            LOG.debug("Error loading YAML", exc_info=True)
            continue
        found.append(info if as_dict else info['name'])
    # sort dictionaries by reader name, plain strings directly
    sort_key = (lambda reader_info: reader_info['name']) if as_dict else None
    return sorted(found, key=sort_key)
def find_files_and_readers(start_time=None, end_time=None, base_dir=None,
                           reader=None, sensor=None, ppp_config_dir=None,
                           filter_parameters=None, reader_kwargs=None,
                           missing_ok=False, fs=None):
    """Find files matching the provided parameters.

    Use `start_time` and/or `end_time` to limit found filenames by the times
    in the filenames (not the internal file metadata). Files are matched if
    they fall anywhere within the range specified by these parameters.

    Searching is **NOT** recursive.

    Files may be either on-disk or on a remote file system. By default,
    files are searched for locally. Users can search on remote filesystems by
    passing an instance of an implementation of
    `fsspec.spec.AbstractFileSystem` (strictly speaking, any object of a class
    implementing a ``glob`` method works).

    If locating files on a local file system, the returned dictionary
    can be passed directly to the `Scene` object through the `filenames`
    keyword argument. If it points to a remote file system, it is the
    responsibility of the user to download the files first (directly
    reading from cloud storage is not currently available in Satpy).

    The behaviour of time-based filtering depends on whether or not the filename
    contains information about the end time of the data or not:

      - if the end time is not present in the filename, the start time of the filename
        is used and has to fall between (inclusive) the requested start and end times
      - otherwise, the timespan of the filename has to overlap the requested timespan

    Example usage for querying a s3 filesystem using the s3fs module:

    >>> import s3fs, satpy.readers, datetime
    >>> satpy.readers.find_files_and_readers(
    ...     base_dir="s3://noaa-goes16/ABI-L1b-RadF/2019/321/14/",
    ...     fs=s3fs.S3FileSystem(anon=True),
    ...     reader="abi_l1b",
    ...     start_time=datetime.datetime(2019, 11, 17, 14, 40))
    {'abi_l1b': [...]}

    Args:
        start_time (datetime): Limit used files by starting time.
        end_time (datetime): Limit used files by ending time.
        base_dir (str): The directory to search for files containing the
                        data to load. Defaults to the current directory.
        reader (str or list): The name of the reader to use for loading the data or a list of names.
        sensor (str or list): Limit used files by provided sensors.
        ppp_config_dir (str): The directory containing the configuration
                              files for Satpy.
        filter_parameters (dict): Filename pattern metadata to filter on. `start_time` and `end_time` are
                                  automatically added to this dictionary. Shortcut for
                                  `reader_kwargs['filter_parameters']`.
        reader_kwargs (dict): Keyword arguments to pass to specific reader
                              instances to further configure file searching.
        missing_ok (bool): If False (default), raise ValueError if no files
                            are found. If True, return empty dictionary if no
                            files are found.
        fs (FileSystem): Optional, instance of implementation of
                         fsspec.spec.AbstractFileSystem (strictly speaking,
                         any object of a class implementing ``.glob`` is
                         enough). Defaults to searching the local filesystem.

    Returns: Dictionary mapping reader name string to list of filenames

    Raises:
        ValueError: if ``sensor`` is given but no reader supports it, or if
            no supported files are found and ``missing_ok`` is False.
    """
    if ppp_config_dir is None:
        ppp_config_dir = get_environ_config_dir()
    reader_files = {}
    # Fix: work on shallow copies so the caller's ``reader_kwargs`` and
    # ``filter_parameters`` dictionaries are not mutated as a side effect
    # of adding start/end time filters below.
    reader_kwargs = dict(reader_kwargs or {})
    filter_parameters = dict(filter_parameters or
                             reader_kwargs.get('filter_parameters', {}))
    sensor_supported = False
    if start_time or end_time:
        filter_parameters['start_time'] = start_time
        filter_parameters['end_time'] = end_time
    reader_kwargs['filter_parameters'] = filter_parameters
    for reader_configs in configs_for_reader(reader, ppp_config_dir):
        try:
            reader_instance = load_reader(reader_configs, **reader_kwargs)
        except (KeyError, IOError, yaml.YAMLError) as err:
            LOG.info('Cannot use %s', str(reader_configs))
            LOG.debug(str(err))
            if reader and (isinstance(reader, str) or len(reader) == 1):
                # if it is a single reader then give a more usable error
                raise
            continue
        if not reader_instance.supports_sensor(sensor):
            continue
        elif sensor is not None:
            # sensor was specified and a reader supports it
            sensor_supported = True
        loadables = reader_instance.select_files_from_directory(base_dir, fs)
        if loadables:
            # narrow glob matches down to files the patterns actually parse
            loadables = list(
                reader_instance.filter_selected_filenames(loadables))
        if loadables:
            reader_files[reader_instance.name] = list(loadables)
    if sensor and not sensor_supported:
        raise ValueError("Sensor '{}' not supported by any readers".format(sensor))
    if not (reader_files or missing_ok):
        raise ValueError("No supported files found")
    return reader_files
def load_readers(filenames=None, reader=None, reader_kwargs=None,
                 ppp_config_dir=None):
    """Create specified readers and assign files to them.

    Args:
        filenames (iterable or dict): A sequence of files that will be used to load data from. A ``dict`` object
                                      should map reader names to a list of filenames for that reader.
        reader (str or list): The name of the reader to use for loading the data or a list of names.
        reader_kwargs (dict): Keyword arguments to pass to specific reader instances.
        ppp_config_dir (str): The directory containing the configuration files for satpy.

    Returns: Dictionary mapping reader name to reader instance

    Raises:
        ValueError: if ``filenames`` is provided but empty, if no reader
            matched the given files, or if the created readers expose no
            loadable datasets.
    """
    reader_instances = {}
    reader_kwargs = reader_kwargs or {}
    # filter_parameters only applies to file selection, not to the file
    # handlers themselves, so strip it from the kwargs passed to them
    reader_kwargs_without_filter = reader_kwargs.copy()
    reader_kwargs_without_filter.pop('filter_parameters', None)

    if ppp_config_dir is None:
        ppp_config_dir = get_environ_config_dir()

    # Resolve the four filenames/reader input combinations up front.
    if not filenames and not reader:
        # used for an empty Scene
        return {}
    elif reader and filenames is not None and not filenames:
        # user made a mistake in their glob pattern
        raise ValueError("'filenames' was provided but is empty.")
    elif not filenames:
        LOG.warning("'filenames' required to create readers and load data")
        return {}
    elif reader is None and isinstance(filenames, dict):
        # filenames is a dictionary of reader_name -> filenames
        reader = list(filenames.keys())
        remaining_filenames = set(f for fl in filenames.values() for f in fl)
    elif reader and isinstance(filenames, dict):
        # filenames is a dictionary of reader_name -> filenames
        # but they only want one of the readers
        # NOTE(review): this indexing assumes `reader` is a single string
        # key here; a list would raise TypeError — confirm callers
        filenames = filenames[reader]
        remaining_filenames = set(filenames or [])
    else:
        remaining_filenames = set(filenames or [])

    # configs_for_reader yields in the same order as the `reader` list, so
    # idx lines up reader names with their config files below
    for idx, reader_configs in enumerate(configs_for_reader(reader, ppp_config_dir)):
        if isinstance(filenames, dict):
            readers_files = set(filenames[reader[idx]])
        else:
            readers_files = remaining_filenames

        try:
            reader_instance = load_reader(reader_configs, **reader_kwargs)
        except (KeyError, IOError, yaml.YAMLError) as err:
            LOG.info('Cannot use %s', str(reader_configs))
            LOG.debug(str(err))
            continue

        if not readers_files:
            # we weren't given any files for this reader
            continue
        # keep only the files this reader's patterns can actually parse
        loadables = reader_instance.select_files_from_pathnames(readers_files)
        if loadables:
            reader_instance.create_filehandlers(loadables, fh_kwargs=reader_kwargs_without_filter)
            reader_instances[reader_instance.name] = reader_instance
            remaining_filenames -= set(loadables)
        if not remaining_filenames:
            # every file is claimed; no need to try further readers
            break

    if remaining_filenames:
        LOG.warning("Don't know how to open the following files: {}".format(str(remaining_filenames)))
    if not reader_instances:
        raise ValueError("No supported files found")
    elif not any(list(r.available_dataset_ids) for r in reader_instances.values()):
        raise ValueError("No dataset could be loaded. Either missing "
                         "requirements (such as Epilog, Prolog) or none of the "
                         "provided files match the filter parameters.")
    return reader_instances