Skip to content
This repository has been archived by the owner. It is now read-only.
Browse files
  • Loading branch information
lewismc committed Oct 31, 2017
2 parents 28c9239 + 14b98e9 commit 4f13a40a7712126169bf1a650bd239872df4a43b
Show file tree
Hide file tree
Showing 11 changed files with 535 additions and 143 deletions.
@@ -0,0 +1,56 @@
import os
import subprocess
import jinja2
from metadata_extractor import CORDEXMetadataExtractor, obs4MIPSMetadataExtractor

# Driver script: pairs obs4MIPs observations with CORDEX model output by
# variable, renders one RCMES configuration file per (group, season) and runs
# the evaluation for each of them.

# These should be modified. TODO: domains can also be made into separate group
# CORDEX domain
domain = 'NAM-44'

# The output directory
workdir = '/home/goodman/data_processing/CORDEX/analysis'

# Location of obs4MIPs files
obs_dir = '/proj3/data/obs4mips'

# Location of CORDEX files
models_dir = '/proj3/data/CORDEX/{domain}/*'.format(domain=domain)

# Extract metadata from model and obs files, pairing up files with the same
# variables for separate evaluations. Each group is a tuple of
# (observation metadata, list of matching model metadata).
obs_extractor = obs4MIPSMetadataExtractor(obs_dir)
models_extractor = CORDEXMetadataExtractor(models_dir)
groups = obs_extractor.group(models_extractor, 'variable')

# Configuration file template, to be rendered repeatedly for each evaluation
# run
env = jinja2.Environment(loader=jinja2.FileSystemLoader('./templates'),
                         trim_blocks=True, lstrip_blocks=True)
t = env.get_template('CORDEX.yaml.template')

# Each group represents a single evaluation. Repeat the evaluation for
# three seasons: Summer, Winter, and Annual.
seasons = ['annual', 'winter', 'summer']

# Config files whose run returned a non-zero exit code. This must live outside
# the loops: re-creating it per run would forget every earlier failure and
# leave the name undefined when there are no groups at all.
errored = []

for group in groups:
    obs_info, models_info = group
    instrument = obs_info['instrument']
    variable = obs_info['variable']
    for season in seasons:
        configfile_basename = '_'.join([domain, instrument, variable, season]) + '.yaml'
        configfile_path = os.path.join(workdir, domain, instrument,
                                       variable, season)
        # The config file is written into its own output directory, which has
        # to exist before the file can be opened.
        if not os.path.exists(configfile_path):
            os.makedirs(configfile_path)
        configfile_path = os.path.join(configfile_path, configfile_basename)
        with open(configfile_path, 'w') as configfile:
            configfile.write(t.render(obs_info=obs_info, models_info=models_info,
                                      season=season, output_dir=workdir))

        # TODO: Do this in parallel. Will change this once this approach
        # is well tested.
        # NOTE(review): the script name was reconstructed as run_RCMES.py in
        # the parent directory -- confirm against the repository layout.
        code = subprocess.call(['python', '../run_RCMES.py', configfile_path])
        if code:
            errored.append(configfile_path)

print("All runs done. The following ended with an error: {}".format(errored))
@@ -0,0 +1,245 @@
import glob
import os

class MetadataExtractor(object):
    """Extracts metadata from data filenames.

    Instances of MetadataExtractor are used to extract metadata from
    filenames in bulk. Example usage:

    >>> extractor = MetadataExtractor('/path/to/data')

    Suppose the data in this directory had the following files:
    pr_*.nc, uas_*.nc, vas_*.nc

    All of the metadata lies in the data attribute:

    >>> extractor.data
    [{'filename': '/path/to/data/pr_*.nc', 'variable': 'pr'},
     {'filename': '/path/to/data/uas_*.nc', 'variable': 'uas'},
     {'filename': '/path/to/data/vas_*.nc', 'variable': 'vas'}]

    Results can be narrowed down by specifying values for a field:

    >>> extractor.query(variable='pr')
    [{'filename': '/path/to/data/pr_*.nc', 'variable': 'pr'}]

    Finally, metadata from two sets of extractors can be grouped together
    based on common field name as follows:

    >>> extractor.group(other_extractor, 'variable')

    This class should only be used as a starting point. We recommend using
    the included obs4MIPSMetadataExtractor and CORDEXMetadataExtractor
    subclasses or creating your own subclass for your usecase.
    """

    def __init__(self, *paths):
        """Store the search paths and extract the metadata right away.

        :param paths: Directories (glob patterns are allowed) which are
            searched for ``*.nc`` files.
        """
        # Goes through the property setter, which triggers the extraction.
        self.paths = paths

    @property
    def data(self):
        """
        The extracted metadata for each file, with all fields listed in
        the fields attribute included.
        """
        return self._data

    @property
    def paths(self):
        """
        Search paths containing the dataset files.
        """
        return self._paths

    @paths.setter
    def paths(self, paths):
        """
        Extracts the metadata from scratch when paths are reset.
        """
        self._paths = paths
        self._extract()

    @property
    def fields(self):
        """
        The name of field in the filename, assuming the fully filtered
        filename conforms to the following convention:
        filename = <field[0]>_<field[1]>_..._<field[n]>.nc. Using fewer fields
        than the filename defines is allowed.
        """
        return ['variable']

    @property
    def files(self):
        """
        Sorted list of file patterns (glob expressions), one per dataset.
        """
        found = []
        for path in self.paths:
            found.extend(glob.glob(os.path.join(path, '*.nc')))
        # Several files usually collapse into one pattern; sorting keeps the
        # order of the extracted metadata reproducible between runs.
        return sorted(set(self.get_pattern(fname) for fname in found))

    # NOTE(review): exposed as a property like the other accessors of this
    # class -- confirm that no caller invokes it as a method.
    @property
    def variables(self):
        """
        Get the set of variables included across all the datasets.
        """
        return self.get_field('variable')

    @property
    def field_filters(self):
        """
        Override this to filter out specific characters contained in a field.
        Maps a field name to a list of substrings ignored when matching.
        """
        return dict()

    def query(self, **kwargs):
        """
        Narrow down the list of files by field names.

        :param kwargs: ``field=value`` pairs; a value may also be a
            collection of accepted values.
        :returns: List of metadata dictionaries matching every given field.
        :raises ValueError: If a keyword is not one of the known fields.
        """
        requested = list(kwargs)
        if not set(requested).issubset(set(self.fields)):
            raise ValueError("Invalid fields: {}. Must be subset of: {}"
                             .format(requested, self.fields))
        data = self.data
        for field, value in kwargs.items():
            if not isinstance(value, (list, tuple, set, frozenset)):
                value = [value]
            data = [meta for meta in data
                    if self._match_filter(meta, field) in value]
        return data

    def group(self, extractor, field):
        """
        Compare the data of this extractor with another extractor instance
        and group each of their metadata together by given field.

        :param extractor: The other extractor instance.
        :param field: Name of the field both extractors are matched on.
        :returns: List of ``(meta, matches)`` tuples, where ``meta`` comes
            from this extractor and ``matches`` is the list of metadata of
            the other extractor sharing the same (filtered) field value.
        :raises ValueError: If either extractor does not define the field.
        """
        # Validate the field on both sides before doing any work.
        self.get_field(field)
        extractor.get_field(field)

        # Only values contained in both extractors are of interest. The
        # comparison uses the filtered values because that is what query()
        # matches against; intersecting the raw values would silently drop
        # every dataset whose field only matches after filtering.
        own_values = set(self._match_filter(meta, field)
                         for meta in self.data)
        other_values = set(extractor._match_filter(meta, field)
                           for meta in extractor.data)
        shared = list(own_values.intersection(other_values))

        # Pair each dataset of this extractor with its counterparts.
        groups = []
        for meta in self.query(**{field: shared}):
            val = self._match_filter(meta, field)
            groups.append((meta, extractor.query(**{field: val})))
        return groups

    def get_field(self, field):
        """
        Returns only the selected field of the extracted data.

        :returns: Set of the (unfiltered) values of the field.
        :raises ValueError: If the field is not one of the known fields.
        """
        if field not in self.fields:
            raise ValueError("Invalid field: {}. Must be one of: {}"
                             .format(field, self.fields))
        return set(meta[field] for meta in self.data)

    def filter_filename(self, fname):
        """
        Applies a filter to each individual filename contained in the files
        attribute, which is useful if some files within a data set are known
        to not follow conventions, and "fix" them so that they do.
        """
        return os.path.basename(fname)

    def get_pattern(self, fname):
        """
        Used to group multiple file datasets together via glob expressions.
        The most common convention is to split files by time periods, which
        are generally the last field in a filename.
        """
        # Only the file name carries fields; splitting the whole path would
        # miscount them when a directory name contains an underscore.
        dirname, basename = os.path.split(fname)
        parts = basename.split('_')
        pattern = '_'.join(parts[:len(self.fields)] + ['*.nc'])
        return os.path.join(dirname, pattern)

    def _match_filter(self, meta, field):
        """
        Filter (ignore) certain character patterns when matching a field.
        """
        val = meta[field]
        filters = self.field_filters
        if field in filters:
            for pattern in filters[field]:
                val = val.replace(pattern, '')
        return val

    def _extract(self):
        """
        Do the actual metadata extraction from the file patterns given by
        the files attribute. Additionally, filenames are passed through
        filter_filename() to remove unwanted characters from the extraction.
        """
        self._data = []
        for pattern in self.files:
            meta = dict(filename=pattern)

            # Perform the actual metadata extraction; the last part of the
            # name is the '*.nc' wildcard and never a field.
            parts = self.filter_filename(pattern).split('_')[:-1]
            meta.update(zip(self.fields, parts))
            self._data.append(meta)

class obs4MIPSMetadataExtractor(MetadataExtractor):
    """Metadata extractor for files following the obs4MIPs naming scheme.

    Filenames are expected to look like
    <variable>_<instrument>_<processing_level>_<version>_<time>.nc, with
    special handling for the differently named CALIPSO files.
    """

    # NOTE(review): exposed as a property like the other accessors of the
    # base class -- confirm that no caller invokes it as a method.
    @property
    def instruments(self):
        """
        Get the set of instruments across all the datasets.
        """
        return self.get_field('instrument')

    @property
    def fields(self):
        """
        obs4MIPs fields
        """
        return ['variable', 'instrument', 'processing_level', 'version']

    @property
    def field_filters(self):
        """
        Field filters for CALIPSO: these substrings are ignored when the
        variable field is matched.
        """
        return dict(variable=['calipso', 'Lidarsr532'])

    def filter_filename(self, fname):
        """
        CALIPSO files have odd naming conventions, so we will use
        a modified version to conform to standard obs4MIPs conventions.
        """
        fname = os.path.basename(fname)
        return fname.replace('_obs4MIPs_', '_')

    def get_pattern(self, fname):
        """
        Overridden to deal with CALIPSO filenames
        """
        # Only the file name carries fields; counting the parts of the whole
        # path would pick the wrong offset when a directory name contains an
        # underscore.
        dirname, basename = os.path.split(fname)
        parts = basename.split('_')
        # Names with exactly five parts only drop the trailing part; any
        # other length drops the last two.
        offset = -2 if len(parts) != 5 else -1
        pattern = '_'.join(parts[:offset] + ['*.nc'])
        return os.path.join(dirname, pattern)

class CORDEXMetadataExtractor(MetadataExtractor):
    """Metadata extractor for files following the CORDEX naming scheme."""

    # NOTE(review): exposed as a property like the other accessors of the
    # base class -- confirm that no caller invokes it as a method.
    @property
    def models(self):
        """
        Get the set of models across all the datasets.
        """
        return self.get_field('model')

    @property
    def fields(self):
        """
        CORDEX fields
        """
        return ['variable', 'domain', 'driving_model', 'experiment',
                'ensemble', 'model', 'version', 'time_step']
@@ -0,0 +1,57 @@
# Jinja2 template for one evaluation configuration file. It is rendered with
# obs_info (metadata of the observation dataset), models_info (list of
# metadata of the matching model datasets), season and output_dir.
# NOTE(review): several mapping keys appear without a parent section
# (file_name occurs twice, the dataset entries have no enclosing key) and no
# indentation is present -- section headers and nesting look like they were
# lost; confirm against the configuration schema before use.
{% set domain = models_info[0].domain %}
{% set instrument = obs_info.instrument %}
{% set variable = models_info[0].variable %}
{% set basename = [variable, instrument, domain, season]|join('_') %}
# Output location and file name are derived from the values set above.
workdir: {{ [output_dir, domain, instrument, variable, season]|join('/') }}
output_netcdf_filename: {{ basename }}.nc

# (RCMES will temporally subset data between month_start and month_end.
# If average_each_year is True (False), seasonal mean in each year is (not) calculated and used for metrics calculation.)
maximum_overlap_period: True
temporal_resolution: monthly
# Month range per season: winter is 12-2, summer is 6-8 and any other value
# of season (i.e. annual) covers the whole year.
{% if season == "winter" %}
month_start: 12
month_end: 2
{% elif season == "summer" %}
month_start: 6
month_end: 8
{% else %}
month_start: 1
month_end: 12
{% endif %}
average_each_year: True

# Only the first three characters of the domain name are used here.
boundary_type: CORDEX {{ domain[:3] }}

regrid_on_reference: True

# The observation dataset comes first, followed by one entry per model.
- loader_name: local_split
name: {{ instrument }}
file_path: {{ obs_info.filename }}
variable_name: {{ obs_info.variable }}
{% for model_info in models_info %}
- loader_name: local_split
name: {{ model_info.model }}
file_path: {{ model_info.filename }}
variable_name: {{ model_info.variable }}
lat_name: lat
lon_name: lon
{% endfor %}

number_of_metrics_and_plots: 2

metrics1: Map_plot_bias_of_multiyear_climatology

file_name: {{ basename }}_bias

metrics2: Taylor_diagram_spatial_pattern_of_multiyear_climatology

file_name: {{ basename }}_taylor

use_subregions: False

0 comments on commit 4f13a40

Please sign in to comment.