In [1]:
import xarray as xr
import os
from glob import glob
import datetime
import re

In [2]:
os.getcwd()

'/Users/deepthought/Library/CloudStorage/OneDrive-UniversityofLeeds/EUREC4A/Code/CFizer/dev'

Load a NetCDF file and extract information from its name. Raise an exception if the file doesn't have an `nc` extension, or if the expected patterns are not found.

In [3]:
filename = 'd20200128_diagnostic_1d_43200.nc'

name_parts = re.split('[_\.]', filename)
print(name_parts)
ext = name_parts.pop()
if ext != 'nc':
    raise ValueError('Not a NetCDF file.')
for part in name_parts:
    print(part)
    if part == 'diagnostic':
        continue
    date_match = re.fullmatch('d[0-9]{8}', part)
    print(date_match)
    if date_match is not None:
        # monc_date_str = date_match.string[1:]
        monc_year = int(date_match.string[1:5])
        monc_month = int(date_match.string[5:7])
        monc_day = int(date_match.string[7:])
        monc_date = datetime.date(monc_year, monc_month, monc_day)
    else:
        dim_match = re.fullmatch('[0-9]d', part)
        print(dim_match)
        if dim_match is not None:
            n_dims = 1 + int(dim_match.string[0])
        else:
            t_match = re.fullmatch('[0-9]+', part)
            print(t_match)
            if t_match is not None:
                end_time = int(t_match.string)
            else:
                raise ValueError('Unexpected filename format')

print(monc_date, n_dims, end_time)

filepath = os.path.join('../test_data/', filename)
try:
    nc1d_1 = xr.open_dataset(filepath)
except FileNotFoundError:
    print(filepath, 'not found')
    exit(1)
except Exception as e:
    print(e)
    exit(1)


['d20200128', 'diagnostic', '1d', '43200', 'nc']
d20200128
<re.Match object; span=(0, 9), match='d20200128'>
diagnostic
1d
None
<re.Match object; span=(0, 2), match='1d'>
43200
None
None
<re.Match object; span=(0, 5), match='43200'>
2020-01-28 2 43200


In [4]:
nc1d_1

`Diagnostic write frequency` corresponds to the time points encoded in the names of the `*diagnostic*.nc` files, while the actual time points contained in the files correspond to the ``

In [5]:
nc1d_1.attrs['MONC time'] == 

SyntaxError: invalid syntax (2000387512.py, line 1)

In [7]:
nc1d_1['options_database'].data

array([[b'rad_interval', b'60'],
       [b'subsidence_local_q', b'true'],
       [b'passive_q', b'false'],
       ...,
       [b'right_tank_delta_rh', b'0.0'],
       [b'front_tank_delta_rh', b'0.0'],
       [b'cfl_vt_max', b'1.0']], dtype='|S150')

In [8]:
nc1d_1['options_database'].data[0][0]

b'rad_interval'

Decode binary string as described [here](https://stackoverflow.com/a/17615424).

In [9]:
nc1d_1['options_database'].data[0][0].decode('utf-8')

'rad_interval'

In [10]:
options_db = {k.decode('utf-8'): v.decode('utf-8') for [k, v] in nc1d_1['options_database'].data}

In [12]:
options_db['rad_start_year']

'2020.0'

In [11]:
float(options_db['rad_start_year'])

2020.0

In [13]:
float('not a number')

ValueError: could not convert string to float: 'not a number'

In [14]:
try:
    n = float(options_db['rad_start_year'])
except ValueError:
    # not a number
    n = None
else:
    try:
        n = int(options_db['rad_start_year'])
    except ValueError:
        # not an integer
        pass
print(n)

    

2020.0


In [15]:
str(bool('true')).lower() == 'true'

True

In [16]:
str(bool('35.0')).lower() == '35.0'

False

In [17]:
str(bool('false')).lower() == 'false'

False

In [18]:
bool('false')

True

In [19]:
'65.2'.lower()

'65.2'

In [20]:
for k, v in options_db.items():
    if v.lower() == 'true':
        options_db[k] = True
    elif v.lower() == 'false':
        options_db[k] = False
    else:
        try:
            float(v)
        except ValueError:
            # not a number
            pass
        else:
            try:
                int(v)
            except ValueError:
                # not an integer
                options_db[k] = float(v)
            else:
                options_db[k] = int(v)
options_db

{'rad_interval': 60,
 'subsidence_local_q': True,
 'passive_q': False,
 'checkpoint_file': 'checkpoint_files/dephy_20200128_dump.nc',
 'galilean_transformation': False,
 'l_pgshd': True,
 'smagorinsky_enabled': True,
 'damping_enabled': True,
 'gammam': 19.3,
 'gammah': 12.0,
 'symm_prob': False,
 'pressure-terms_group_type': 'column',
 'finalisation_stage_orderinga_34': 'conditional_diagnostics_column',
 'finalisation_stage_orderinga_33': 'conditional_diagnostics_whole',
 'finalisation_stage_orderinga_32': 'pstep',
 'finalisation_stage_orderinga_31': 'tracers',
 'finalisation_stage_orderinga_30': 'simplecloud',
 'l_thref': False,
 'finalisation_stage_orderinga_29': 'damping',
 'finalisation_stage_orderinga_28': 'th_advection',
 'finalisation_stage_orderinga_27': 'pw_advection',
 'finalisation_stage_orderinga_26': 'profile_diagnostics',
 'finalisation_stage_orderinga_25': 'viscosity',
 'finalisation_stage_orderinga_24': 'lower_bc',
 'finalisation_stage_orderinga_23': 'diffusion',
 'fin

The above works, but RegEx could do more efficiently.

In [21]:
{k:v for k, v in options_db.items() if 'rad_start' in k}

{'rad_start_year': 2020.0, 'rad_start_time': 0.0, 'rad_start_day': 25.0}

In [22]:
odb = {}
for k,v in {k.decode('utf-8'): v.decode('utf-8') for [k, v] in nc1d_1['options_database'].data}.items():
    if re.fullmatch(pattern='[0-9]+\.[0-9]+', string=v):
        odb[k] = float(v)
    elif re.fullmatch(pattern='[0-9]+', string=v):
        odb[k] = int(v)
    elif v.lower() == 'true':
        odb[k] = True
    elif v.lower() == 'false':
        odb[k] = False
    else:
        odb[k] = v

odb

{'rad_interval': 60,
 'subsidence_local_q': True,
 'passive_q': False,
 'checkpoint_file': 'checkpoint_files/dephy_20200128_dump.nc',
 'galilean_transformation': False,
 'l_pgshd': True,
 'smagorinsky_enabled': True,
 'damping_enabled': True,
 'gammam': 19.3,
 'gammah': 12.0,
 'symm_prob': False,
 'pressure-terms_group_type': 'column',
 'finalisation_stage_orderinga_34': 'conditional_diagnostics_column',
 'finalisation_stage_orderinga_33': 'conditional_diagnostics_whole',
 'finalisation_stage_orderinga_32': 'pstep',
 'finalisation_stage_orderinga_31': 'tracers',
 'finalisation_stage_orderinga_30': 'simplecloud',
 'l_thref': False,
 'finalisation_stage_orderinga_29': 'damping',
 'finalisation_stage_orderinga_28': 'th_advection',
 'finalisation_stage_orderinga_27': 'pw_advection',
 'finalisation_stage_orderinga_26': 'profile_diagnostics',
 'finalisation_stage_orderinga_25': 'viscosity',
 'finalisation_stage_orderinga_24': 'lower_bc',
 'finalisation_stage_orderinga_23': 'diffusion',
 'fin

In [24]:
{k:v for k, v in odb.items() if ("freq" in k or "int" in k)}

{'rad_interval': 60,
 'checkpoint_file': 'checkpoint_files/dephy_20200128_dump.nc',
 'cfl_frequency': 10,
 'force_output_on_interval': False,
 'check_walltime_frequency': 20,
 'diag_write_freq': 3600.0,
 'checkpoint_unique_per_dump': False,
 '3d_sampling_frequency': 1800,
 'print_debug_data': False,
 'display_synopsis_frequency': 100,
 'checkpointer_enabled': True,
 'checkpoint_internal_write': 'auto',
 'checkpoint_frequency': 0,
 'check_msg_frequency': 500,
 'sampling_frequency': 300}

In [30]:
{k:v for k, v in odb.items() if "monc" in k.lower()}

{'moncs_per_io_server': 15}

In [25]:
nc1d_1.attrs

{'title': '1d',
 'created': '18/2/2022 1:11:23',
 'MONC time': '43200.0',
 'MONC timestep': '49119',
 'Diagnostic write frequency': '3600.0',
 'Previous diagnostic write at': '39600.0'}

In [34]:
float(nc1d_1.attrs['Diagnostic write frequency'])


3600.0

In [37]:
nc1d_1.attrs['sample frequency'] = str(float(odb['sampling_frequency']))

In [38]:
nc1d_1.attrs

{'title': '1d',
 'created': '18/2/2022 1:11:23',
 'MONC time': '43200.0',
 'MONC timestep': '49119',
 'Diagnostic write frequency': '3600.0',
 'Previous diagnostic write at': '39600.0',
 'sample frequency': '300.0'}

In [39]:
nc1d_1.attrs.pop('sample frequency')
nc1d_1.attrs


{'title': '1d',
 'created': '18/2/2022 1:11:23',
 'MONC time': '43200.0',
 'MONC timestep': '49119',
 'Diagnostic write frequency': '3600.0',
 'Previous diagnostic write at': '39600.0'}

In [50]:
[sampling_freq, write_freq] = 'time_series_300_1800'.split('_')[-2:]
sampling_freq

'300'

In [51]:
for c in nc1d_1.coords:
    if 'time' in c:
        [sampling_freq, write_freq] = [int(n) for n in c.split('_')[-2:]]
        updated_nc = nc1d_1.rename({c: 'time'})
        updated_nc.attrs['Sampling frequency'] = sampling_freq
        updated_nc.attrs['Write frequency'] = write_freq

updated_nc

In [54]:
for d in nc1d_1.dims:
    print(d)

nc1d_1.dims


time_series_300_1800
zn
z
number_options
kvp


Frozen({'time_series_300_1800': 2, 'zn': 121, 'z': 121, 'number_options': 664, 'kvp': 2})

In [52]:
updated_nc.attrs

{'title': '1d',
 'created': '18/2/2022 1:11:23',
 'MONC time': '43200.0',
 'MONC timestep': '49119',
 'Diagnostic write frequency': '3600.0',
 'Previous diagnostic write at': '39600.0',
 'Sampling frequency': 300,
 'Write frequency': 1800}

In [57]:
print(f"""sampling frequency from time variable matches:
      sampling_frequency: {
          float(updated_nc.attrs['Sampling frequency']) == float(odb['sampling_frequency'])
          }
      3d_sampling_frequency: {
          float(updated_nc.attrs['Sampling frequency']
                ) == float(odb['3d_sampling_frequency'])
      }""")


sampling frequency from time variable matches:
      sampling_frequency: True
      3d_sampling_frequency: False


In [64]:
print(
    f"Time interval matches write frequency? {float(updated_nc.attrs['Write frequency']) == updated_nc.time.values[1] - updated_nc.time.values[0]}")


Time interval matches write frequency? True
