In [None]:
import gc
from pathlib import Path

import dask.array as da
import iris
from tqdm import tqdm
from wildfires.data import homogenise_time_coordinate

In [None]:
# Directory holding the input NetCDF files (the previous location is kept for reference).
# data_dir = Path("/work/scratch-pw/alexkr/new-with-antec5")
data_dir = Path("~/tmp/new-with-antec6").expanduser()
# glob() makes no ordering guarantee, so sort the matches: this keeps the load order
# (and therefore which cube comes first for each variable later on) reproducible.
files = sorted(data_dir.glob("*Instant.*.nc"))
files

In [None]:
# Gather the raw cubes from every input file into a single CubeList,
# with a progress bar over the files.
cubes = iris.cube.CubeList()
for f in tqdm([str(path) for path in files]):
    cubes.extend(iris.load_raw(f))

In [None]:
# Keep only the cubes whose leading dimension has exactly 2190 entries and
# report the count before and after, flagging the case where any were dropped.
# NOTE(review): 2190 is presumably 365 days x 6 samples per day (4-hourly) - confirm.
orig_len = len(cubes)
print(orig_len)
cubes = iris.cube.CubeList(filter(lambda cube: cube.shape[0] == 2190, cubes))
print(len(cubes))
if orig_len != len(cubes):
    print("warning - missing cubes?")

In [None]:
# Homogenise the time coordinate first so that the cubes can be concatenated.
concat = homogenise_time_coordinate(cubes).concatenate()
# Sanity check: after concatenation, every cube must have the same size along
# its first dimension.
leading_sizes = {cube.shape[0] for cube in concat}
assert len(leading_sizes) == 1
print(len(concat))

In [None]:
# for cube in tqdm(concat):
#     iris.coord_categorisation.add_day_of_year(cube, 'time')
#     iris.coord_categorisation.add_hour(cube, 'time')

### Get climatology

In [None]:
# climatologies = iris.cube.CubeList()
# for cube in tqdm(concat):
#     climatologies.append(cube.aggregated_by(["day_of_year", "hour"], iris.analysis.MEAN))
# climatologies

In [None]:
# The proper 'Dask-way': build a lazy mean for each variable. Very slow on jasmin sci
# servers (expected ~9 hrs for 47 variables across 17 years with 4 hour resolution).
climatologies = iris.cube.CubeList()
n_matching = set()
for concat_cube in tqdm(concat):
    # All source cubes that share this variable's name.
    matching = cubes.extract(iris.Constraint(name=concat_cube.name()))
    n_matching.add(len(matching))

    # Stack the cubes' (possibly lazy) data along a new leading axis and average
    # over that axis. This is deliberately left uncomputed: calling `.compute()`
    # here would make the time estimate accurate, but all final cubes would then
    # have to be held in memory at once!
    lazy_mean = da.stack([cube.core_data() for cube in matching], axis=0).mean(axis=0)

    # NOTE - the time coordinate will technically be wrong here for the
    # climatological case, since the metadata is copied from the first match.
    climatologies.append(matching[0].copy(data=lazy_mean))
    gc.collect()

# Every variable should have been matched by the same number of source cubes.
assert len(n_matching) == 1, n_matching

climatologies

In [None]:
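# The proper 'numpy-way' - realise the data one variable at a time and take the mean.
# Uses more memory than the lazy Dask version above.
# NOTE: numpy is not among the imports at the top of this notebook - add
# `import numpy as np` before re-enabling this cell.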
# climatologies = iris.cube.CubeList()
# n_matching = set()
# for concat_cube in tqdm(concat):
#     matching = cubes.extract(iris.Constraint(name=concat_cube.name()))
#     n_matching.add(len(matching))
#     # NOTE - the time coordinate will technically be wrong here for the climatological case.
#     climatologies.append(
#         matching[0].copy(
#             data=np.mean(
#                 np.stack(
#                     tuple(cube.data for cube in matching),
#                     axis=0
#                 ),
#                 axis=0
#             )
#         )
#     )
#     gc.collect()

# assert len(n_matching) == 1, n_matching

# climatologies

In [None]:
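# # Use numpy, but try to minimise memory usage by incrementally adding individual arrays before dividing.
# # NOTE(review): `matching[0].data` is accumulated into in place, so the first source cube's
# # data would be overwritten by the running sum - confirm this is acceptable before re-enabling.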
# climatologies = iris.cube.CubeList()
# n_matching = set()
# for concat_cube in tqdm(concat):
#     matching = cubes.extract(iris.Constraint(name=concat_cube.name()))
#     n_matching.add(len(matching))
#     # NOTE - the time coordinate will technically be wrong here for the climatological case.
#     climatology_data = matching[0].data
#     for cube in matching[1:]:
#         climatology_data += cube.data
#         gc.collect()
#     climatology_data /= len(matching)

#     climatologies.append(matching[0].copy(data=climatology_data))

#     gc.collect()

# assert len(n_matching) == 1, n_matching

# climatologies

In [None]:
# Write all climatology cubes to a single file under the user's ~/tmp directory.
climatology_path = Path("~/tmp/climatology6.nc").expanduser()
iris.save(climatologies, str(climatology_path))