### Process daily UNSEEN ###

Processes the model data for a given initialisation year (e.g. s1960), ensemble member (e.g. r1i1p1f2), model (e.g. HadGEM3-GC31-MM), spatial area (e.g. UK) and variable (e.g. tas) into a dataframe.

In [1]:
# Local imports
import os
import sys
import time
import argparse

# Third-party imports
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import pandas as pd
import shapely.geometry
import cartopy.io.shapereader as shpreader
import iris

# Specific imports
from tqdm import tqdm
from datetime import datetime, timedelta

  _set_context_ca_bundle_path(ca_bundle_path)


In [2]:
# Load my specific functions
sys.path.append("/home/users/benhutch/unseen_functions")
import functions as funcs

In [3]:
# Hard coded args
model = "HadGEM3-GC31-MM"
experiment = "dcppA-hindcast"
freq = "day"
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

# Set up the output directory for the dfs
output_dir_dfs = "/gws/nopw/j04/canari/users/benhutch/unseen/saved_dfs"

In [4]:
# Flexible args
variable = "tas"
country = "United Kingdom"
init_year = 1960

In [5]:
# HadGEM3-GC31-MM_dcppA-hindcast_tas_United_Kingdom_1960-2018_day.csv
# ERA5_tas_United_Kingdom_1960-2018_daily.csv

# load the model data
df_model = pd.read_csv(f"{output_dir_dfs}/HadGEM3-GC31-MM_dcppA-hindcast_tas_United_Kingdom_1960-2018_day.csv")

# Load the observed data
df_obs = pd.read_csv(f"{output_dir_dfs}/ERA5_tas_United_Kingdom_1960-2018_daily.csv")

In [6]:
df_model.head()

Unnamed: 0,init_year,member,lead,data
0,1960,1,1,283.904326
1,1960,1,2,281.653069
2,1960,1,3,280.558694
3,1960,1,4,280.931852
4,1960,1,5,280.171699


In [7]:
df_obs.head()

Unnamed: 0,time,data
0,3652,
1,3653,
2,3654,
3,3655,
4,3656,


In [5]:
def get_member_string(member, model):
    """
    Returns the appropriate member string based on the member value.
    
    Parameters:
    member (int): The member number (1 to 10).
    model (str): The model name.
    
    Returns:
    str: The corresponding member string.
    """
    # check the model is HadGEM3-GC31-MM
    if model != "HadGEM3-GC31-MM":
        raise ValueError("Model must be HadGEM3-GC31-MM")

    if 1 <= member <= 10 and model == "HadGEM3-GC31-MM":
        return f'r{member}i1p1f2'
    else:
        raise ValueError("Member must be between 1 and 10")

# Example usage
member = 3
member_string = get_member_string(member, model)
print(member_string)  # Output: r3i1p1f2

r3i1p1f2


In [6]:
# if country has a space, replace with _
country = country.replace(" ", "_")

# Set up the name for the df
df_name = f"{model}_{experiment}_{variable}_{country}_{init_year}_{member}_{freq}.csv"
df_name_next = f"{model}_{experiment}_{variable}_{country}_{init_year+1}_{member}_{freq}.csv"

In [7]:
# join the dfs
df = pd.read_csv(os.path.join(output_dir_dfs, df_name))
df_next = pd.read_csv(os.path.join(output_dir_dfs, df_name_next))

In [8]:
df.head()

Unnamed: 0,init_year,member,lead,data
0,1960,3,1,283.936085
1,1960,3,2,281.669556
2,1960,3,3,280.445822
3,1960,3,4,280.883473
4,1960,3,5,280.189686


In [9]:
df_next.head()

Unnamed: 0,init_year,member,lead,data
0,1961,3,1,283.591079
1,1961,3,2,282.98202
2,1961,3,3,278.014896
3,1961,3,4,277.302433
4,1961,3,5,282.388435


In [10]:
# join the dataframes
df = pd.concat([df, df_next])

In [11]:
df.tail()

Unnamed: 0,init_year,member,lead,data
3745,1961,3,3746,278.792032
3746,1961,3,3747,278.351358
3747,1961,3,3748,276.485531
3748,1961,3,3749,279.756007
3749,1961,3,3750,278.945528


In [12]:
# Set up the years
years = np.arange(init_year, 2018 + 1)

# Set up the members
members = np.arange(1, 10 + 1)

# Set up an empty df
df_all = pd.DataFrame()

# Loop over the years and members
for year in tqdm(years):
    for member in members:
        # Set up the name for the df
        df_name = f"{model}_{experiment}_{variable}_{country}_{year}_{member}_{freq}.csv"
        df = pd.read_csv(os.path.join(output_dir_dfs, df_name))
        df_all = pd.concat([df_all, df])

100%|██████████| 59/59 [00:22<00:00,  2.59it/s]


In [13]:
df_all.tail()

Unnamed: 0.1,Unnamed: 0,init_year,member,lead,data
3745,,2018,10,3746,277.801051
3746,,2018,10,3747,276.441295
3747,,2018,10,3748,277.51868
3748,,2018,10,3749,279.036363
3749,,2018,10,3750,281.840242


In [14]:
# drop the unnamed column
df_all = df_all.drop(columns="Unnamed: 0")

In [15]:
# reset the index
df_all_reset = df_all.reset_index(drop=True)

In [16]:
df_all_reset.head()

Unnamed: 0,init_year,member,lead,data
0,1960,1,1,283.904326
1,1960,1,2,281.653069
2,1960,1,3,280.558694
3,1960,1,4,280.931852
4,1960,1,5,280.171699


In [17]:
%%time

# Set up a name for the df all reset
df_name_all = f"{model}_{experiment}_{variable}_{country}_1960-2018_{freq}.csv"

# Save the df
df_all_reset.to_csv(os.path.join(output_dir_dfs, df_name_all), index=False)

CPU times: user 2.14 s, sys: 53 ms, total: 2.2 s
Wall time: 2.5 s


In [18]:
df_all_reset.head()

Unnamed: 0,init_year,member,lead,data
0,1960,1,1,283.904326
1,1960,1,2,281.653069
2,1960,1,3,280.558694
3,1960,1,4,280.931852
4,1960,1,5,280.171699


In [19]:
# Load and process the observed data to compare against
# Set up the path to the observed data
base_path = "/gws/nopw/j04/canari/users/benhutch/ERA5/"

test_file_path = "/badc/cmip6/data/CMIP6/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/s1961-r9i1p1f2/day/tas/gn/files/d20200417/tas_day_HadGEM3-GC31-MM_dcppA-hindcast_s1961-r9i1p1f2_gn_19720101-19720330.nc"

# if the variable is tas
if variable == "tas":
    obs_path = os.path.join(base_path, "ERA5_t2m_daily_1950_2020.nc")
elif variable == "sfcWind":
    obs_path = os.path.join(base_path, "ERA5_wind_daily_1960_2020.nc")
else:
    raise ValueError("Variable not recognised.")

# Assert that the obs path exists
assert os.path.exists(obs_path), "Observations path does not exist."

# set up the obs variab;e
if variable == "tas":
    obs_variable = "t2m"
elif variable == "sfcWind":
    obs_variable = "si10"
else:
    raise ValueError("Variable not recognised.")

# load the obs cube test
obs_cube_test = iris.load_cube(obs_path)

# load the model cube test (for regridding)
model_cube_test = iris.load_cube(test_file_path)

# constrain to the relevant years
obs_cube_test = obs_cube_test.extract(
    iris.Constraint(time=lambda cell: 1960 <= cell.point.year <= 2018)
)

# perform the intersection
obs_cube_test = obs_cube_test.intersection(
    latitude=(30, 80),
    longitude=(-40, 30),
)

# print the model cube test dimensions
print("Model cube test dimensions:")
print(model_cube_test)

# Select the first member and time from the model cube
model_cube_regrid = model_cube_test[0, :, :]

# print the model cube regrid dimensions
print("Model cube regrid dimensions:")
print(model_cube_regrid)

model_cube_regrid.coord("latitude").units = obs_cube_test[0].coord("latitude").units
model_cube_regrid.coord("longitude").units = obs_cube_test[0].coord("longitude").units

# and for the attributes
model_cube_regrid.coord("latitude").attributes = obs_cube_test[0].coord("latitude").attributes
model_cube_regrid.coord("longitude").attributes = obs_cube_test[0].coord("longitude").attributes

obs_cube_regrid = obs_cube_test.regrid(model_cube_regrid, iris.analysis.Linear())

Model cube test dimensions:
air_temperature / (K)               (time: 90; latitude: 324; longitude: 432)
    Dimension coordinates:
        time                             x             -               -
        latitude                         -             x               -
        longitude                        -             -               x
    Scalar coordinates:
        height                      1.5 m
    Cell methods:
        0                           area: time: mean
    Attributes:
        Conventions                 'CF-1.7 CMIP-6.2'
        activity_id                 'DCPP'
        branch_method               'no parent'
        branch_time_in_child        np.float64(0.0)
        branch_time_in_parent       np.float64(0.0)
        cmor_version                '3.4.0'
        comment                     'near-surface (usually, 2 meter) air temperature'
        creation_date               '2020-06-15T03:03:46Z'
        cv_version                  '6.2.37.5'
        da



In [20]:
model_cube_test

Air Temperature (K),time,latitude,longitude
Shape,90,324,432
Dimension coordinates,,,
time,x,-,-
latitude,-,x,-
longitude,-,-,x
Scalar coordinates,,,
height,1.5 m,1.5 m,1.5 m
Cell methods,,,
0,area: time: mean,area: time: mean,area: time: mean
Attributes,,,


In [21]:
country

'United_Kingdom'

In [22]:
# create the mask
MASK_MATRIX = funcs.create_masked_matrix(
    country="United Kingdom",
    cube=model_cube_regrid,
)

Found Country United Kingdom


In [23]:
%%time

obs_data = obs_cube_regrid.data

In [None]:
# Apply the mask to the observed and model data
obs_values = obs_data * MASK_MATRIX
# model_values = model_cube.data * MASK_MATRIX

# Where there are zeros we want to set these to NaNs
obs_values = np.where(obs_values == 0, np.nan, obs_values)
# model_values = np.where(model_values == 0, np.nan, model_values)

# Take the Nanmean of the data
# over lat and lon dims
obs_mean = np.nanmean(obs_values, axis=(1, 2))
# model_mean = np.nanmean(model_values, axis=(2, 3))

KeyboardInterrupt: 

In [None]:
%%time

# Load the model data
model_ds = funcs.load_model_data_xarray(
    model_variable=variable,
    model=model,
    experiment=experiment,
    start_year=init_year,
    end_year=init_year,
    first_fcst_year=init_year + 1,
    last_fcst_year=init_year + 10,
    months=months,
    member="r1i1p1f2",
    frequency=freq,
    parallel=False,
)

Model path: /badc/cmip6/data/CMIP6/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast


The model path root is badc
Number of unique variant labels: 10
For model: HadGEM3-GC31-MM
First 10 unique variant labels: ['r10i1p1f2' 'r1i1p1f2' 'r2i1p1f2' 'r3i1p1f2' 'r4i1p1f2' 'r5i1p1f2'
 'r6i1p1f2' 'r7i1p1f2' 'r8i1p1f2' 'r9i1p1f2']
Number of unique variant labels: 10
Unique variant labels: ['r10i1p1f2' 'r1i1p1f2' 'r2i1p1f2' 'r3i1p1f2' 'r4i1p1f2' 'r5i1p1f2'
 'r6i1p1f2' 'r7i1p1f2' 'r8i1p1f2' 'r9i1p1f2']
First 10 model files: ['/badc/cmip6/data/CMIP6/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/s1960-r10i1p1f2/day/tas/gn/files/d20200417/tas_day_HadGEM3-GC31-MM_dcppA-hindcast_s1960-r10i1p1f2_gn_19601101-19601230.nc', '/badc/cmip6/data/CMIP6/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/s1960-r10i1p1f2/day/tas/gn/files/d20200417/tas_day_HadGEM3-GC31-MM_dcppA-hindcast_s1960-r10i1p1f2_gn_19610101-19611230.nc', '/badc/cmip6/data/CMIP6/DCPP/MOHC/HadGEM3-GC31-MM/dcppA-hindcast/s1960-r10i1p1f2/day/tas/gn/files/d20200417/tas_day_HadGEM3-GC31-MM_dcppA-hindcast_s1960-r10i1p1f2_gn_19620101-19621230.nc', 

100%|██████████| 1/1 [00:00<00:00,  2.62it/s]00:00<?, ?it/s]
Processing init years: 100%|██████████| 1/1 [00:00<00:00,  2.38it/s]

CPU times: user 332 ms, sys: 78.1 ms, total: 410 ms
Wall time: 809 ms





In [None]:
model_ds

Unnamed: 0,Array,Chunk
Bytes,58.59 kiB,16 B
Shape,"(1, 1, 3750, 2)","(1, 1, 1, 2)"
Dask graph,3750 chunks in 27 graph layers,3750 chunks in 27 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 58.59 kiB 16 B Shape (1, 1, 3750, 2) (1, 1, 1, 2) Dask graph 3750 chunks in 27 graph layers Data type object numpy.ndarray",1  1  2  3750  1,

Unnamed: 0,Array,Chunk
Bytes,58.59 kiB,16 B
Shape,"(1, 1, 3750, 2)","(1, 1, 1, 2)"
Dask graph,3750 chunks in 27 graph layers,3750 chunks in 27 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.06 kiB,5.06 kiB
Shape,"(1, 1, 324, 2)","(1, 1, 324, 2)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 5.06 kiB 5.06 kiB Shape (1, 1, 324, 2) (1, 1, 324, 2) Dask graph 1 chunks in 4 graph layers Data type float64 numpy.ndarray",1  1  2  324  1,

Unnamed: 0,Array,Chunk
Bytes,5.06 kiB,5.06 kiB
Shape,"(1, 1, 324, 2)","(1, 1, 324, 2)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.75 kiB,6.75 kiB
Shape,"(1, 1, 432, 2)","(1, 1, 432, 2)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 6.75 kiB 6.75 kiB Shape (1, 1, 432, 2) (1, 1, 432, 2) Dask graph 1 chunks in 4 graph layers Data type float64 numpy.ndarray",1  1  2  432  1,

Unnamed: 0,Array,Chunk
Bytes,6.75 kiB,6.75 kiB
Shape,"(1, 1, 432, 2)","(1, 1, 432, 2)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.96 GiB,546.75 kiB
Shape,"(1, 1, 3750, 324, 432)","(1, 1, 1, 324, 432)"
Dask graph,3750 chunks in 27 graph layers,3750 chunks in 27 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.96 GiB 546.75 kiB Shape (1, 1, 3750, 324, 432) (1, 1, 1, 324, 432) Dask graph 3750 chunks in 27 graph layers Data type float32 numpy.ndarray",1  1  432  324  3750,

Unnamed: 0,Array,Chunk
Bytes,1.96 GiB,546.75 kiB
Shape,"(1, 1, 3750, 324, 432)","(1, 1, 1, 324, 432)"
Dask graph,3750 chunks in 27 graph layers,3750 chunks in 27 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
# convert modify the member coordinate
model_ds["member"] = model_ds["member"].str[1:-6].astype(int)

# convert to an iris cube
model_cube = model_ds[variable].squeeze().to_iris()

# Make sure cube is on the correct grid system
model_cube = model_cube.intersection(longitude=(-180, 180))

In [None]:
# Create the mask matrix for the UK
MASK_MATRIX = funcs.create_masked_matrix(
    country=country,
    cube=model_cube,
)

Found Country United Kingdom


In [None]:
%%time

model_data = model_cube.data

CPU times: user 25.9 s, sys: 4.07 s, total: 30 s
Wall time: 21.9 s


In [None]:
%%time

# Apply the mask to the model cube
model_values = model_data * MASK_MATRIX

# Where there are zeros in the mask we want to set these to Nans
model_values_masked = np.where(MASK_MATRIX == 0, np.nan, model_values)

CPU times: user 1.66 s, sys: 1.25 s, total: 2.91 s
Wall time: 2.93 s


In [None]:
%%time

# Take the Nanmean of the data
model_values = np.nanmean(model_values_masked, axis=(1, 2))

CPU times: user 3.4 s, sys: 817 ms, total: 4.22 s
Wall time: 4.24 s


In [None]:
model_values_masked.shape

(3750, 324, 432)

In [None]:
%%time

model_df = pd.DataFrame()

# Extract the ini years, member and lead times
init_years = model_cube.coord("init").points
members = model_cube.coord("member").points
lead_times = model_cube.coord("lead").points


# loop through the inits, members and leadtimes
for i, init_year in enumerate(init_years):
    for m, member in enumerate(members):
        for l, lead_time in enumerate(lead_times):
            # get the model data
            model_data = model_values[l]

            # set up the model df this
            model_df_this = pd.DataFrame(
                {
                    "init_year": [init_year],
                    "member": [member],
                    "lead": [lead_time],
                    "data": [model_data],
                },
            )

            # concat to the model df
            model_df = pd.concat([model_df, model_df_this])

CPU times: user 1.6 s, sys: 1.47 ms, total: 1.6 s
Wall time: 1.61 s


In [None]:
model_df.tail()

Unnamed: 0,init_year,member,lead,data
0,1960,1,3746,270.007662
0,1960,1,3747,270.725837
0,1960,1,3748,271.052666
0,1960,1,3749,271.333062
0,1960,1,3750,271.023811


In [None]:
# if country has a space, replace with _
country = country.replace(" ", "_")

# Set up the name for the df
df_name = f"{model}_{experiment}_{variable}_{country}_{init_year}_{member}_{freq}.csv"

In [None]:
df_name

'HadGEM3-GC31-MM_dcppA-hindcast_tas_United_Kingdom_1960_1_day.csv'

In [None]:
output_dir_dfs

'/gws/nopw/j04/canari/users/benhutch/unseen/saved_dfs'

In [None]:
# Save the df
model_df.to_csv(os.path.join(output_dir_dfs, df_name))