# Extract reanalysis data at point - station

Code correct as of Summer 2023.

Extract data interpolated at a specified point from the daily ERA-Interim/ERA5/ERA5-Land data held on the bas_climate group workspace. The currently available daily data (all means unless otherwise stated) are:

* MSLP (msl) or surface pressure (sp - ERA5-Land)
* 2m air temperature -- daily mean (t2m/mean_t2m), maximum (mx2t/max_t2m) and minimum (mn2t/min_t2m)
* soil temperature on 4 levels (stl1, stl2, stl3, stl4)
* 10m wind field components (u10, v10)
* 2m dew point temperature (d2m)
* daily total precipitation (tp)
* snow depth (sd) and snow density (rsn)

The variable codes shown in brackets above are the names of the directories in which the data for that variable are held on the bas_climate gws.

* ERA-Interim data are available January 1979 to August 2019
* ERA5 data are available January 1940 to present
* ERA5-Land data are available January 1950 to present

## Inputs - always run

In [None]:
from datetime import datetime
from glob import glob
import iris
import iris.pandas
from iris.time import PartialDateTime
from pathlib import Path
import numpy as np
import os
import pandas as pd
from tqdm import tqdm

# Set the environment variable HDF5_USE_FILE_LOCKING to avoid potential hangs
# This means that the netCDF library ignores advisory exclusive locks on ERA5 data files, otherwise a hang may occur
os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'

## Parameters

In [None]:
# Variable codes to extract - see the list of variable codes at the top of this notebook.
# `variables` is used by the sub-daily extraction and `variablesdaily` by the daily extraction.
# Keep exactly one reanalysis pair below uncommented.
# NOTE - variablesdaily does NOT include d2m because 'TTTR' does not include a d2m value

# --- ERA-Interim ---
#variables = ['msl', 'd2m', 't2m', 'u10', 'v10', 'mx2t', 'mn2t', 'tp']
#variablesdaily = ['SD', 'RSN', 'T2', 'TP', 't2max', 't2min', 'MSL', 'u10', 'v10']

# --- ERA5 ---
variables = [
    'msl', 'd2m', 't2m',
    'u10', 'v10',
    'mx2t', 'mn2t',
    'tp',
]
variablesdaily = [
    'stl1', 'stl2', 'stl3', 'stl4',
    'msl', 'sd', 'rsn',
    't2m', 'mx2t', 'mn2t',
    'tp',
    'u10', 'v10',
]

# --- ERA5-Land ---
#variables = ['sp', 'd2m', 'mean_t2m', 'u10', 'v10', 'max_t2m', 'min_t2m', 'tp']
#variablesdaily = ['stl1', 'stl2', 'stl3', 'stl4', 'sp', 'sd', 'rsn', 'd2m', 'mean_t2m', 'max_t2m', 'min_t2m', 'tp', 'u10', 'v10']

# WMO number of the station (also used to build the output file names)
station = 23678

# Coordinates of the extraction point - use the met station coordinates.
# The coarse resolution gridded data are interpolated to this point.
# Degrees North/East are positive; degrees South/West are negative.
lat = 63.15
lon = 87.95

# First date of the extraction period
start_year = 1979
start_month = 1
start_day = 1

# Last date of the extraction period
end_year = 2019
end_month = 8
end_day = 31

## Read and extract data - sub-daily

In [None]:
# ERA-Interim is 6-hourly; ERA5 and ERA5-Land are 3-hourly
# the location of the data
hourly_data_dir = '/gws/nopw/j04/bas_climate/users/clelland/era5/58.0N-64.5N_82.0E-95.5E/3-hourly' # <-- CHANGE AS NECESSARY

# one point time series (cube) per variable, in the same order as `variables`
values = iris.cube.CubeList()

# the requested date range is the same for every file, so build the partial dates once
# (only the year, month and day fields are set on these partial dates)
start_date_pdt = PartialDateTime(start_year, start_month, start_day)
end_date_pdt = PartialDateTime(end_year, end_month, end_day)

for variable in variables:
    print('At {}, reading data for {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), variable))

    files = sorted(glob('{}/{}/*.nc'.format(hourly_data_dir, variable)))
    if not files:
        # fail here with a clear message rather than later when concatenating an empty list
        raise ValueError('No .nc files found for variable {} in {}/{}'.format(variable, hourly_data_dir, variable))

    var_values = iris.cube.CubeList()

    for file in tqdm(files):
        # read the data
        cube = iris.load_cube(file)

        # extract the required date range
        # note: if the date range does not intersect the range of data read, the resulting cube will be None
        # the name of the time coordinate is looked up per file because it is used as the constraint key
        time_coord_name = cube.coord(axis='t').name()
        start_con = iris.Constraint(coord_values={time_coord_name: lambda cell: start_date_pdt <= cell.point})
        end_con = iris.Constraint(coord_values={time_coord_name: lambda cell: end_date_pdt >= cell.point})
        cube = cube.extract(start_con & end_con)

        if cube is not None:
            # interpolate the value at the requested point
            # extrapolation_mode='error' makes a point outside the data grid raise instead of being extrapolated
            var_values.append(cube.interpolate([(cube.coord(axis='x'), lon), (cube.coord(axis='y'), lat)], iris.analysis.Linear(extrapolation_mode='error')))

    if not var_values:
        # none of the files overlapped the requested dates
        raise ValueError('No data for variable {} between {:04d}-{:02d}-{:02d} and {:04d}-{:02d}-{:02d} in {}/{}'.format(
            variable, start_year, start_month, start_day, end_year, end_month, end_day, hourly_data_dir, variable))

    # concatenate the interpolated cubes into a single cube and store it
    values.append(var_values.concatenate_cube())

In [None]:
# one single-column DataFrame per variable; they are joined on the time index after the loop
frames = []

for var_values in values:
    # copy the cube so that the cube stored in `values` keeps its original time coordinate,
    # then set the minutes of every time point to zero so that the timestamps of all variables agree
    # (only the minutes are changed here - the hour of each sub-daily value is kept)
    var_values_for_df = var_values.copy()
    time_coord = var_values_for_df.coord(axis='t') # <-- CHECK THIS: could be 'time'
    time_coord.points = np.array([time_coord.units.date2num(tp.replace(minute=0)) for tp in time_coord.units.num2date(time_coord.points)])

    # convert the cube to a single-column DataFrame named after the variable
    frames.append(iris.pandas.as_series(var_values_for_df).to_frame(name=var_values_for_df.var_name))

# join all variables side by side in a single step
df = pd.concat(frames, axis=1)

# name the index after the time coordinate (taken from the last variable processed)
df.index.rename(time_coord.name(), inplace=True)

# save station pandas dataframe to folder in directory
filepath = Path(f'/home/users/clelland/era5/stations_pandas_files/{station}/{station}e53h.csv') # <-- CHANGE AS NECESSARY
# create the per-station output folder if it does not exist yet, otherwise to_csv fails
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, sep=';')

## Read and extract data - daily

In [None]:
# the location of the daily data
daily_data_dir = '/gws/nopw/j04/bas_climate/users/clelland/era5/58.0N-64.5N_82.0E-95.5E/daily' # <-- CHANGE AS NECESSARY

# one point time series (cube) per variable, in the same order as `variablesdaily`
valuesdaily = iris.cube.CubeList()

# the requested date range is the same for every file, so build the partial dates once
# (only the year, month and day fields are set on these partial dates)
start_date_pdt = PartialDateTime(start_year, start_month, start_day)
end_date_pdt = PartialDateTime(end_year, end_month, end_day)

for variable in variablesdaily:
    print('At {}, reading data for {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), variable))

    files = sorted(glob('{}/{}/*.nc'.format(daily_data_dir, variable)))
    if not files:
        # fail here with a clear message rather than later when concatenating an empty list
        raise ValueError('No .nc files found for variable {} in {}/{}'.format(variable, daily_data_dir, variable))

    var_values = iris.cube.CubeList()

    for file in tqdm(files):
        # read the data
        cube = iris.load_cube(file)

        # extract the required date range
        # note: if the date range does not intersect the range of data read, the resulting cube will be None
        # the name of the time coordinate is looked up per file because it is used as the constraint key
        time_coord_name = cube.coord(axis='t').name()
        start_con = iris.Constraint(coord_values={time_coord_name: lambda cell: start_date_pdt <= cell.point})
        end_con = iris.Constraint(coord_values={time_coord_name: lambda cell: end_date_pdt >= cell.point})
        cube = cube.extract(start_con & end_con)

        if cube is not None:
            # interpolate the value at the requested point
            # extrapolation_mode='error' makes a point outside the data grid raise instead of being extrapolated
            var_values.append(cube.interpolate([(cube.coord(axis='x'), lon), (cube.coord(axis='y'), lat)], iris.analysis.Linear(extrapolation_mode='error')))

    if not var_values:
        # none of the files overlapped the requested dates
        raise ValueError('No data for variable {} between {:04d}-{:02d}-{:02d} and {:04d}-{:02d}-{:02d} in {}/{}'.format(
            variable, start_year, start_month, start_day, end_year, end_month, end_day, daily_data_dir, variable))

    # concatenate the interpolated cubes into a single cube and store it
    valuesdaily.append(var_values.concatenate_cube())

In [None]:
# one single-column DataFrame per variable; they are joined on the time index after the loop
frames = []

for var_values in valuesdaily:
    # copy the cube and set the "time" part of its time coordinate to 12:00 so that they all agree
    # this is not the case in the daily data files as the means of hourly means are 24-hour means running from 00:00 to 24:00,
    # but the means of instantaneous data are actually 23-hour means running from 00:00 to 23:00
    # (working on a copy leaves the cube stored in `valuesdaily` with its original time coordinate)
    var_values_for_df = var_values.copy()
    time_coord = var_values_for_df.coord(axis='t') # <-- CHECK THIS: could be 'time'
    time_coord.points = np.array([time_coord.units.date2num(tp.replace(hour=12, minute=0)) for tp in time_coord.units.num2date(time_coord.points)])

    # convert the cube to a single-column DataFrame named after the variable
    frames.append(iris.pandas.as_series(var_values_for_df).to_frame(name=var_values_for_df.var_name))

# join all variables side by side in a single step
dfdaily = pd.concat(frames, axis=1)

# name the index after the time coordinate (taken from the last variable processed)
dfdaily.index.rename(time_coord.name(), inplace=True)

# save station pandas dataframe to folder in home directory
filepath = Path(f'/home/users/clelland/era5/stations_pandas_files/{station}/{station}e5daily.csv') # <-- CHANGE AS NECESSARY
# create the per-station output folder if it does not exist yet, otherwise to_csv fails
filepath.parent.mkdir(parents=True, exist_ok=True)
dfdaily.to_csv(filepath, sep=';')