# ERA5 → CSV Exports (t2m + 10 m wind)

This notebook loads your ERA5 **GRIB** file and exports four CSVs. It looks for 2 m temperature (`t2m`) and 10 m wind components (`u10`, `v10`).
It computes 10 m wind speed as `sqrt(u10**2 + v10**2)` (in m/s).

### It will create:
1. `era5_dublin_t2m_<YEAR>.csv` — Dublin 2 m temperature
2. `era5_dublin_t2m_wind_<YEAR>.csv` — Dublin 2 m temperature + u10, v10, wind_speed10m_ms
3. `era5_ireland3_t2m_<YEAR>.csv` — 2 m temperature for Dublin, Galway, Cork
4. `era5_ireland3_t2m_wind_<YEAR>.csv` — t2m + wind speed for Dublin, Galway, Cork (one column per city/metric)

### Requirements
`xarray`, `cfgrib`, `eccodes`, `numpy`, `pandas`. Install if needed:
```bash
pip install xarray cfgrib eccodes numpy pandas
```


In [4]:
# --- Parameters ---
from pathlib import Path

# Input: the ERA5 GRIB file to read (edit this if yours lives elsewhere).
GRIB_PATH = Path('data.grib')

# Output: folder that receives the CSVs; '.' means the current folder.
# It is created here (parents included) and left alone if it already exists.
OUTDIR = Path('.')
OUTDIR.mkdir(exist_ok=True, parents=True)

GRIB_PATH  # echo the configured input path as the cell output

PosixPath('data.grib')

In [5]:
# --- Imports & helpers ---
import numpy as np
import pandas as pd
import xarray as xr

# City name -> (latitude, longitude). The pair is unpacked in that order
# wherever a city is passed to select_point_series.
CITIES = dict(
    Dublin=(53.35, -6.26),
    Galway=(53.27, -9.05),
    Cork=(51.90, -8.47),
)

def open_era5_grib(path: Path) -> xr.Dataset:
    """Load a GRIB file as one xarray Dataset, merging message groups if needed.

    A plain ``xr.open_dataset(..., engine='cfgrib')`` is attempted first and its
    result is returned as soon as it holds at least one data variable. If that
    attempt raises, the error is printed (not re-raised). In that case, or when
    the plain open produced no data variables, every group returned by
    ``cfgrib.open_datasets`` is merged with an outer join.

    Args:
        path: Location of the GRIB file.

    Returns:
        The directly opened dataset, or the merge of all message groups.
    """
    try:
        direct = xr.open_dataset(path, engine='cfgrib')
        # An empty data_vars mapping means nothing usable was decoded.
        if direct.data_vars:
            return direct
    except Exception as err:
        print('Simple open failed, trying grouped open via cfgrib.open_datasets()...\n', err)

    # Fallback path: cfgrib is only needed here, hence the local import.
    import cfgrib
    parts = cfgrib.open_datasets(str(path))
    print(f'Found {len(parts)} message group(s)')
    return xr.merge(parts, compat='no_conflicts', join='outer')

def pick_coord_names(ds: xr.Dataset):
    """Return the (latitude, longitude, time) coordinate names to use for ``ds``.

    Latitude and longitude use 'latitude' / 'longitude' when those coordinates
    exist and otherwise fall back to 'lat' / 'lon'. Time uses 'time' when
    present, otherwise the first coordinate whose name contains 'time'.

    Args:
        ds: Dataset whose coordinates are inspected.

    Returns:
        Tuple ``(lat_name, lon_name, time_name)``.

    Raises:
        KeyError: If no coordinate name contains 'time'.
    """
    lat_name = 'latitude' if 'latitude' in ds.coords else 'lat'
    lon_name = 'longitude' if 'longitude' in ds.coords else 'lon'

    if 'time' in ds.coords:
        time_name = 'time'
    else:
        # Give next() a default so a dataset without any time-like coordinate
        # produces a descriptive error instead of a bare StopIteration.
        time_name = next((c for c in ds.coords if 'time' in str(c)), None)
        if time_name is None:
            raise KeyError(
                f'No time-like coordinate found; available coordinates: {list(ds.coords)}'
            )
    return lat_name, lon_name, time_name

def find_var(ds: xr.Dataset, candidates, keywords=None):
    """Locate a data variable by preferred name, then by keyword match.

    Args:
        ds: Dataset to search.
        candidates: Variable names tried in order; the first one present in
            ``ds.data_vars`` wins.
        keywords: Optional substrings. When no candidate is present, the first
            variable whose name plus stringified attribute values contain ALL
            keywords (case-insensitive) is returned.

    Returns:
        The matching variable name, or None when nothing matches. With no
        keywords given, only the candidate lookup is performed.
    """
    for cand in candidates:
        if cand in ds.data_vars:
            return cand

    kws = [str(k).lower() for k in (keywords or [])]
    if not kws:
        # all() over an empty keyword list is vacuously True, which would hand
        # back whichever variable happens to come first. Report "not found".
        return None

    for name, da in ds.data_vars.items():
        # Searchable text: the variable name followed by every attribute value.
        blob = ' '.join([str(name)] + [str(x) for x in da.attrs.values()]).lower()
        if all(kw in blob for kw in kws):
            return name
    return None

def ensure_units_t2m(da: xr.DataArray) -> xr.DataArray:
    """Return the temperature array in degrees Celsius.

    The input is treated as Kelvin when its 'units' attribute is 'k' or
    'kelvin' (case-insensitive) or, failing that, when its maximum value
    exceeds 200. In that case 273.15 is subtracted and the result's 'units'
    attribute is set to 'degC'. Otherwise the input is returned as-is.
    """
    declared_units = da.attrs.get('units', '').lower()
    # The max() check is only evaluated when the units attribute is inconclusive.
    looks_like_kelvin = declared_units in {'k', 'kelvin'} or float(da.max()) > 200
    if not looks_like_kelvin:
        return da

    celsius = da - 273.15
    celsius.attrs['units'] = 'degC'
    return celsius

def select_point_series(ds: xr.Dataset, var: str, lat: float, lon: float, lat_name: str, lon_name: str):
    """Extract variable ``var`` at the grid point nearest to (lat, lon).

    If the dataset's longitude coordinate has values above 180, the requested
    longitude is first wrapped into the 0-360 range. Size-1 dimensions left
    after the selection are squeezed out and dropped.

    Args:
        ds: Source dataset.
        var: Name of the variable to extract.
        lat, lon: Target location.
        lat_name, lon_name: Names of the latitude / longitude coordinates in ``ds``.

    Returns:
        The selected DataArray.
    """
    grid_is_0_to_360 = float(ds[lon_name].max()) > 180
    if grid_is_0_to_360:
        target_lon = (lon + 360) % 360
    else:
        target_lon = lon

    nearest = ds[var].sel({lat_name: lat, lon_name: target_lon}, method='nearest')
    return nearest.squeeze(drop=True)

def restrict_to_year(da: xr.DataArray, time_name: str, year: int) -> xr.DataArray:
    """Keep only the entries of ``da`` whose timestamp lies within ``year``.

    The window runs from ``<year>-01-01`` up to and including
    ``<year>-12-31 23:59:59``; matching positions along ``time_name`` are
    selected by integer index.
    """
    stamps = pd.to_datetime(da[time_name].values)
    window_start = f'{year}-01-01'
    window_end = f'{year}-12-31 23:59:59'
    in_window = (stamps >= window_start) & (stamps <= window_end)
    keep_positions = np.where(in_window)[0]
    return da.isel({time_name: keep_positions})

def maybe_resample(ser: pd.Series, how: str = 'none'):
    """Optionally aggregate a time-indexed series to daily means.

    Args:
        ser: Series to process. For 'daily_mean' it is resampled with
            ``ser.resample('1D')``.
        how: 'none' (or None) returns ``ser`` unchanged at its native
            timestep; 'daily_mean' returns the mean of each calendar day.

    Returns:
        The original series, or its daily-mean resample.

    Raises:
        ValueError: If ``how`` is not a recognised option. A misspelled option
            would otherwise silently export un-resampled data.
    """
    if how is None or how == 'none':
        return ser  # keep the native timestep
    if how == 'daily_mean':
        return ser.resample('1D').mean()
    raise ValueError(f"Unknown resampling option {how!r}; expected 'none' or 'daily_mean'.")

def wind_speed(u: xr.DataArray, v: xr.DataArray) -> xr.DataArray:
    """Return the magnitude ``sqrt(u**2 + v**2)`` of the (u, v) wind components."""
    squared_magnitude = u**2 + v**2
    return np.sqrt(squared_magnitude)


In [6]:
# --- Open dataset and pick variables ---
assert GRIB_PATH.exists(), f'GRIB not found: {GRIB_PATH}'
ds = open_era5_grib(GRIB_PATH)
lat_name, lon_name, time_name = pick_coord_names(ds)

# 2 m temperature is mandatory. If the lookup finds nothing, fall back to the
# canonical short name so the membership check below produces the error.
var_t2m = find_var(ds, candidates=['t2m', 't'], keywords=['2', 'metre', 'temperature']) or 't2m'
if var_t2m not in ds:
    raise RuntimeError('Could not find 2 m temperature (t2m) in dataset.')

# The 10 m wind components are optional; the export cells check for them again.
var_u10 = find_var(ds, candidates=['u10'], keywords=['10', 'wind', 'u-component']) or 'u10'
var_v10 = find_var(ds, candidates=['v10'], keywords=['10', 'wind', 'v-component']) or 'v10'
if var_u10 not in ds or var_v10 not in ds:
    print('Warning: u10 and/or v10 not found — wind exports will omit missing components.')

# Distinct calendar years on the time axis, as a sorted list of plain ints.
# One vectorized conversion replaces a pd.to_datetime call per timestamp.
years_available = sorted(set(pd.to_datetime(ds[time_name].values).year.tolist()))
years_available

skipping variable: paramId==228 shortName='tp'
Traceback (most recent call last):
  File "/Users/andrewparnell/GitHub/STAT41130/.venv/lib/python3.11/site-packages/cfgrib/dataset.py", line 725, in build_dataset_components
    dict_merge(variables, coord_vars)
  File "/Users/andrewparnell/GitHub/STAT41130/.venv/lib/python3.11/site-packages/cfgrib/dataset.py", line 641, in dict_merge
    raise DatasetBuildError(
cfgrib.dataset.DatasetBuildError: key present and new value is different: key='time' value=Variable(dimensions=('time',), data=array([1704067200, 1704070800, 1704074400, ..., 1735678800, 1735682400,
       1735686000], shape=(8784,))) new_value=Variable(dimensions=('time',), data=array([1704045600, 1704088800, 1704132000, 1704175200, 1704218400,
       1704261600, 1704304800, 1704348000, 1704391200, 1704434400,
       1704477600, 1704520800, 1704564000, 1704607200, 1704650400,
       1704693600, 1704736800, 1704780000, 1704823200, 1704866400,
       1704909600, 1704952800, 1704996

[2024]

## Choose year and (optional) resampling
- `YEAR`: one of the years detected above.
- `RESAMPLE`: `'none'` keeps the native timestep (e.g., hourly or 6-hourly). Use `'daily_mean'` to average every exported series (temperature and wind) to daily means.

In [7]:
# Resampling mode applied to every export below; options: 'none', 'daily_mean'.
RESAMPLE = 'none'

# Year to export: defaults to the first detected year (change if needed, e.g., 2021).
YEAR = int(years_available[0])

YEAR, RESAMPLE

(2024, 'none')

## 1) Dublin — 2 m temperature

In [8]:
# Temperature at the grid point nearest Dublin, converted to degC when needed.
dublin_lat, dublin_lon = CITIES['Dublin']
t2m_dub = ensure_units_t2m(
    select_point_series(ds, var_t2m, dublin_lat, dublin_lon, lat_name, lon_name)
)

# Limit to the chosen year, then apply the optional resampling.
t2m_dub_y = maybe_resample(
    restrict_to_year(t2m_dub, time_name, YEAR).to_series(),
    RESAMPLE,
)

# Single-column frame indexed by time, written straight to CSV.
df1 = pd.DataFrame(
    {'time': t2m_dub_y.index, 't2m_degC': t2m_dub_y.values}
).set_index('time')
csv1 = OUTDIR / f'era5_dublin_t2m_{YEAR}.csv'
df1.to_csv(csv1)
csv1

PosixPath('era5_dublin_t2m_2024.csv')

## 2) Dublin — 2 m temperature + 10 m wind (u10, v10, speed)

In [9]:
if not (var_u10 in ds and var_v10 in ds):
    # No wind components available: fall back to the temperature-only table.
    print('u10/v10 missing: exporting temperature-only for step 2.')
    df2 = df1.copy()
else:
    dublin = CITIES['Dublin']
    u_dub = select_point_series(ds, var_u10, *dublin, lat_name, lon_name)
    v_dub = select_point_series(ds, var_v10, *dublin, lat_name, lon_name)
    ws_dub = wind_speed(u_dub, v_dub)

    # Temperature was already restricted and resampled in step 1; the wind
    # fields get the same treatment here.
    t2m_s = t2m_dub_y
    u_s, v_s, ws_s = (
        maybe_resample(restrict_to_year(point_da, time_name, YEAR).to_series(), RESAMPLE)
        for point_da in (u_dub, v_dub, ws_dub)
    )

    # Dict keys become the column names; the series are aligned on their index.
    df2 = pd.concat(
        {'t2m_degC': t2m_s, 'u10_ms': u_s, 'v10_ms': v_s, 'wind_speed10m_ms': ws_s},
        axis=1,
    )

csv2 = OUTDIR / f'era5_dublin_t2m_wind_{YEAR}.csv'
df2.to_csv(csv2)
csv2

PosixPath('era5_dublin_t2m_wind_2024.csv')

## 3) Dublin, Galway, Cork — 2 m temperature

In [10]:
# One temperature series per city, keyed by city name.
rows = {}
for city_name in ('Dublin', 'Galway', 'Cork'):
    point_t2m = select_point_series(ds, var_t2m, *CITIES[city_name], lat_name, lon_name)
    yearly = restrict_to_year(ensure_units_t2m(point_t2m), time_name, YEAR).to_series()
    rows[city_name] = maybe_resample(yearly, RESAMPLE)

# Side-by-side columns named <City>_t2m_degC, indexed by time.
df3 = pd.concat(rows, axis=1)
df3.columns = [f'{name}_t2m_degC' for name in df3.columns]
df3.index.name = 'time'

csv3 = OUTDIR / f'era5_ireland3_t2m_{YEAR}.csv'
df3.to_csv(csv3)
csv3

PosixPath('era5_ireland3_t2m_2024.csv')

## 4) Dublin, Galway, Cork — 2 m temperature + 10 m wind speed

In [11]:
if not (var_u10 in ds and var_v10 in ds):
    # No wind components available: reuse the temperature-only table from step 3.
    print('u10/v10 missing: exporting temperature-only for step 4.')
    df4 = df3.copy()
else:
    t_rows, ws_rows = {}, {}
    for place in ('Dublin', 'Galway', 'Cork'):
        coords = CITIES[place]
        temp_da = ensure_units_t2m(select_point_series(ds, var_t2m, *coords, lat_name, lon_name))
        speed_da = wind_speed(
            select_point_series(ds, var_u10, *coords, lat_name, lon_name),
            select_point_series(ds, var_v10, *coords, lat_name, lon_name),
        )
        # Same year restriction and optional resampling for both metrics.
        t_rows[place], ws_rows[place] = (
            maybe_resample(restrict_to_year(point_da, time_name, YEAR).to_series(), RESAMPLE)
            for point_da in (temp_da, speed_da)
        )

    df_t = pd.concat(t_rows, axis=1)
    df_ws = pd.concat(ws_rows, axis=1)
    # Two-level columns (metric, city): all temperature columns, then all wind.
    df4 = pd.concat({'t2m_degC': df_t, 'wind_speed10m_ms': df_ws}, axis=1)
    # Flatten the (metric, city) pairs to City_Metric labels.
    df4.columns = [f"{place}_{metric}" for metric, place in df4.columns]
    df4.index.name = 'time'

csv4 = OUTDIR / f'era5_ireland3_t2m_wind_{YEAR}.csv'
df4.to_csv(csv4)
csv4

PosixPath('era5_ireland3_t2m_wind_2024.csv')

### Done
The four CSVs should now be saved in `OUTDIR`. Set `RESAMPLE = 'daily_mean'` if you prefer daily averages.
Wind speed is in **m/s**. Temperature is in **°C**.
