## simple notebook to get ERA5 data for Kelmarsh wind farm

We often need to fill gaps in on-site records. Datasets like MERRA2 and ERA5 give us a data source that can be used to build models to fill those gaps.

This notebook downloads ERA5 data for the Kelmarsh wind farm in the UK (latitude 52.401461, longitude -0.943105),
following the API instructions published by <br>
CDS https://cds.climate.copernicus.eu/how-to-api


This code will not work until you sign up for a CDS account and follow the instructions on the page above to get your own API key.

### Identify surrounding 4 grid points

In [1]:
import math
import numpy as np

def get_surrounding_grid_points(lat, lon, interval=0.25):
    """Return the 3x3 block of grid nodes centred on the node nearest a location.

    Args:
        lat: Latitude of the location of interest.
        lon: Longitude of the location of interest.
        interval: Spacing of the regular grid, in the same units as
            ``lat`` and ``lon`` (default 0.25).

    Returns:
        A list of nine ``(lat, lon)`` tuples. Latitude varies slowest:
        for each of ``nearest - interval``, ``nearest``,
        ``nearest + interval`` in latitude, the three longitudes are
        listed in the same order.
    """
    # Snap the location onto the grid
    centre_lat = round(lat / interval) * interval
    centre_lon = round(lon / interval) * interval

    # Walk the neighbouring rows and columns around the snapped node
    grid = []
    for grid_lat in (centre_lat - interval, centre_lat, centre_lat + interval):
        for grid_lon in (centre_lon - interval, centre_lon, centre_lon + interval):
            grid.append((grid_lat, grid_lon))

    return grid

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The haversine distance in kilometres, computed on a sphere of
        radius 6371 km.
    """
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Compute differences
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2

    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) raise
    # ValueError; clamp it so those inputs still return a distance.
    a = min(1.0, a)

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in kilometers
    return R * c

def get_closest_grid_points(lat, lon, num_points=4):
    """Return the grid nodes nearest to a location, closest first.

    Candidates are the nodes produced by ``get_surrounding_grid_points``
    with its default grid spacing; they are ranked purely by haversine
    distance from ``(lat, lon)``.

    Args:
        lat: Latitude of the location, in degrees.
        lon: Longitude of the location, in degrees.
        num_points: Maximum number of nodes to return (default 4).

    Returns:
        A list of ``(lat, lon)`` tuples ordered by increasing distance.
    """
    candidates = get_surrounding_grid_points(lat, lon)

    def distance_to_site(grid_point):
        # Distance in km from the requested location to this node
        return haversine(lat, lon, grid_point[0], grid_point[1])

    # sorted() is stable, so equidistant nodes keep their candidate order
    ranked = sorted(candidates, key=distance_to_site)
    return ranked[:num_points]

# Example usage: approximate coordinates of the Kelmarsh wind farm
lat = 52.40
lon = -0.943
# closest_points is reused by the download cell further down
closest_points = get_closest_grid_points(lat, lon)

# Report every selected grid node together with its distance from the site
for point in closest_points:
    point_lat, point_lon = point
    distance = np.round(haversine(lat, lon, point_lat, point_lon), 3)
    print(f'For point {point}, distance is {distance} km from Kelmarsh at {lat}, {lon}')

For point (52.5, -1.0), distance is 11.771 km from Kelmarsh at 52.4, -0.943
For point (52.25, -1.0), distance is 17.123 km from Kelmarsh at 52.4, -0.943
For point (52.5, -0.75), distance is 17.167 km from Kelmarsh at 52.4, -0.943
For point (52.25, -0.75), distance is 21.219 km from Kelmarsh at 52.4, -0.943


### get the data from CDS

As noted above, you have to get an API key first from <br>
CDS https://cds.climate.copernicus.eu/how-to-api




In [None]:
import cdsapi
from calendar import monthrange
from pathlib import Path
import time

def download_era5_data(year, month, closest_points, output_dir, max_retries=3, retry_delay=5):
    """Download one month of hourly ERA5 single-level data from CDS as NetCDF.

    The request covers the bounding box spanned by ``closest_points`` and
    every hour of every day in the given month. The file is written to
    ``output_dir`` as ``era5_single_levels_<year><month>.nc``.

    Args:
        year: Four-digit year to download.
        month: Month number, 1-12.
        closest_points: Iterable of ``(lat, lon)`` pairs; their min/max
            latitude and longitude define the requested area.
        output_dir: Directory for the downloaded file; created if missing.
        max_retries: Number of download attempts before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        True if the file was retrieved, False if every attempt failed, so
        callers can detect months that are missing.

    Raises:
        ValueError: If ``closest_points`` is empty.
    """
    # Validate before touching the network or the file system
    closest_points = list(closest_points)
    if not closest_points:
        raise ValueError('closest_points must contain at least one (lat, lon) pair')

    # Initialize the CDS API client
    c = cdsapi.Client()

    # Get the number of days in the month
    num_days = monthrange(year, month)[1]

    # Ensure the output directory exists
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Calculate the expected range from the closest points
    latitudes = [point[0] for point in closest_points]
    longitudes = [point[1] for point in closest_points]
    lat_min, lat_max = min(latitudes), max(latitudes)
    lon_min, lon_max = min(longitudes), max(longitudes)

    request = {
        'product_type': 'reanalysis',
        'format': 'netcdf',  # Options: 'grib' or 'netcdf'
        'variable': [
            '2m_temperature', '10m_u_component_of_wind', '10m_v_component_of_wind',
            'surface_pressure', '100m_u_component_of_wind', '100m_v_component_of_wind'
        ],
        'year': str(year),
        'month': f'{month:02d}',
        'day': [f'{day:02d}' for day in range(1, num_days + 1)],
        'time': [f'{hour:02d}:00' for hour in range(24)],  # '00:00' .. '23:00'
        'area': [
            lat_max, lon_min, lat_min, lon_max,
        ],  # North, West, South, East
    }
    target = output_dir / f'era5_single_levels_{year}{month:02d}.nc'  # Output file name

    # Request ERA5 data with retry mechanism
    for attempt in range(max_retries):
        try:
            c.retrieve('reanalysis-era5-single-levels', request, str(target))
        except Exception as e:  # broad on purpose: any failure triggers a retry
            print(f'Failed to download data for {year}-{month:02d} on attempt {attempt + 1}: {e}')
            if attempt < max_retries - 1:
                print('Retrying...')
                time.sleep(retry_delay)  # Wait before retrying
            else:
                print('Max retries reached. Moving to the next month.')
        else:
            print(f'Successfully downloaded data for {year}-{month:02d} into {output_dir}')
            return True

    return False

# Example usage: fetch all twelve months of a single year
year = 2020
output_dir = Path('era5_data')

# Create the download folder up front
output_dir.mkdir(parents=True, exist_ok=True)

# closest_points must already exist (it is built in the grid-point cell), e.g.
# closest_points = [(52.5, -1.0), (52.25, -1.0), (52.5, -0.75), (52.25, -0.75)]

for month in range(1, 13):
    download_era5_data(
        year=year,
        month=month,
        closest_points=closest_points,
        output_dir=output_dir,
    )

### combine nc files into polars df

The data is in NetCDF format, so we read the .nc files and combine them into a single Polars DataFrame.


In [169]:
import xarray as xr
import polars as pl
from pathlib import Path

def load_nc_files_to_polars_df(nc_files):
    """Read NetCDF files and stack their contents into one Polars DataFrame.

    Each file is opened with xarray, flattened to a pandas DataFrame with
    its index levels turned into ordinary columns, converted to Polars,
    and finally all per-file frames are concatenated.

    Args:
        nc_files: Iterable of paths to NetCDF files.

    Returns:
        A single ``pl.DataFrame`` holding the rows of every input file, in
        the order the files were given.

    Raises:
        ValueError: If ``nc_files`` yields no files.
    """
    dfs = []
    for nc_file in nc_files:
        # The context manager closes the underlying file handle as soon as
        # the data has been copied into the pandas DataFrame.
        with xr.open_dataset(nc_file) as ds:
            df = ds.to_dataframe().reset_index()

        # Convert Pandas DataFrame to Polars DataFrame
        dfs.append(pl.from_pandas(df))

    if not dfs:
        raise ValueError('No NetCDF files were provided to load')

    # Concatenate all Polars DataFrames
    return pl.concat(dfs)

# Example usage: gather every .nc file from the download folder
output_dir = Path('era5_data')
nc_files = list(output_dir.glob('*.nc'))

# Read the files, drop the 'expver' and 'number' columns, then order the rows
# by timestamp and grid position
raw_df = load_nc_files_to_polars_df(nc_files)
polars_df = raw_df.drop(['expver', 'number']).sort(['valid_time', 'latitude', 'longitude'])

# Map the ERA5 short names onto IEC -25-2 style column names
# (presumably IEC 61400-25-2 - confirm)
iec_names = {
    'u10': 'HorWdU_10m',
    'v10': 'HorWdV_10m',
    'u100': 'HorWdU_100m',
    'v100': 'HorWdV_100m',
    't2m': 'EnvTmp_2m',
    'sp': 'EnvPres_0m',
}
polars_df = polars_df.rename(iec_names)


In [173]:
# Widen the table rendering so the whole summary fits on screen
pl.Config.set_tbl_cols(100)  # maximum number of columns to print
pl.Config.set_tbl_width_chars(200)  # maximum table width, in characters

# Summary statistics for every column
summary = polars_df.describe()
print(summary)

shape: (9, 10)
┌────────────┬────────────────────────────┬──────────┬───────────┬────────────┬────────────┬────────────┬───────────────┬─────────────┬─────────────┐
│ statistic  ┆ valid_time                 ┆ latitude ┆ longitude ┆ EnvTmp_2m  ┆ HorWdU_10m ┆ HorWdV_10m ┆ EnvPres_0m    ┆ HorWdU_100m ┆ HorWdV_100m │
│ ---        ┆ ---                        ┆ ---      ┆ ---       ┆ ---        ┆ ---        ┆ ---        ┆ ---           ┆ ---         ┆ ---         │
│ str        ┆ str                        ┆ f64      ┆ f64       ┆ f64        ┆ f64        ┆ f64        ┆ f64           ┆ f64         ┆ f64         │
╞════════════╪════════════════════════════╪══════════╪═══════════╪════════════╪════════════╪════════════╪═══════════════╪═════════════╪═════════════╡
│ count      ┆ 35136                      ┆ 35136.0  ┆ 35136.0   ┆ 35136.0    ┆ 35136.0    ┆ 35136.0    ┆ 35136.0       ┆ 35136.0     ┆ 35136.0     │
│ null_count ┆ 0                          ┆ 0.0      ┆ 0.0       ┆ 0.0        ┆ 0.0  

In [174]:
# Preview the first rows of the combined table
first_rows = polars_df.head()
print(first_rows)

shape: (5, 9)
┌─────────────────────┬──────────┬───────────┬────────────┬────────────┬────────────┬───────────────┬─────────────┬─────────────┐
│ valid_time          ┆ latitude ┆ longitude ┆ EnvTmp_2m  ┆ HorWdU_10m ┆ HorWdV_10m ┆ EnvPres_0m    ┆ HorWdU_100m ┆ HorWdV_100m │
│ ---                 ┆ ---      ┆ ---       ┆ ---        ┆ ---        ┆ ---        ┆ ---           ┆ ---         ┆ ---         │
│ datetime[ns]        ┆ f64      ┆ f64       ┆ f32        ┆ f32        ┆ f32        ┆ f32           ┆ f32         ┆ f32         │
╞═════════════════════╪══════════╪═══════════╪════════════╪════════════╪════════════╪═══════════════╪═════════════╪═════════════╡
│ 2020-01-01 00:00:00 ┆ 52.25    ┆ -1.0      ┆ 279.213928 ┆ -3.089966  ┆ 1.042038   ┆ 101720.429688 ┆ -5.053909   ┆ 2.368256    │
│ 2020-01-01 00:00:00 ┆ 52.25    ┆ -0.75     ┆ 279.210022 ┆ -3.121216  ┆ 1.08403    ┆ 101906.429688 ┆ -5.1418     ┆ 2.460052    │
│ 2020-01-01 00:00:00 ┆ 52.5     ┆ -1.0      ┆ 278.844788 ┆ -3.275513  ┆ 0.6

In [None]:
# Export the combined table to an 'output' folder under the current working directory
cwd = Path.cwd()
output_dir = cwd / 'output'
output_dir.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing

# Write the same data once as CSV and once as Parquet
csv_path = output_dir / 'era5_data.csv'
parquet_path = output_dir / 'era5_data.parquet'
polars_df.write_csv(csv_path, datetime_format='%Y-%m-%d %H:%M:%S')
polars_df.write_parquet(parquet_path)