# Preparing past buoy data and reanalyses for use in model training

Concatenate all past buoy data into a single dataframe

This section will collect all of the cleaned buoy data and combine them into a single dataframe. A column to represent the day of year (DOY) as an integer is also added. These data will be used (along with weather reanalyses) as training data for the machine learning model. Also removes buoys deployed outside of the arctic (<64 degrees N)

In [None]:
import pandas as pd
import glob
import os

# Define the path to the folder containing the CSV files
folder_path = '../data/cleaned/buoydata/past'

# Use glob to get all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the list of CSV files and read each one into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the lat and lon columns to Latitude and Longitude
combined_df.rename(columns={'Lat': 'Latitude', 'Lon': 'Longitude'}, inplace=True)

# Pad Month, Day, Hour, Min, and Sec columns with leading zeros
combined_df['Month'] = combined_df['Month'].apply(lambda x: f'{x:02}')
combined_df['Day'] = combined_df['Day'].apply(lambda x: f'{x:02}')
combined_df['Hour'] = combined_df['Hour'].apply(lambda x: f'{x:02}')
combined_df['Min'] = combined_df['Min'].apply(lambda x: f'{x:02}')
combined_df['Sec'] = combined_df['Sec'].apply(lambda x: f'{x:02}')

# Create a new column called datetime by combining Year, Month, Day, Hour, Min, and Sec columns
combined_df['datetime'] = pd.to_datetime(combined_df['Year'].astype(str) + '-' +
                                         combined_df['Month'].astype(str) + '-' +
                                         combined_df['Day'].astype(str) + ' ' +
                                         combined_df['Hour'].astype(str) + ':' +
                                         combined_df['Min'].astype(str) + ':' +
                                         combined_df['Sec'].astype(str))

# Add a new column with the Day of Year (DOY) as an integer
combined_df['DOY'] = combined_df['datetime'].dt.dayofyear

# Iterate through the combined_df by BuoyID
filtered_dfs = []
for buoy_id, group in combined_df.groupby('BuoyID'):
    # Sort the records for each BuoyID by datetime from oldest to newest
    group = group.sort_values(by='datetime')

    # Remove duplicate datetime values, keeping only the first occurrence
    group = group.drop_duplicates(subset=['datetime'], keep='first')

    # Append the filtered group to the list
    filtered_dfs.append(group)

# Concatenate the filtered groups back into a single DataFrame
combined_df = pd.concat(filtered_dfs, ignore_index=True)

Interpolate ERA5 to buoy data

In [8]:
import netCDF4 as nc
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from datetime import timezone

# Define paths to the NetCDF files and their corresponding variables
netcdf_files = {
    'ERA5_sea_ice_cover': '../data/raw/reanalyses/ERA5/era5_sea_ice_cover_2023.nc',
    'ERA5_10m_u_component_of_wind': '../data/raw/reanalyses/ERA5/era5_10m_u_component_of_wind_2023.nc',
    'ERA5_10m_v_component_of_wind': '../data/raw/reanalyses/ERA5/era5_10m_v_component_of_wind_2023.nc',
    'ERA5_100m_u_component_of_wind': '../data/raw/reanalyses/ERA5/era5_100m_u_component_of_wind_2023.nc',
    'ERA5_100m_v_component_of_wind': '../data/raw/reanalyses/ERA5/era5_100m_v_component_of_wind_2023.nc'
}

# Load the combined DataFrame
# combined_df should already exist with 'datetime', 'Latitude', and 'Longitude' columns
# Example:
# combined_df = pd.read_csv('path_to_combined_df.csv')

# Add a timestamp column to combined_df
combined_df['timestamp'] = combined_df['datetime'].apply(lambda x: int(x.replace(tzinfo=timezone.utc).timestamp()))

# Iterate over each variable and interpolate values
for variable, file_path in netcdf_files.items():
    print(f"Processing variable: {variable}")
    
    # Open the NetCDF file
    ds = nc.Dataset(file_path)
    
    # Extract the necessary variables
    valid_time = ds.variables['valid_time'][:]
    latitudes = ds.variables['latitude'][:]
    longitudes = ds.variables['longitude'][:]
    data_array = ds.variables[list(ds.variables.keys())[-1]][:]  # Assuming last variable is the data

    # Check dimensions and adjust for 3D arrays
    if len(data_array.shape) == 4:  # Time, Level, Lat, Lon
        data_array = data_array[:, 0, :, :]  # Take the first level

    # Create a KDTree for spatial lookup
    lat_lon_pairs = np.array([(lat, lon) for lat in latitudes for lon in longitudes])
    tree = cKDTree(lat_lon_pairs)

    # Add a new column for the variable in combined_df
    combined_df[variable] = np.nan

    # Iterate through each row in the DataFrame
    for index, row in combined_df.iterrows():
        # Find the closest time index
        timestamp = row['timestamp']
        time_diffs = np.abs(valid_time - timestamp)
        closest_time_index = np.argmin(time_diffs)

        # Skip if out of bounds
        if closest_time_index < 0 or closest_time_index >= data_array.shape[0]:
            print(f"Skipping row {index} due to time out of bounds")
            continue

        # Extract the corresponding slice
        data_slice = data_array[closest_time_index, :, :]

        # Find the closest grid point
        lat_lon = (row['Latitude'], row['Longitude'])
        _, closest_point_index = tree.query(lat_lon)
        closest_lat, closest_lon = lat_lon_pairs[closest_point_index]

        # Find indices of the closest latitude and longitude
        lat_index = np.where(latitudes == closest_lat)[0][0]
        lon_index = np.where(longitudes == closest_lon)[0][0]

        # Get the value and check if it's valid
        value = data_slice[lat_index, lon_index]
        if np.ma.is_masked(value) or np.isnan(value):
            # Find the nearest valid value by expanding the search
            found_valid_value = False
            for neighbor_index in range(2, len(lat_lon_pairs) + 1):
                _, expanded_indices = tree.query(lat_lon, k=neighbor_index)
                for expanded_index in expanded_indices:
                    nearest_lat, nearest_lon = lat_lon_pairs[expanded_index]
                    nearest_lat_index = np.where(latitudes == nearest_lat)[0][0]
                    nearest_lon_index = np.where(longitudes == nearest_lon)[0][0]
                    value = data_slice[nearest_lat_index, nearest_lon_index]
                    if not np.ma.is_masked(value) and not np.isnan(value):  # Found a valid value
                        found_valid_value = True
                        break
                if found_valid_value:
                    break

        # Assign the interpolated value to the DataFrame and round to 3 decimal places
        combined_df.at[index, variable] = round(value, 3) if not np.ma.is_masked(value) else np.nan

    print(f"Finished processing {variable}")

# Ensure all interpolated columns are rounded to 3 decimal places
combined_df[list(netcdf_files.keys())] = combined_df[list(netcdf_files.keys())].round(3)

# Save the updated DataFrame
# combined_df.to_csv('path_to_updated_combined_df.csv', index=False)

# Print completion message
print("All variables have been processed and interpolated.")

Processing variable: ERA5_sea_ice_cover
Finished processing ERA5_sea_ice_cover
Processing variable: ERA5_10m_u_component_of_wind
Finished processing ERA5_10m_u_component_of_wind
Processing variable: ERA5_10m_v_component_of_wind
Finished processing ERA5_10m_v_component_of_wind
Processing variable: ERA5_100m_u_component_of_wind
Finished processing ERA5_100m_u_component_of_wind
Processing variable: ERA5_100m_v_component_of_wind
Finished processing ERA5_100m_v_component_of_wind
All variables have been processed and interpolated.


In [9]:
# Check for NaNs in the DataFrame X
nan_counts = combined_df.isna().sum()

# Print the columns with NaNs and their counts
print("Columns with NaNs and their counts:")
print(nan_counts[nan_counts > 0])

Columns with NaNs and their counts:
Series([], dtype: int64)


Interpolate IBCAO v5 bathymetry to buoy data

In [10]:
import pandas as pd
import numpy as np
import rasterio
from pyproj import Transformer

# Load the georeferenced raster file
raster_path = '../data/raw/geospatial/ibcao_v5_2024_100m_depth.tiff'

with rasterio.open(raster_path) as raster:
    # Get raster metadata
    raster_data = raster.read(1)  # Load raster band data
    transform = raster.transform  # Affine transformation matrix
    nodata = raster.nodata  # NoData value for the raster

    # Create a transformer to convert coordinates from WGS 1984 (EPSG:4326) to the raster's CRS (EPSG:3996)
    transformer = Transformer.from_crs("EPSG:4326", raster.crs, always_xy=True)

    # Function to get row and column indices for latitude and longitude
    def get_row_col(x, y, transform):
        col, row = ~transform * (x, y)
        return int(row), int(col)

    # Create a function to get raster values using numpy indexing
    def get_raster_values(latitudes, longitudes):
        # Transform WGS 1984 coordinates to the raster CRS
        transformed_coords = transformer.transform(longitudes, latitudes)
        x_coords, y_coords = transformed_coords

        # Get row and column indices
        rows, cols = zip(*[get_row_col(x, y, transform) for x, y in zip(x_coords, y_coords)])
        rows = np.array(rows)
        cols = np.array(cols)

        # Ensure indices are within bounds
        valid_mask = (
            (rows >= 0) & (rows < raster_data.shape[0]) &
            (cols >= 0) & (cols < raster_data.shape[1])
        )
        values = np.full(latitudes.shape, np.nan)  # Initialize output array with NaN
        values[valid_mask] = raster_data[rows[valid_mask], cols[valid_mask]]

        # Replace nodata values with NaN
        if nodata is not None:
            values[values == nodata] = np.nan
        
        # Round extracted raster values to 3 decimal places
        return np.round(values, 3)

# Assuming combined_df is already defined and contains 'Latitude' and 'Longitude' columns
# Example: combined_df = pd.read_csv('path_to_your_combined_df.csv')

# Extract raster values for all lat/lon pairs and round them to 3 decimal places
combined_df['IBCAOv5_bathymetry'] = get_raster_values(
    combined_df['Latitude'].values,
    combined_df['Longitude'].values
)


Add more data to the spreadsheet (wind vector and displacement/heading columns)

In [11]:
import numpy as np
from geopy.distance import great_circle  # For calculating displacement
import math  # For trigonometric calculations

def add_new_columns(combined_df):
    print("Calculating wind magnitude and wind angle...")

    # Calculate wind magnitude and wind angle
    combined_df.loc[:, 'ERA5_wind_magnitude_10m'] = np.round(
        np.sqrt(combined_df['ERA5_10m_u_component_of_wind']**2 + combined_df['ERA5_10m_v_component_of_wind']**2), 3
    )
    combined_df.loc[:, 'ERA5_wind_angle_10m'] = np.round(
        np.degrees(np.arctan2(combined_df['ERA5_10m_v_component_of_wind'], combined_df['ERA5_10m_u_component_of_wind'])), 3
    )
    combined_df.loc[:, 'ERA5_wind_magnitude_100m'] = np.round(
        np.sqrt(combined_df['ERA5_100m_u_component_of_wind']**2 + combined_df['ERA5_100m_v_component_of_wind']**2), 3
    )
    combined_df.loc[:, 'ERA5_wind_angle_100m'] = np.round(
        np.degrees(np.arctan2(combined_df['ERA5_100m_v_component_of_wind'], combined_df['ERA5_100m_u_component_of_wind'])), 3
    )

    print("Wind magnitude and wind angle calculated successfully.")
    print(combined_df.head())

    print("Calculating displacement, heading, time differences, and velocity...")

    # Function to calculate displacement, heading, time differences, and velocity for each group
    def calculate_movement_metrics(group):
        group = group.sort_values(by='datetime').reset_index(drop=True)
        
        # Initialize new columns
        group.loc[:, 'displacement'] = np.nan
        group.loc[:, 'heading'] = np.nan
        group.loc[:, 'time_to_next_position'] = np.nan
        group.loc[:, 'time_to_last_position'] = np.nan
        group.loc[:, 'velocity'] = np.nan

        if len(group) > 1:
            for i in range(len(group)):
                if i < len(group) - 1:  # Compute for all but the last row
                    prev_point = (group.loc[i, 'Latitude'], group.loc[i, 'Longitude'])
                    next_point = (group.loc[i + 1, 'Latitude'], group.loc[i + 1, 'Longitude'])

                    # Calculate displacement
                    displacement = great_circle(prev_point, next_point).meters
                    group.loc[i, 'displacement'] = round(displacement, 3)

                    # Calculate heading
                    lat1, lon1 = map(math.radians, prev_point)
                    lat2, lon2 = map(math.radians, next_point)

                    dlon = lon2 - lon1
                    x = math.sin(dlon) * math.cos(lat2)
                    y = math.cos(lat1) * math.sin(lat2) - (math.sin(lat1) * math.cos(lat2) * math.cos(dlon))
                    initial_heading = math.atan2(x, y)
                    initial_heading = math.degrees(initial_heading)
                    compass_heading = (initial_heading + 360) % 360

                    group.loc[i, 'heading'] = round(compass_heading, 3)

                    # Calculate time to next position
                    time_diff = (group.loc[i + 1, 'datetime'] - group.loc[i, 'datetime']).total_seconds()
                    group.loc[i, 'time_to_next_position'] = round(time_diff, 3)

                if i > 0:  # Compute for all but the first row
                    prev_time_diff = (group.loc[i, 'datetime'] - group.loc[i - 1, 'datetime']).total_seconds()
                    group.loc[i, 'time_to_last_position'] = round(prev_time_diff, 3)

                    # Calculate velocity as displacement / time
                    if prev_time_diff > 0 and not np.isnan(group.loc[i-1, 'displacement']):
                        group.loc[i, 'velocity'] = round(group.loc[i-1, 'displacement'] / prev_time_diff, 3)

        return group

    # Apply the function to each group
    combined_df = combined_df.groupby('BuoyID', group_keys=False).apply(calculate_movement_metrics).reset_index(drop=True)

    print("Displacement, heading, time differences, and velocity calculated successfully.")
    print(combined_df.head())

    return combined_df

# Add new columns for wind magnitude, angle, displacement, heading, time differences, and velocity
combined_df = add_new_columns(combined_df)

Calculating wind magnitude and wind angle...
Wind magnitude and wind angle calculated successfully.
            BuoyID  Year Month Day Hour Min Sec  Latitude  Longitude  \
0  300025010734900  2023    08  07   00  07  32  77.33740 -138.15785   
1  300025010734900  2023    08  07   00  51  05  77.33538 -138.13705   
2  300025010734900  2023    08  07   01  01  40  77.33479 -138.13317   
3  300025010734900  2023    08  07   02  01  21  77.33148 -138.11950   
4  300025010734900  2023    08  07   03  01  11  77.32867 -138.12018   

   GPSdelay  ...  ERA5_sea_ice_cover  ERA5_10m_u_component_of_wind  \
0         0  ...                0.49                         2.133   
1         0  ...                0.49                         1.552   
2         0  ...                0.49                         1.552   
3         0  ...                0.48                         1.249   
4         0  ...                0.48                         0.723   

   ERA5_10m_v_component_of_wind  ERA5_100m_u_c

  combined_df = combined_df.groupby('BuoyID', group_keys=False).apply(calculate_movement_metrics).reset_index(drop=True)


Displacement, heading, time differences, and velocity calculated successfully.
   BuoyID  Year Month Day Hour Min Sec  Latitude  Longitude  GPSdelay  ...  \
0  900115  2023    01  01   00  00  46  81.53036 -149.67551         0  ...   
1  900115  2023    01  01   00  30  47  81.53165 -149.68448         0  ...   
2  900115  2023    01  01   01  01  17  81.53296 -149.69345         0  ...   
3  900115  2023    01  01   01  31  03  81.53421 -149.70296         0  ...   
4  900115  2023    01  01   02  00  47  81.53523 -149.71284         0  ...   

   IBCAOv5_bathymetry  ERA5_wind_magnitude_10m  ERA5_wind_angle_10m  \
0           -3545.648                    7.177              157.054   
1           -3530.164                    7.251              158.675   
2           -3511.190                    7.251              158.675   
3           -3493.298                    7.391              159.047   
4           -3475.697                    7.391              159.047   

   ERA5_wind_magnitude_10

In [13]:
# Save the combined_df to a CSV file
output_csv_path = '../data/processed/interpolated_buoy_data.csv'
combined_df.to_csv(output_csv_path, index=False)
print(f"Interpolated buoy data saved to {output_csv_path}.")

Interpolated buoy data saved to ../data/processed/interpolated_buoy_data.csv.
