# Preparing past buoy data and reanalyses for use in model training

Concatenate all past buoy data into a single dataframe

This section collects all of the cleaned buoy data and combines them into a single dataframe. A column representing the day of year (DOY) as an integer is also added. These data will be used (along with weather reanalyses) as training data for the machine learning model. Buoys deployed outside of the Arctic (i.e. whose earliest recorded position is south of 64 degrees N) are also removed.

In [None]:
# Concatenate the cleaned buoy CSV files into a single DataFrame and add a new column with the Day of Year (DOY) as an integer

import pandas as pd
import glob
import os

# Define the path to the folder containing the CSV files
folder_path = '../data/cleaned/buoydata/past'

# Buoys whose earliest fix lies south of this latitude (degrees N) are dropped
MIN_DEPLOYMENT_LATITUDE = 64

# Use glob to get all CSV files in the folder. The list is sorted because glob
# returns files in arbitrary order, which would otherwise make the row order of
# the combined frame differ from run to run.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
if not csv_files:
    # Fail with a clear message instead of the opaque error pd.concat raises
    # when it is handed an empty list.
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Read every CSV file and concatenate them into a single DataFrame
dfs = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the lat and lon columns to Latitude and Longitude
combined_df.rename(columns={'Lat': 'Latitude', 'Lon': 'Longitude'}, inplace=True)

# Pad Month, Day, Hour, Min, and Sec columns with leading zeros
# (these columns hold strings from here on)
for part in ('Month', 'Day', 'Hour', 'Min', 'Sec'):
    combined_df[part] = combined_df[part].apply(lambda x: f'{x:02}')

# Create a new column called datetime by combining Year, Month, Day, Hour, Min, and Sec columns
date_text = (combined_df['Year'].astype(str) + '-' +
             combined_df['Month'].astype(str) + '-' +
             combined_df['Day'].astype(str))
time_text = (combined_df['Hour'].astype(str) + ':' +
             combined_df['Min'].astype(str) + ':' +
             combined_df['Sec'].astype(str))
combined_df['datetime'] = pd.to_datetime(date_text + ' ' + time_text)

# Add a new column with the Day of Year (DOY) as an integer
combined_df['DOY'] = combined_df['datetime'].dt.dayofyear

# Remove every buoy whose earliest record lies south of the latitude threshold.
# The earliest record per buoy is found in a single pass (sort by time, keep the
# first row per BuoyID) rather than re-filtering the whole frame once per buoy.
# Rows without a BuoyID are left out of the check and therefore kept, matching
# how groupby skips missing keys.
first_fixes = (
    combined_df.dropna(subset=['BuoyID'])
    .sort_values(by='datetime', kind='stable')
    .drop_duplicates(subset='BuoyID', keep='first')
)
non_arctic_ids = first_fixes.loc[
    first_fixes['Latitude'] < MIN_DEPLOYMENT_LATITUDE, 'BuoyID'
]

# .copy() makes the filtered frame independent, so columns added to it later
# are not assignments on a slice of the unfiltered frame.
combined_df = combined_df[~combined_df['BuoyID'].isin(non_arctic_ids)].copy()

Interpolate ERA5 to buoy data (nearest-neighbour match in time and space)

In [None]:
import pandas as pd
import numpy as np
import netCDF4 as nc
from scipy.spatial import cKDTree
from datetime import datetime, timezone

# Paths to the ERA5 u- and v-component wind NetCDF files
uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'


def _nearest_index(grid, values):
    """Return, for each entry of `values`, the index of the closest `grid` entry.

    `grid` is a 1-D coordinate array in any order (it is sorted internally) and
    `values` is a 1-D array of finite query values. Exact ties resolve to the
    smaller coordinate.
    """
    grid = np.asarray(grid, dtype='float64')
    values = np.asarray(values, dtype='float64')
    if grid.size == 1:
        return np.zeros(values.shape, dtype=np.intp)
    order = np.argsort(grid, kind='stable')
    sorted_grid = grid[order]
    # Bracket each value between two neighbouring grid coordinates; clipping
    # keeps the bracket inside the array for values beyond either end.
    right = np.clip(np.searchsorted(sorted_grid, values), 1, grid.size - 1)
    left = right - 1
    use_left = (values - sorted_grid[left]) <= (sorted_grid[right] - values)
    return order[np.where(use_left, left, right)]


# Load the NetCDF files. The context manager closes both files once the arrays
# have been read into memory.
with nc.Dataset(uwnd_nc_file_path) as uwnd_ds, nc.Dataset(vwnd_nc_file_path) as vwnd_ds:
    # Extract the valid_time, latitudes, longitudes, and wind values.
    # NOTE(review): valid_time is compared directly with Unix timestamps below,
    # so it is assumed to be in seconds since 1970-01-01 UTC - confirm against
    # the variable's `units` attribute.
    valid_time = np.asarray(uwnd_ds.variables['valid_time'][:], dtype='float64')
    latitudes = np.asarray(uwnd_ds.variables['latitude'][:], dtype='float64')
    longitudes = np.asarray(uwnd_ds.variables['longitude'][:], dtype='float64')
    # Index 0 on the second axis removes the pressure-level dimension
    uwnd_array = uwnd_ds.variables['u'][:, 0, :, :]
    vwnd_array = vwnd_ds.variables['v'][:, 0, :, :]

# Add a column to the dataframe called "timestamp": whole seconds since the Unix
# epoch, treating the (timezone-naive) datetime values as UTC
combined_df['timestamp'] = (
    (combined_df['datetime'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)

# Add new columns to combined_df for the u-component and v-component wind values
combined_df['era5_uwnd'] = np.nan
combined_df['era5_vwnd'] = np.nan

# Check the shape of the uwnd_array
print(f"uwnd_array shape: {uwnd_array.shape}")
print(f"vwnd_array shape: {vwnd_array.shape}")

timestamps = combined_df['timestamp'].to_numpy(dtype='float64')
buoy_lats = combined_df['Latitude'].to_numpy(dtype='float64')
buoy_lons = combined_df['Longitude'].to_numpy(dtype='float64')

# Express buoy longitudes in the grid's convention. A grid with values above
# 180 is taken to use 0..360; otherwise -180..180 is assumed.
# NOTE(review): confirm which convention the buoy files and ERA5 subset use.
if longitudes.max() > 180.0:
    grid_lons = np.where(buoy_lons < 0.0, buoy_lons + 360.0, buoy_lons)
else:
    grid_lons = np.where(buoy_lons > 180.0, buoy_lons - 360.0, buoy_lons)

# Only rows inside the time span of the reanalysis file (padded by half a time
# step at each end) get a wind value. The nearest time step always exists, so
# without this check a record from outside the file's period would silently
# receive the wind of the first or last time step.
if valid_time.size > 1:
    half_step = np.median(np.diff(np.sort(valid_time))) / 2.0
else:
    half_step = 0.0
in_coverage = (
    (timestamps >= valid_time.min() - half_step) &
    (timestamps <= valid_time.max() + half_step)
)
usable = in_coverage & np.isfinite(buoy_lats) & np.isfinite(grid_lons)

# On a regular latitude/longitude grid the nearest cell (Euclidean distance in
# degrees) is the nearest latitude combined with the nearest longitude, so the
# three axes are matched independently and for all rows at once.
time_idx = _nearest_index(valid_time, timestamps[usable])
lat_idx = _nearest_index(latitudes, buoy_lats[usable])
lon_idx = _nearest_index(longitudes, grid_lons[usable])

# Assign the corresponding u and v values; masked grid cells become NaN
combined_df.loc[usable, 'era5_uwnd'] = np.ma.filled(
    uwnd_array[time_idx, lat_idx, lon_idx].astype('float64'), np.nan
)
combined_df.loc[usable, 'era5_vwnd'] = np.ma.filled(
    vwnd_array[time_idx, lat_idx, lon_idx].astype('float64'), np.nan
)

skipped_rows = int((~usable).sum())
if skipped_rows:
    print(f"Skipped {skipped_rows} rows with a timestamp outside the ERA5 time range "
          "or a missing position; their wind values are left as NaN")

# Drop the timestamp column from the dataframe
combined_df.drop(columns=['timestamp'], inplace=True)

# Print the dataframe head
print(combined_df.head())

# Print a message saying the script has completed
print("The ERA5 wind assignment script has completed.")

Interpolate IBCAO v5 bathymetry to buoy data

NOTE: These data are not currently used in model training. Time and hardware constraints necessitated skipping this variable in later steps, but the cell is kept here as a placeholder for future work.

In [None]:
import pandas as pd
import numpy as np
import rasterio
from pyproj import Transformer

# Load the georeferenced raster file
raster_path = '../data/raw/geospatial/ibcao_v5_2024_100m_depth.tiff'

# Everything needed from the raster is read into memory here, so the file can
# be closed before any values are sampled.
with rasterio.open(raster_path) as raster:
    raster_data = raster.read(1)  # Load raster band data
    transform = raster.transform  # Affine transformation matrix
    nodata = raster.nodata  # NoData value for the raster

    # Create a transformer to convert coordinates from WGS 1984 (EPSG:4326) to
    # the raster's CRS (expected to be EPSG:3996)
    transformer = Transformer.from_crs("EPSG:4326", raster.crs, always_xy=True)


def get_row_col(x, y, transform):
    """Return the (row, col) raster indices of the cell containing point (x, y).

    `x` and `y` are coordinates in the raster's CRS and `transform` is the
    raster's affine transform. Floor is used instead of truncation so that a
    point just outside the top or left edge gets a negative index rather than
    being folded into row/column 0.
    """
    col, row = ~transform * (x, y)
    return int(np.floor(row)), int(np.floor(col))


def get_raster_values(latitudes, longitudes):
    """Sample the raster at each latitude/longitude pair.

    Parameters:
        latitudes, longitudes: equal-length 1-D arrays of WGS 1984 coordinates
            in degrees.

    Returns:
        A float array with the same shape as `latitudes` holding the raster
        value at each position. The value is NaN where the position is missing,
        cannot be projected, falls outside the raster, or hits the raster's
        nodata value.
    """
    latitudes = np.asarray(latitudes, dtype='float64')
    longitudes = np.asarray(longitudes, dtype='float64')
    values = np.full(latitudes.shape, np.nan)  # Initialize output array with NaN
    if latitudes.size == 0:
        return values

    # Transform WGS 1984 coordinates to the raster CRS
    x_coords, y_coords = transformer.transform(longitudes, latitudes)
    x_coords = np.asarray(x_coords, dtype='float64')
    y_coords = np.asarray(y_coords, dtype='float64')

    # Apply the inverse affine transform to all points at once to get
    # fractional column/row positions
    inverse = ~transform
    col_pos = inverse.a * x_coords + inverse.b * y_coords + inverse.c
    row_pos = inverse.d * x_coords + inverse.e * y_coords + inverse.f

    # Non-finite positions (missing input or failed projection) cannot be
    # converted to integer indices, so they keep the placeholder -1 and are
    # excluded by the bounds check below.
    finite = np.isfinite(col_pos) & np.isfinite(row_pos)
    rows = np.full(latitudes.shape, -1, dtype=np.intp)
    cols = np.full(latitudes.shape, -1, dtype=np.intp)
    rows[finite] = np.floor(row_pos[finite]).astype(np.intp)
    cols[finite] = np.floor(col_pos[finite]).astype(np.intp)

    # Ensure indices are within bounds
    valid_mask = (
        (rows >= 0) & (rows < raster_data.shape[0]) &
        (cols >= 0) & (cols < raster_data.shape[1])
    )
    values[valid_mask] = raster_data[rows[valid_mask], cols[valid_mask]]

    # Replace nodata values with NaN
    if nodata is not None:
        values[values == nodata] = np.nan
    return values


# Assuming combined_df is already defined and contains 'Latitude' and 'Longitude' columns
# Example: combined_df = pd.read_csv('path_to_your_combined_df.csv')

# Extract raster values for all lat/lon pairs
combined_df['IBCAOv5_bathymetry'] = get_raster_values(
    combined_df['Latitude'].values,
    combined_df['Longitude'].values
)


Add more data to the spreadsheet (wind vector and displacement/heading columns) and extract only the columns needed for training to a new spreadsheet.

In [None]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle  # For calculating displacement
import math  # For trigonometric calculations

print("Extracting necessary columns...")

# Keep only the fields used from here on. The selection is copied so that the
# column assignments below act on an independent frame.
columns_to_extract = ['Latitude', 'Longitude', 'BuoyID', 'datetime', 'era5_uwnd', 'era5_vwnd', 'IBCAOv5_bathymetry']
combined_df = combined_df.loc[:, columns_to_extract].copy()
print("Columns extracted successfully.")
print(combined_df.head())

print("Calculating wind magnitude and wind angle...")

# Wind magnitude is the length of the (u, v) vector; wind angle is
# atan2(v, u) converted to degrees.
u_wind = combined_df['era5_uwnd']
v_wind = combined_df['era5_vwnd']
combined_df['wind_magnitude'] = np.sqrt(u_wind**2 + v_wind**2)
combined_df['wind_angle'] = np.degrees(np.arctan2(v_wind, u_wind))

print("Wind magnitude and wind angle calculated successfully.")
print(combined_df.head())

print("Displaying the first few rows of the preprocessed data:")

# Show a preview of the preprocessed data
print(combined_df.head())

print("Calculating displacement and heading...")

# Create the displacement and heading columns, starting at zero for every row
combined_df['displacement'] = 0.0
combined_df['heading'] = 0.0

# Function to calculate displacement and heading for each group
def calculate_displacement_and_heading(group):
    """Fill in displacement and heading between consecutive records of one group.

    The group is sorted by 'datetime' and re-indexed from 0. For every row
    after the first, 'displacement' is set to the great-circle distance in
    meters from the previous row's position, and 'heading' to the initial
    compass bearing (degrees in [0, 360)) from the previous position to the
    current one. The first row is not written to.

    Returns the sorted, re-indexed group; the frame passed in is not modified.
    """

    def _compass_bearing(start, end):
        # Initial bearing from `start` to `end`, both (latitude, longitude) in
        # degrees, normalised to the range [0, 360).
        start_lat, start_lon = (math.radians(value) for value in start)
        end_lat, end_lon = (math.radians(value) for value in end)
        delta_lon = end_lon - start_lon
        east = math.sin(delta_lon) * math.cos(end_lat)
        north = (math.cos(start_lat) * math.sin(end_lat)
                 - (math.sin(start_lat) * math.cos(end_lat) * math.cos(delta_lon)))
        bearing = math.degrees(math.atan2(east, north))
        return (bearing + 360) % 360

    ordered = group.sort_values(by='datetime').reset_index(drop=True)
    row_count = len(ordered)

    # Walk over consecutive (previous, current) row pairs
    for previous, current in zip(range(row_count - 1), range(1, row_count)):
        prev_point = (ordered.at[previous, 'Latitude'], ordered.at[previous, 'Longitude'])
        curr_point = (ordered.at[current, 'Latitude'], ordered.at[current, 'Longitude'])

        ordered.loc[current, 'displacement'] = great_circle(prev_point, curr_point).meters
        ordered.loc[current, 'heading'] = _compass_bearing(prev_point, curr_point)

    return ordered

# Apply the function to each buoy's records. The groups are iterated explicitly
# instead of going through groupby(...).apply(...): newer pandas releases
# deprecate passing the grouping column to the applied function, and once it is
# excluded the BuoyID column would be lost when the index is reset. Iterating
# over the groupby always yields the full rows, BuoyID included.
buoy_tracks = [
    calculate_displacement_and_heading(group)
    for _, group in combined_df.groupby('BuoyID')
]
if buoy_tracks:
    combined_df = pd.concat(buoy_tracks, ignore_index=True)
else:
    # No groups (empty frame or no usable BuoyID): keep the columns, no rows
    combined_df = combined_df.iloc[0:0].reset_index(drop=True)

print("Displacement and heading calculated successfully.")
print(combined_df.head())

# Save the processed combined_df back to the spreadsheet
output_csv_path = '../combined_buoy_data.csv'
combined_df.to_csv(output_csv_path, index=False)
print(f"Processed buoy data saved to {output_csv_path}.")

Cleaned buoy data geospatial bounds confirmation

This cell will analyze and display the minimum and maximum values of the latitude and longitude fields of the data. 

In [None]:
# Confirm the latitude and longitude ranges

# Pull the two coordinate columns once, then take the extremes of each
latitude_column = combined_df['Latitude']
longitude_column = combined_df['Longitude']

min_latitude, max_latitude = latitude_column.min(), latitude_column.max()
min_longitude, max_longitude = longitude_column.min(), longitude_column.max()

# Report the geographic extent covered by the data
print(f"Latitude: min = {min_latitude}, max = {max_latitude}")
print(f"Longitude: min = {min_longitude}, max = {max_longitude}")