# Preparing past buoy data and reanalyses for use in model training

Concatenate all past buoy data into a single dataframe

This section will collect all of the cleaned buoy data and combine them into a single dataframe. A column to represent the day of year (DOY) as an integer is also added. These data will be used (along with weather reanalyses) as training data for the machine learning model. Also removes buoys deployed outside of the arctic (<64 degrees N)

In [9]:
# Concatenate all raw buoy CSV files into a single DataFrame and add a new column with the Day of Year (DOY) as an integer

import pandas as pd
import glob
import os

# Define the path to the folder containing the CSV files
folder_path = '../data/cleaned/buoydata/past'

# Use glob to get all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the list of CSV files and read each one into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the lat and lon columns to Latitude and Longitude
combined_df.rename(columns={'Lat': 'Latitude', 'Lon': 'Longitude'}, inplace=True)

# Pad Month, Day, Hour, Min, and Sec columns with leading zeros
combined_df['Month'] = combined_df['Month'].apply(lambda x: f'{x:02}')
combined_df['Day'] = combined_df['Day'].apply(lambda x: f'{x:02}')
combined_df['Hour'] = combined_df['Hour'].apply(lambda x: f'{x:02}')
combined_df['Min'] = combined_df['Min'].apply(lambda x: f'{x:02}')
combined_df['Sec'] = combined_df['Sec'].apply(lambda x: f'{x:02}')

# Create a new column called datetime by combining Year, Month, Day, Hour, Min, and Sec columns
combined_df['datetime'] = pd.to_datetime(combined_df['Year'].astype(str) + '-' +
                                         combined_df['Month'].astype(str) + '-' +
                                         combined_df['Day'].astype(str) + ' ' +
                                         combined_df['Hour'].astype(str) + ':' +
                                         combined_df['Min'].astype(str) + ':' +
                                         combined_df['Sec'].astype(str))

# Add a new column with the Day of Year (DOY) as an integer
combined_df['DOY'] = combined_df['datetime'].dt.dayofyear

# Iterate through the combined_df by BuoyID
for buoy_id, group in combined_df.groupby('BuoyID'):
    # Sort the records for each BuoyID by datetime from oldest to newest
    group = group.sort_values(by='datetime')
    
    # Check if the first row of the sorted data has a latitude value less than 64
    if group.iloc[0]['Latitude'] < 64:
        # Remove the entire BuoyID from the dataset
        combined_df = combined_df[combined_df['BuoyID'] != buoy_id]

Interpolate ERA5 to buoy data

In [10]:
import pandas as pd
import numpy as np
import netCDF4 as nc
from scipy.spatial import cKDTree
from datetime import datetime, timezone

# Define paths to the NetCDF files and their corresponding variables
netcdf_files = {
    'ERA5_10m_u_component_of_wind': '../data/raw/reanalyses/ERA5/era5_10m_u_component_of_wind_2023.nc',
    'ERA5_10m_v_component_of_wind': '../data/raw/reanalyses/ERA5/era5_10m_v_component_of_wind_2023.nc',
    'ERA5_100m_u_component_of_wind': '../data/raw/reanalyses/ERA5/era5_100m_u_component_of_wind_2023.nc',
    'ERA5_100m_v_component_of_wind': '../data/raw/reanalyses/ERA5/era5_100m_v_component_of_wind_2023.nc',
    'ERA5_sea_ice_cover': '../data/raw/reanalyses/ERA5/era5_sea_ice_cover_2023.nc',
}

# Load the combined DataFrame
# combined_df should already exist with 'datetime', 'Latitude', and 'Longitude' columns
# Example:
# combined_df = pd.read_csv('path_to_combined_df.csv')

# Add a timestamp column to combined_df
combined_df['timestamp'] = combined_df['datetime'].apply(lambda x: int(x.replace(tzinfo=timezone.utc).timestamp()))

# Iterate over each variable and interpolate values
for variable, file_path in netcdf_files.items():
    print(f"Processing variable: {variable}")
    
    # Open the NetCDF file
    ds = nc.Dataset(file_path)
    
    # Extract the necessary variables
    valid_time = ds.variables['valid_time'][:]
    latitudes = ds.variables['latitude'][:]
    longitudes = ds.variables['longitude'][:]
    data_array = ds.variables[list(ds.variables.keys())[-1]][:]  # Assuming last variable is the data

    # Check dimensions and adjust for 3D arrays
    if len(data_array.shape) == 4:  # Time, Level, Lat, Lon
        data_array = data_array[:, 0, :, :]  # Take the first level

    # Create a KDTree for spatial lookup
    lat_lon_pairs = np.array([(lat, lon) for lat in latitudes for lon in longitudes])
    tree = cKDTree(lat_lon_pairs)

    # Add a new column for the variable in combined_df
    combined_df[variable] = np.nan

    # Iterate through each row in the DataFrame
    for index, row in combined_df.iterrows():
        # Find the closest time index
        timestamp = row['timestamp']
        time_diffs = np.abs(valid_time - timestamp)
        closest_time_index = np.argmin(time_diffs)

        # Skip if out of bounds
        if closest_time_index < 0 or closest_time_index >= data_array.shape[0]:
            print(f"Skipping row {index} due to time out of bounds")
            continue

        # Extract the corresponding slice
        data_slice = data_array[closest_time_index, :, :]

        # Find the closest grid point
        lat_lon = (row['Latitude'], row['Longitude'])
        _, closest_point_index = tree.query(lat_lon)
        closest_lat, closest_lon = lat_lon_pairs[closest_point_index]

        # Find indices of the closest latitude and longitude
        lat_index = np.where(latitudes == closest_lat)[0][0]
        lon_index = np.where(longitudes == closest_lon)[0][0]

        # Assign the interpolated value to the DataFrame
        combined_df.at[index, variable] = data_slice[lat_index, lon_index]

    print(f"Finished processing {variable}")

# Save the updated DataFrame
# combined_df.to_csv('path_to_updated_combined_df.csv', index=False)

# Print completion message
print("All variables have been processed and interpolated.")

Processing variable: ERA5_10m_u_component_of_wind
Finished processing ERA5_10m_u_component_of_wind
Processing variable: ERA5_10m_v_component_of_wind
Finished processing ERA5_10m_v_component_of_wind
Processing variable: ERA5_100m_u_component_of_wind
Finished processing ERA5_100m_u_component_of_wind
Processing variable: ERA5_100m_v_component_of_wind
Finished processing ERA5_100m_v_component_of_wind
Processing variable: ERA5_sea_ice_cover


  arr[indexer] = value


Finished processing ERA5_sea_ice_cover
All variables have been processed and interpolated.


Interpolate IBCAO v5 bathymetry to buoy data

In [11]:
import pandas as pd
import numpy as np
import rasterio
from pyproj import Transformer

# Load the georeferenced raster file
raster_path = '../data/raw/geospatial/ibcao_v5_2024_100m_depth.tiff'

with rasterio.open(raster_path) as raster:
    # Get raster metadata
    raster_data = raster.read(1)  # Load raster band data
    transform = raster.transform  # Affine transformation matrix
    nodata = raster.nodata  # NoData value for the raster

    # Create a transformer to convert coordinates from WGS 1984 (EPSG:4326) to the raster's CRS (EPSG:3996)
    transformer = Transformer.from_crs("EPSG:4326", raster.crs, always_xy=True)

    # Function to get row and column indices for latitude and longitude
    def get_row_col(x, y, transform):
        col, row = ~transform * (x, y)
        return int(row), int(col)

    # Create a function to get raster values using numpy indexing
    def get_raster_values(latitudes, longitudes):
        # Transform WGS 1984 coordinates to the raster CRS
        transformed_coords = transformer.transform(longitudes, latitudes)
        x_coords, y_coords = transformed_coords

        # Get row and column indices
        rows, cols = zip(*[get_row_col(x, y, transform) for x, y in zip(x_coords, y_coords)])
        rows = np.array(rows)
        cols = np.array(cols)

        # Ensure indices are within bounds
        valid_mask = (
            (rows >= 0) & (rows < raster_data.shape[0]) &
            (cols >= 0) & (cols < raster_data.shape[1])
        )
        values = np.full(latitudes.shape, np.nan)  # Initialize output array with NaN
        values[valid_mask] = raster_data[rows[valid_mask], cols[valid_mask]]

        # Replace nodata values with NaN
        if nodata is not None:
            values[values == nodata] = np.nan
        return values

# Assuming combined_df is already defined and contains 'Latitude' and 'Longitude' columns
# Example: combined_df = pd.read_csv('path_to_your_combined_df.csv')

    # Extract raster values for all lat/lon pairs
    combined_df['IBCAOv5_bathymetry'] = get_raster_values(
        combined_df['Latitude'].values,
        combined_df['Longitude'].values
    )


In [13]:
combined_df.head()

Unnamed: 0,BuoyID,Year,Month,Day,Hour,Min,Sec,Latitude,Longitude,GPSdelay,...,Batt,datetime,DOY,timestamp,ERA5_10m_u_component_of_wind,ERA5_10m_v_component_of_wind,ERA5_100m_u_component_of_wind,ERA5_100m_v_component_of_wind,ERA5_sea_ice_cover,IBCAOv5_bathymetry
0,300025010734900,2023,8,7,0,7,32,77.3374,-138.15785,0,...,13,2023-08-07 00:07:32,219,1691366852,2.13327,-1.722595,2.60524,-2.824326,0.48999,-3700.891113
1,300025010734900,2023,8,7,0,51,5,77.33538,-138.13705,0,...,13,2023-08-07 00:51:05,219,1691369465,1.552002,-2.264008,1.641525,-3.23082,0.48999,-3691.111084
2,300025010734900,2023,8,7,1,1,40,77.33479,-138.13317,0,...,13,2023-08-07 01:01:40,219,1691370100,1.552002,-2.264008,1.641525,-3.23082,0.48999,-3697.583984
3,300025010734900,2023,8,7,2,1,21,77.33148,-138.1195,0,...,13,2023-08-07 02:01:21,219,1691373681,1.249054,-2.76796,1.250229,-3.651215,0.47995,-3698.184082
4,300025010734900,2023,8,7,3,1,11,77.32867,-138.12018,0,...,13,2023-08-07 03:01:11,219,1691377271,0.722733,-3.224045,0.595322,-4.391693,0.47995,-3699.036377


Add more data to the spreadsheet (wind vector and displacement/heading columns)

In [16]:
import numpy as np
from geopy.distance import great_circle  # For calculating displacement
import math  # For trigonometric calculations

def add_new_columns(combined_df):
    print("Calculating wind magnitude and wind angle...")

    # Calculate wind magnitude and wind angle
    combined_df.loc[:, 'ERA5_wind_magnitude_10m'] = np.sqrt(combined_df['ERA5_10m_u_component_of_wind']**2 + combined_df['ERA5_10m_v_component_of_wind']**2)
    combined_df.loc[:, 'ERA5_wind_angle_10m'] = np.degrees(np.arctan2(combined_df['ERA5_10m_v_component_of_wind'], combined_df['ERA5_10m_u_component_of_wind']))
    combined_df.loc[:, 'ERA5_wind_magnitude_100m'] = np.sqrt(combined_df['ERA5_100m_u_component_of_wind']**2 + combined_df['ERA5_100m_v_component_of_wind']**2)
    combined_df.loc[:, 'ERA5_wind_angle_100m'] = np.degrees(np.arctan2(combined_df['ERA5_100m_v_component_of_wind'], combined_df['ERA5_100m_u_component_of_wind']))
    
    print("Wind magnitude and wind angle calculated successfully.")
    print(combined_df.head())

    print("Calculating displacement, heading, and time to next position...")

    # Initialize displacement, heading, and time to next position columns
    combined_df.loc[:, 'displacement'] = 0.0
    combined_df.loc[:, 'heading'] = 0.0
    combined_df.loc[:, 'time_to_next_position'] = 0.0

    # Function to calculate displacement, heading, and time to next position for each group
    def calculate_displacement_and_heading(group):
        group = group.sort_values(by='datetime').reset_index(drop=True)
        for i in range(1, len(group)):
            # Ensure latitude and longitude values are passed as numeric arguments
            prev_point = (group.loc[i-1, 'Latitude'], group.loc[i-1, 'Longitude'])
            curr_point = (group.loc[i, 'Latitude'], group.loc[i, 'Longitude'])

            # Calculate displacement
            group.loc[i, 'displacement'] = great_circle(prev_point, curr_point).meters

            # Calculate heading
            lat1, lon1 = map(math.radians, prev_point)
            lat2, lon2 = map(math.radians, curr_point)

            dlon = lon2 - lon1
            x = math.sin(dlon) * math.cos(lat2)
            y = math.cos(lat1) * math.sin(lat2) - (math.sin(lat1) * math.cos(lat2) * math.cos(dlon))
            initial_heading = math.atan2(x, y)
            initial_heading = math.degrees(initial_heading)
            compass_heading = (initial_heading + 360) % 360

            group.loc[i, 'heading'] = compass_heading

            # Calculate time to next position
            time_diff = (group.loc[i, 'datetime'] - group.loc[i-1, 'datetime']).total_seconds()
            group.loc[i, 'time_to_next_position'] = time_diff
        return group

    # Apply the function to each group
    combined_df = combined_df.groupby('BuoyID').apply(calculate_displacement_and_heading).reset_index(drop=True)

    print("Displacement, heading, and time to next position calculated successfully.")
    print(combined_df.head())

    return combined_df

# Add new columns for wind magnitude, angle, displacement, heading, and time to next position
combined_df = add_new_columns(combined_df)


Calculating wind magnitude and wind angle...
Wind magnitude and wind angle calculated successfully.
            BuoyID  Year Month Day Hour Min Sec  Latitude  Longitude  \
0  300025010734900  2023    08  07   00  07  32  77.33740 -138.15785   
1  300025010734900  2023    08  07   00  51  05  77.33538 -138.13705   
2  300025010734900  2023    08  07   01  01  40  77.33479 -138.13317   
3  300025010734900  2023    08  07   02  01  21  77.33148 -138.11950   
4  300025010734900  2023    08  07   03  01  11  77.32867 -138.12018   

   GPSdelay  ...  wind_angle_10m  wind_magnitude_100m  wind_angle_100m  \
0         0  ...      -38.920541             3.842407       -47.310660   
1         0  ...      -55.568936             3.623921       -63.065641   
2         0  ...      -55.568936             3.623921       -63.065641   
3         0  ...      -65.712514             3.859332       -71.098052   
4         0  ...      -77.364915             4.431859       -82.280252   

   displacement  headi

  combined_df = combined_df.groupby('BuoyID').apply(calculate_displacement_and_heading).reset_index(drop=True)


Displacement, heading, and time to next position calculated successfully.
   BuoyID  Year Month Day Hour Min Sec  Latitude  Longitude  GPSdelay  ...  \
0  900115  2023    01  01   00  00  46  81.53036 -149.67551         0  ...   
1  900115  2023    01  01   00  30  47  81.53165 -149.68448         0  ...   
2  900115  2023    01  01   01  01  17  81.53296 -149.69345         0  ...   
3  900115  2023    01  01   01  31  03  81.53421 -149.70296         0  ...   
4  900115  2023    01  01   02  00  47  81.53523 -149.71284         0  ...   

   wind_angle_10m  wind_magnitude_100m  wind_angle_100m  displacement  \
0      157.049906            10.385153       151.275310      0.000000   
1      158.672519            10.493810       152.740733    205.312977   
2      158.672519            10.493810       152.740733    206.856929   
3      159.048017            10.716345       153.376829    208.707315   
4      159.048017            10.716345       153.376829    197.532749   

      heading  tim

In [17]:
# Save the combined_df to a CSV file
output_csv_path = 'interpolated_buoy_data.csv'
combined_df.to_csv(output_csv_path, index=False)
print(f"Interpolated buoy data saved to {output_csv_path}.")

Interpolated buoy data saved to interpolated_buoy_data.csv.


In [18]:
import pandas as pd
import numpy as np

# Pick a random BuoyID from combined_df
random_buoy_id = np.random.choice(combined_df['BuoyID'].unique())

# Filter the dataframe for the selected BuoyID
random_buoy_df = combined_df[combined_df['BuoyID'] == random_buoy_id]

# Save the filtered dataframe to a new CSV file
random_buoy_csv_path = f'random_buoy_{random_buoy_id}.csv'
random_buoy_df.to_csv(random_buoy_csv_path, index=False)

print(f"Data for BuoyID {random_buoy_id} saved to {random_buoy_csv_path}.")

Data for BuoyID 300234060330560 saved to random_buoy_300234060330560.csv.


In [None]:
import netCDF4 as nc
import numpy as np

# Define the timestamp for which we want to extract the slices
target_timestamp = 1693519200

# Define a dictionary to store the slices
slices = {}

# Iterate over each variable and extract the corresponding slice
for variable, file_path in netcdf_files.items():
    print(f"Extracting slice for variable: {variable}")
    
    # Open the NetCDF file
    ds = nc.Dataset(file_path)
    
    # Extract the necessary variables
    valid_time = ds.variables['valid_time'][:]
    data_array = ds.variables[list(ds.variables.keys())[-1]][:]  # Assuming last variable is the data

    # Check dimensions and adjust for 3D arrays
    if len(data_array.shape) == 4:  # Time, Level, Lat, Lon
        data_array = data_array[:, 0, :, :]  # Take the first level

    # Find the closest time index
    time_diffs = np.abs(valid_time - target_timestamp)
    closest_time_index = np.argmin(time_diffs)

    # Extract the corresponding slice
    data_slice = data_array[closest_time_index, :, :]

    # Store the slice in the dictionary
    slices[variable] = data_slice

    # Save the slice to a file for validation
    np.save(f'{variable}_slice_{target_timestamp}.npy', data_slice)

    print(f"Slice for {variable} saved successfully.")

# Print completion message
print("All slices have been extracted and saved for validation.")

Extracting slice for variable: ERA5_10m_u_component_of_wind
