### Raw buoy data cleaning
This cell cleans the raw buoy data: it removes rows with NAs in the Lat/Lon columns, removes duplicate rows, standardizes the format of lat/lon pairs, removes buoy locations that fall on land, and drops buoy tracks with fewer than 50 rows (these buoys are assumed to have been destroyed or to have suffered instrument failure).

The resulting cleaned data will be stored in the data/cleaned/buoydata/past folder.

In [2]:
import os
import pandas as pd
import rasterio
from pyproj import Transformer
import shutil
import numpy as np

# 1. Locations of the land raster, the raw inputs and the cleaned outputs
raster_path = '../data/raw/geospatial/arctic_land.tif'
input_directory = '../data/raw/buoydata/past'
output_directory = '../data/cleaned/buoydata/past'
removed_rows_output_path = '../data/cleaned/buoydata/removed_rows_past.csv'

# 2. Start every run from an empty output directory: wipe whatever a
#    previous run left behind, then (re)create the folder
if os.path.exists(output_directory):
    shutil.rmtree(output_directory)
os.makedirs(output_directory, exist_ok=True)

# 3. Accumulator for the rows that get removed, kept for later validation
removed_rows_df = pd.DataFrame()

# Coordinate transformer from EPSG:4326 to EPSG:3413; always_xy=True means
# it is called with (lon, lat) and returns (x, y)
transformer = Transformer.from_crs("epsg:4326", "epsg:3413", always_xy=True)

# Function to project coordinates from WGS 1984 to EPSG 3413
def project_to_epsg3413(lat, lon):
    """Project a lat/lon pair using the module-level ``transformer``.

    Args:
        lat: Latitude in degrees; must lie in [-90, 90].
        lon: Longitude in degrees; must lie in [-180, 180].

    Returns:
        The ``(x, y)`` pair returned by ``transformer.transform(lon, lat)``.
        If the inputs are out of range (NaN included), the projection
        yields an infinite coordinate, or anything raises along the way,
        the problem is printed and ``(np.nan, np.nan)`` is returned.
    """
    try:
        # Chained comparisons are False for NaN, so NaN inputs are rejected
        # here together with genuinely out-of-range values.
        within_bounds = (-90 <= lat <= 90) and (-180 <= lon <= 180)
        if not within_bounds:
            raise ValueError("Invalid latitude or longitude values")

        # The transformer is called in (lon, lat) order.
        x, y = transformer.transform(lon, lat)

        # An infinite coordinate is treated as a failed projection.
        if np.isinf(x) or np.isinf(y):
            raise ValueError("Projection resulted in inf values")
    except Exception as err:
        print(f"Projection error for lat={lat}, lon={lon}: {err}")
        return np.nan, np.nan
    else:
        return x, y

# Function to normalize latitude values
def normalize_lat(lat):
    """Fold a latitude in degrees back into the valid range [-90, 90].

    A value past a pole is reflected back across it (100 -> 80,
    -100 -> -80), and the reflection repeats until the result is in range.
    Only the latitude is adjusted; the matching longitude is not touched.

    Args:
        lat: Latitude in degrees.

    Returns:
        The latitude folded into [-90, 90]. Values already in range are
        returned unchanged. NaN stays NaN, and +/-inf yields NaN because no
        finite latitude can be derived from it.
    """
    # Common case: nothing to do, hand the value back untouched.
    if -90 <= lat <= 90:
        return lat

    # +/-inf can never be folded into range (the reflection just flips its
    # sign forever), so report it as missing instead of looping endlessly.
    if not np.isfinite(lat):
        return np.nan

    # The reflection pattern repeats every 360 degrees. Reducing with fmod
    # (exact for floats) bounds the loop below to two iterations and avoids
    # a hang on huge magnitudes where "lat - 360" no longer changes the value.
    lat = np.fmod(lat, 360.0)
    while lat < -90 or lat > 90:
        if lat < -90:
            lat = -180 - lat
        else:
            lat = 180 - lat
    return lat

# Function to normalize longitude values
def normalize_lon(lon):
    """Wrap a longitude in degrees into the range [-180, 180].

    Whole turns of 360 degrees are added or removed until the value is in
    range. Both endpoints are accepted as-is, so 180 stays 180 and -180
    stays -180.

    Args:
        lon: Longitude in degrees.

    Returns:
        The wrapped longitude. Values already in range are returned
        unchanged. NaN stays NaN, and +/-inf yields NaN because no finite
        longitude can be derived from it.
    """
    # Common case: nothing to do, hand the value back untouched.
    if -180 <= lon <= 180:
        return lon

    # Adding or subtracting 360 never brings +/-inf into range, so report
    # it as missing instead of looping endlessly.
    if not np.isfinite(lon):
        return np.nan

    # fmod strips whole turns exactly and in one step, leaving a value in
    # (-360, 360); at most one further shift is needed. This also avoids a
    # hang on huge magnitudes where "lon - 360" no longer changes the value.
    lon = np.fmod(lon, 360.0)
    if lon < -180:
        lon += 360
    elif lon > 180:
        lon -= 360
    return lon

# Open the raster file
with rasterio.open(raster_path) as src:
    # Read band 1 of the raster once; every point lookup below indexes into
    # this in-memory array.
    raster_data = src.read(1)
    affine_transform = src.transform

    def _is_land_cell(x, y):
        """Return True if the projected point (x, y) falls on a raster cell equal to 1.

        Points with missing coordinates, points that cannot be converted to
        a raster index, and points outside the raster extent all count as
        "not overlapping".
        """
        # A failed projection yields NaN coordinates; there is nothing to
        # look up for those.
        if pd.isna(x) or pd.isna(y):
            return False
        try:
            row, col = src.index(x, y)
        except ValueError:
            return False
        n_rows, n_cols = raster_data.shape
        if 0 <= row < n_rows and 0 <= col < n_cols:
            return bool(raster_data[row, col] == 1)
        return False

    # Function to check if a point overlaps with raster cells with value=1
    def overlaps_raster_value_one(lat, lon):
        """Return True if the lat/lon point projects onto a raster cell equal to 1.

        Not used by the loop below (which reuses the coordinates it has
        already projected); retained so any existing callers keep working.
        """
        if pd.isna(lat) or pd.isna(lon):
            return False
        x, y = project_to_epsg3413(lat, lon)
        return _is_land_cell(x, y)

    # Tracks with fewer rows than this after cleaning are discarded.
    min_track_rows = 50

    # Removed rows are collected per file and concatenated once after the
    # loop, instead of re-copying the growing frame on every iteration.
    removed_chunks = []

    # 4. Iterate through each file in the input directory (sorted so the
    #    processing order, and hence the removed-rows file, is reproducible)
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith('.csv'):
            continue

        input_file_path = os.path.join(input_directory, filename)

        # Read the CSV file
        df = pd.read_csv(input_file_path)

        # a./b. Remove rows with NAs in Lat or Lon, then duplicate rows.
        #       The explicit copy makes the column assignments below operate
        #       on an independent frame (no SettingWithCopyWarning).
        df_cleaned = df.dropna(subset=['Lat', 'Lon']).drop_duplicates().copy()

        # c. Normalize lat/lon values
        df_cleaned['Lat'] = df_cleaned['Lat'].apply(normalize_lat)
        df_cleaned['Lon'] = df_cleaned['Lon'].apply(normalize_lon)

        # d. Add columns with projected coordinates. Building plain lists
        #    (rather than unpacking zip(*apply(...))) also works when no
        #    rows are left, which previously raised ValueError.
        projected = [
            project_to_epsg3413(lat, lon)
            for lat, lon in zip(df_cleaned['Lat'], df_cleaned['Lon'])
        ]
        df_cleaned['x'] = [x for x, _ in projected]
        df_cleaned['y'] = [y for _, y in projected]

        # e. Determine which rows overlap with raster cells with value=1,
        #    reusing the coordinates projected in step d so each point is
        #    projected (and any projection error printed) only once.
        df_cleaned['overlaps'] = pd.Series(
            [_is_land_cell(x, y) for x, y in projected],
            index=df_cleaned.index,
            dtype=bool,
        )
        on_land = df_cleaned['overlaps']
        removed_rows = df_cleaned[on_land]
        df_cleaned = df_cleaned[~on_land]

        # f. Keep the removed rows (including 'overlaps', 'x', 'y') for validation
        removed_chunks.append(removed_rows)

        # g. Drop the 'overlaps' and projected coordinate columns
        df_cleaned = df_cleaned.drop(columns=['overlaps', 'x', 'y'])

        # h. Save the cleaned data unless the track has fewer than 50 rows.
        #    A track with exactly 50 rows is kept, matching the message below.
        if len(df_cleaned) >= min_track_rows:
            output_file_path = os.path.join(output_directory, filename)
            df_cleaned.to_csv(output_file_path, index=False)
        else:
            print(f'{filename} has been deleted for having less than 50 rows')

    # Fold everything removed during this run into the validation frame.
    if removed_chunks:
        removed_rows_df = pd.concat([removed_rows_df, *removed_chunks])

# 5. Save the removed rows to a CSV for validation
removed_rows_df.to_csv(removed_rows_output_path, index=False)

# 6. Print a message to indicate that the script has finished
print('All files have been cleaned and saved to the cleaned directory')

300025010024370.csv has been deleted for having less than 50 rows
All files have been cleaned and saved to the cleaned directory
