### Raw buoy data cleaning
This cell cleans the raw buoy data. It removes rows with missing values (NA) in the Lat/Lon columns, drops duplicate rows, normalizes latitude/longitude values into the standard ranges ([-90, 90] and [-180, 180]), removes buoy positions that fall on land, and discards buoy tracks with fewer than 50 rows (these buoys are assumed to have been destroyed or to have suffered instrument failure).

The resulting cleaned data will be stored in the data/cleaned/buoydata/past folder.

In [1]:
import os
import pandas as pd
import rasterio
import shutil
import numpy as np

# Where the raw buoy tracks and the land raster live, and where cleaned tracks go
raster_path = '../data/raw/geospatial/arctic_land.tif'
input_directory = '../data/raw/buoydata/past'
output_directory = '../data/cleaned/buoydata/past'

# Start every run from an empty output directory: wipe any previous results,
# then (re)create the folder so the cleaning loop can write into it.
output_already_present = os.path.exists(output_directory)
if output_already_present:
    shutil.rmtree(output_directory)
os.makedirs(output_directory, exist_ok=True)

# Normalize latitude values
def normalize_lat(lat):
    """Fold a latitude in degrees into the range [-90, 90].

    Values beyond a pole are reflected back (e.g. 100 -> 80, -100 -> -80).
    Values already inside [-90, 90] are returned unchanged.

    Parameters
    ----------
    lat : float or int
        Latitude in degrees.

    Returns
    -------
    float or int
        The folded latitude, or NaN when ``lat`` is NaN or infinite.

    Notes
    -----
    NOTE(review): only the latitude is adjusted here; a reflection across a
    pole usually also implies a 180-degree longitude shift -- confirm whether
    callers need that.
    """
    # +/-inf would bounce between inf and -inf forever in the loop below.
    if not np.isfinite(lat):
        return np.nan

    # The fold is periodic with period 360. Reducing first (fmod is exact)
    # bounds the loop to a couple of iterations and guarantees termination
    # for magnitudes so large that "180 - lat" no longer changes the value.
    if abs(lat) >= 360:
        lat = np.fmod(lat, 360)

    while lat < -90 or lat > 90:
        if lat < -90:
            lat = -180 - lat
        elif lat > 90:
            lat = 180 - lat
    return lat

# Normalize longitude values
def normalize_lon(lon):
    """Wrap a longitude in degrees into the range [-180, 180].

    Values already inside [-180, 180] (both endpoints included) are returned
    unchanged; anything else is shifted by whole turns of 360 degrees.

    Parameters
    ----------
    lon : float or int
        Longitude in degrees.

    Returns
    -------
    float or int
        The wrapped longitude, or NaN when ``lon`` is NaN or infinite.
    """
    # +/-inf can never be brought into range by adding or subtracting 360.
    if not np.isfinite(lon):
        return np.nan

    # Remove whole turns up front (fmod is exact). This bounds the loop and
    # guarantees termination for magnitudes where "lon -= 360" is a no-op
    # because of floating-point rounding.
    if abs(lon) >= 360:
        lon = np.fmod(lon, 360)

    while lon < -180 or lon > 180:
        if lon < -180:
            lon += 360
        elif lon > 180:
            lon -= 360
    return lon

# Tracks that keep fewer than this many rows after cleaning are not written out
# (such buoys are assumed to have been destroyed or to have failed).
MIN_TRACK_ROWS = 50

# Land-filtered rows, one frame per input file. Rebuilt from scratch on every
# run so re-executing the cell never appends to a previous run's results.
removed_frames = []

# Open the raster file
with rasterio.open(raster_path) as src:
    raster_data = src.read(1)
    affine_transform = src.transform

    def overlaps_raster_value_one(lat, lon):
        """Return True when (lat, lon) falls on a raster cell whose value is 1.

        Missing coordinates, points whose row/column fall outside the raster
        array, and coordinates for which ``src.index`` raises ``ValueError``
        are all reported as False.
        """
        if pd.isna(lat) or pd.isna(lon):
            return False
        try:
            row, col = src.index(lon, lat)
        except ValueError:
            return False
        n_rows, n_cols = raster_data.shape
        if 0 <= row < n_rows and 0 <= col < n_cols:
            return bool(raster_data[row, col] == 1)
        return False

    # sorted() makes the processing order (and therefore the log output and
    # the row order of removed_rows_df) reproducible across runs/platforms.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith('.csv'):
            continue

        input_file_path = os.path.join(input_directory, filename)
        df = pd.read_csv(input_file_path)

        # Drop rows without a position, then exact duplicate rows
        df_cleaned = df.dropna(subset=['Lat', 'Lon']).drop_duplicates()

        df_cleaned['Lat'] = df_cleaned['Lat'].apply(normalize_lat)
        df_cleaned['Lon'] = df_cleaned['Lon'].apply(normalize_lon)

        # Build the mask explicitly as a boolean Series so it also works when
        # no rows remain (row-wise DataFrame.apply does not yield a boolean
        # Series for an empty frame).
        df_cleaned['overlaps'] = pd.Series(
            [
                overlaps_raster_value_one(lat, lon)
                for lat, lon in zip(df_cleaned['Lat'], df_cleaned['Lon'])
            ],
            index=df_cleaned.index,
            dtype=bool,
        )
        removed_rows = df_cleaned[df_cleaned['overlaps']]
        removed_frames.append(removed_rows)

        df_cleaned = df_cleaned[~df_cleaned['overlaps']].drop(columns=['overlaps'])

        # Keep tracks with at least MIN_TRACK_ROWS rows; only tracks with
        # *fewer* rows are discarded, matching the message below.
        if len(df_cleaned) >= MIN_TRACK_ROWS:
            output_file_path = os.path.join(output_directory, filename)
            df_cleaned.to_csv(output_file_path, index=False)
        else:
            print(f'{filename} has been deleted for having less than {MIN_TRACK_ROWS} rows')

# All rows removed for lying on land, across every input file. Concatenated
# once (instead of inside the loop) and always defined, even with no input.
removed_rows_df = (
    pd.concat(removed_frames, ignore_index=True) if removed_frames else pd.DataFrame()
)

print('All files have been cleaned and saved to the cleaned directory.')

300025010024370.csv has been deleted for having less than 50 rows
All files have been cleaned and saved to the cleaned directory.
