# Data Downloading

Download a collection of past buoy transmissions from a section of the IABP website.

In [None]:
import os
import pandas as pd
import requests

# URL of the file to download
file_url = 'https://iabp.apl.uw.edu/Data_Products/LEVEL1_DATA/LEVEL1_2023.csv'

# Directory to save the downloaded file
temp_dir = '../data/raw/buoydata/past/temp'
os.makedirs(temp_dir, exist_ok=True)

# Path to save the downloaded file
temp_file_path = os.path.join(temp_dir, 'IABP_Level1_2023all.csv')

# Download to a sibling ".part" file first so an interrupted transfer never
# leaves a truncated CSV at temp_file_path.
partial_file_path = temp_file_path + '.part'

# stream=True keeps the whole CSV out of memory; the (connect, read) timeout in
# seconds makes a stalled server raise instead of hanging forever.
with requests.get(file_url, stream=True, timeout=(10, 60)) as response:
    response.raise_for_status()  # Check if the request was successful

    # Save the file in 1 MiB chunks
    with open(partial_file_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            file.write(chunk)

# Only publish the file under its final name once it is complete
os.replace(partial_file_path, temp_file_path)

print(f'Downloaded file to {temp_file_path}')

# Read the downloaded CSV file
df = pd.read_csv(temp_file_path)

# Directory to save the separated CSV files
output_dir = '../data/raw/buoydata/past'
os.makedirs(output_dir, exist_ok=True)

# Clear the directory before saving new data, leaving the "temp" folder in place.
# NOTE: os.rmdir only removes empty directories, so a non-empty subdirectory is
# reported below and left untouched rather than being deleted recursively.
for filename in os.listdir(output_dir):
    file_path = os.path.join(output_dir, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path) and filename != 'temp':
            os.rmdir(file_path)
    except OSError as e:
        # Best effort: report the failure and continue with the remaining entries
        print(f'Failed to delete {file_path}. Reason: {e}')
    
# Write one CSV per buoy: all rows sharing a value in the 'BuoyID' column are
# saved to '<BuoyID>.csv' inside output_dir (without the DataFrame index).
buoy_groups = df.groupby('BuoyID')
for buoy_id, group in buoy_groups:
    output_file_path = os.path.join(output_dir, f'{buoy_id}.csv')
    group.to_csv(output_file_path, index=False)
    print(f'Saved {output_file_path}')

This section downloads the 2023 ERA5 reanalysis (u and v components of wind) and saves each component as a netCDF file. These data will be interpolated with the past buoy data to produce training data for machine learning algorithms.

In [None]:
# This script downloads ERA5 reanalysis data for the year 2023 from the CDS API. The data are downloaded in netCDF format and contain the u- and v-components of the wind at the 1 hPa pressure level, for all hours of every day. The requested area is [90, -180, 23, 180] (in the CDS North/West/South/East order: 23°N to 90°N at all longitudes), not the entire globe. The data are saved in the data/raw/reanalyses/ERA5 directory.
# If the directory does not exist, it will be created. If the directory already exists, this script does not delete the files already in it.
# WARNING: the output files will be large (approximately 2*5.5 GB) and the download may take a long time. Make sure you have enough disk space and a stable internet connection before running this script.

import os
import cdsapi

# Create the output directory if it does not exist
os.makedirs("../data/raw/reanalyses/ERA5", exist_ok=True)

# Set the CDSAPI_RC environment variable to the path of your .cdsapirc file.
# The path is relative, so it is resolved against the current working directory.
os.environ['CDSAPI_RC'] = '../.cdsapirc'

dataset = "reanalysis-era5-pressure-levels"


def _era5_wind_request(variable):
    """Build the CDS request for one wind component.

    Both components share every field except "variable", so the request is
    built in one place to keep the u and v downloads in sync.

    Args:
        variable: CDS variable name, e.g. "u_component_of_wind".

    Returns:
        The request dict to pass to ``cdsapi.Client.retrieve``.
    """
    return {
        "product_type": ["reanalysis"],
        "variable": [variable],
        "year": ["2023"],
        # All months, days and hours as zero-padded strings ("01", ..., "00:00", ...)
        "month": [f"{month:02d}" for month in range(1, 13)],
        "day": [f"{day:02d}" for day in range(1, 32)],
        "time": [f"{hour:02d}:00" for hour in range(24)],
        # NOTE(review): level "1" is 1 hPa. If near-surface winds are intended
        # for use with the buoy data, a different level (or the single-levels
        # 10 m wind) may be what is wanted -- confirm before changing.
        "pressure_level": ["1"],
        "data_format": "netcdf",
        "download_format": "unarchived",
        # CDS area order is [North, West, South, East]
        "area": [90, -180, 23, 180],
    }


# Output file for each wind component
wind_targets = {
    "u_component_of_wind": "../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc",
    "v_component_of_wind": "../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc",
}

# One client is enough for both downloads
client = cdsapi.Client()

for variable, target_file in wind_targets.items():
    request = _era5_wind_request(variable)
    client.retrieve(dataset, request, target_file)