# Setup

## Note:
- Download of ERA5 data below will require around 70 GB of storage, and may take several days due to queueing in the ECMWF Climate Data Store (CDS)
- Download of GLASS data will require around 30 GB of storage
- So a total of around 100 GB of storage will be required

In [None]:
# Import libraries
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import cdsapi
import wget

# Download ERA5 data

If you haven't already, first set up the ECMWF CDS API by following the instructions here: https://confluence.ecmwf.int/display/CKB/How+to+download+ERA5#HowtodownloadERA5-4-DownloadERA5familydatathroughtheCDSAPI

In [None]:
# Open CDS API client for ERA5 downloads
# The ERA5 request functions in this notebook all call c.retrieve(...) on this
# single module-level client
# NOTE(review): presumably picks up the credentials configured during the CDS
# API setup linked above - confirm
c = cdsapi.Client()

In [None]:
# Bounding box of each region to retrieve data for, keyed by region code and
# given in NWSE order, i.e. [north, west, south, east]
area = dict(
    ca=[17, -91, 7, -81],      # Central America
    sa=[0, -65, -15, -30],     # South America
    wa=[-30, 113, -35, 123],   # Western Australia
)

In [None]:
# Years to retrieve data for: 1980 to 2021 inclusive, held as strings
# because that is how they are passed in the CDS request dictionaries
years = [str(year) for year in range(1980, 2021 + 1)]

In [None]:
# Static variable names in ECMWF CDS, available here:
# https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation
# Names use underscores throughout; the hyphenated form is only used for the
# output file names, which are derived from these names at download time
static_var_names = [
    "angle_of_sub_gridscale_orography",
    "anisotropy_of_sub_gridscale_orography",
    "geopotential",
    "high_vegetation_cover",
    "lake_cover",
    "lake_depth",
    "land_sea_mask",
    "low_vegetation_cover",
    "slope_of_sub_gridscale_orography",
    "soil_type",
    "standard_deviation_of_filtered_subgrid_orography",
    "standard_deviation_of_orography",
    "type_of_high_vegetation",
    "type_of_low_vegetation",
]

## Static data (global)

In [None]:
# Define request to retrieve static data (to be used with retrieve function)
def request_era5_slv_static(variable):
    """Download one static ERA5 single-level variable as a global NetCDF file.

    The request asks the monthly-means dataset for January 2022 at 00:00 and
    saves the result as
    data_raw/global_era5-slv_static/global_era5-slv_static_<variable>.nc,
    with the underscores of the variable name replaced by hyphens. Nothing is
    requested if that file already exists. The output folder is expected to
    exist already (retrieve_era5_slv_static creates it).

    Failures are reported with print instead of being raised, so that one
    failed request does not stop the other queued requests.

    Args:
        variable: CDS variable name, e.g. "land_sea_mask".
    """
    file_name = "data_raw/global_era5-slv_static/global_era5-slv_static_{output_name}.nc".format(
        output_name=variable.replace("_", "-"))
    if Path(file_name).exists():
        print(file_name, "already exists")
        return
    # Download into a temporary sibling file and only rename it to the final
    # name once complete; otherwise an interrupted download would leave a
    # truncated file that the exists() check above would accept on the next run
    partial_name = file_name + ".part"
    try:
        c.retrieve(
            'reanalysis-era5-single-levels-monthly-means',
            {
                'product_type': 'monthly_averaged_reanalysis',
                'variable': variable,
                'year': '2022',
                'month': '01',
                'time': '00:00',
                'format': 'netcdf',
            },
            partial_name)
        Path(partial_name).replace(file_name)
    except Exception as err:
        # Best effort: clean up, say why it failed, and carry on
        if Path(partial_name).exists():
            Path(partial_name).unlink()
        print("Could not retrieve " + file_name + ": " + repr(err))
    else:
        print("Retrieved", file_name)
            
# Define function to retrieve static data
def retrieve_era5_slv_static(variables):
    """Queue and download static ERA5 data for every name in `variables`.

    Each name is handed to request_era5_slv_static on a thread pool of up to
    10 workers, and the function returns once all submitted requests have
    finished.

    Args:
        variables: names to download; each must appear in static_var_names.

    Raises:
        AssertionError: if any name is not in static_var_names.
    """
    # Check the names up front so that a bad name neither creates the folder
    # below nor triggers the "Could not retrieve" message of the request
    assert all(name in static_var_names for name in variables), (
        "variables not subset of: {static_var_names}").format(static_var_names=static_var_names)
    # Make sure the global_era5-slv_static download folder exists
    download_dir = Path("data_raw/global_era5-slv_static")
    download_dir.mkdir(parents=True, exist_ok=True)
    # One request per variable, at most 10 in flight at once; leaving the
    # with-block waits for all of them
    with ThreadPoolExecutor(max_workers=10) as pool:
        for name in variables:
            pool.submit(request_era5_slv_static, name)

In [None]:
# Retrieve static data for geopotential (to plot topography) and land-sea mask
retrieve_era5_slv_static(["geopotential", "land_sea_mask"])

## Monthly averaged reanalysis by hour of day, on single levels

In [None]:
# Define request to retrieve monthly reanalysis (to be used with retrieve function)
def request_era5_slv_month_hour(region):
    """Download monthly-averaged-by-hour-of-day ERA5 data for one region.

    A single request covers every year in `years`, all 12 months and all 24
    hours of the day, and is saved as
    data_raw/<region>_era5-slv_month-hour_<first year>-<last year>.nc.
    Nothing is requested if that file already exists.

    Failures are reported with print instead of being raised, so that one
    failed request does not stop the requests for the other regions.

    Args:
        region: key of the `area` dictionary selecting the bounding box.
    """
    file_name = 'data_raw/{region}_era5-slv_month-hour_{yearstart}-{yearend}.nc'.format(
        region=region, yearstart=years[0], yearend=years[-1])
    if Path(file_name).exists():
        print(file_name + " already exists")
        return
    # Download into a temporary sibling file and only rename it to the final
    # name once complete; otherwise an interrupted download would leave a
    # truncated file that the exists() check above would accept on the next run
    partial_name = file_name + ".part"
    try:
        c.retrieve(
            'reanalysis-era5-single-levels-monthly-means',
            {
                'product_type': 'monthly_averaged_reanalysis_by_hour_of_day',
                'variable': [
                    '100m_u_component_of_wind', '100m_v_component_of_wind', '10m_u_component_of_wind',
                    '10m_v_component_of_wind', '2m_temperature', 'mean_sea_level_pressure',
                    'surface_latent_heat_flux', 'surface_sensible_heat_flux',
                ],
                'year': years,
                # '01' ... '12'
                'month': ['{:02d}'.format(month) for month in range(1, 13)],
                # '00:00' ... '23:00'
                'time': ['{:02d}:00'.format(hour) for hour in range(24)],
                'format': 'netcdf',
                'area': area[region],
            },
            partial_name)
        Path(partial_name).replace(file_name)
    except Exception as err:
        # Best effort: clean up, say why it failed, and carry on
        if Path(partial_name).exists():
            Path(partial_name).unlink()
        print("Could not retrieve " + file_name + ": " + repr(err))
    else:
        print("Retrieved " + file_name)
        
# Define function to retrieve monthly reanalysis
def retrieve_era5_slv_month_hour(regions):
    """Queue and download monthly-by-hour ERA5 data for every region given.

    Each region is handed to request_era5_slv_month_hour on a thread pool of
    up to 10 workers, and the function returns once all submitted requests
    have finished.

    Args:
        regions: region codes to download; each must be a key of `area`.

    Raises:
        AssertionError: if any region is not a key of `area`.
    """
    # Assert regions are valid so we don't unnecessarily create folders in the next part
    # And so we don't unnecessarily trigger the exception message
    assert all(region in area for region in regions), f"regions not subset of: {*[*area],}"
    # Create data_raw folder to download data into (if it doesn't already exist);
    # without it a request would be queued and then fail when writing its file
    Path("data_raw").mkdir(parents=True, exist_ok=True)
    # Run up to 10 parallel retrieve requests (to queue and download data for all regions simultaneously)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for region in regions:
            executor.submit(request_era5_slv_month_hour, region)

In [None]:
# Retrieve monthly reanalysis data for Central America, South America and Western Australia
# (one request, and one output file, per region)
retrieve_era5_slv_month_hour(["ca", "sa", "wa"])

## Hourly reanalysis, on single levels

In [None]:
# Define request to retrieve hourly reanalysis (to be used with retrieve function)
def request_era5_slv_hour(region, year):
    """Download one year of hourly ERA5 single-level data for one region.

    The request covers every month, day and hour of `year` and is saved as
    data_raw/<region>_era5-slv_hour/<region>_era5-slv_hour_<year>.nc.
    Nothing is requested if that file already exists. The output folder is
    expected to exist already (retrieve_era5_slv_hour creates it).

    Failures are reported with print instead of being raised, so that one
    failed request does not stop the requests for the other years.

    Args:
        region: key of the `area` dictionary selecting the bounding box.
        year: year to download, e.g. '1980'.
    """
    file_name = 'data_raw/{region}_era5-slv_hour/{region}_era5-slv_hour_{year}.nc'.format(
        region=region, year=year)
    if Path(file_name).exists():
        print(file_name + " already exists")
        return
    # Download into a temporary sibling file and only rename it to the final
    # name once complete; otherwise an interrupted download would leave a
    # truncated file that the exists() check above would accept on the next run
    partial_name = file_name + ".part"
    try:
        c.retrieve(
            'reanalysis-era5-single-levels',
            {
                'product_type': 'reanalysis',
                'variable': [
                    '100m_u_component_of_wind', '100m_v_component_of_wind', '10m_u_component_of_wind',
                    '10m_v_component_of_wind', '2m_temperature', 'mean_sea_level_pressure',
                    'surface_latent_heat_flux', 'surface_sensible_heat_flux',
                ],
                'year': year,
                # '01' ... '12'
                'month': ['{:02d}'.format(month) for month in range(1, 13)],
                # '01' ... '31'
                'day': ['{:02d}'.format(day) for day in range(1, 32)],
                # '00:00' ... '23:00'
                'time': ['{:02d}:00'.format(hour) for hour in range(24)],
                'area': area[region],
                'format': 'netcdf',
            },
            partial_name)
        Path(partial_name).replace(file_name)
    except Exception as err:
        # Best effort: clean up, say why it failed, and carry on
        if Path(partial_name).exists():
            Path(partial_name).unlink()
        print("Could not retrieve " + file_name + ": " + repr(err))
    else:
        print("Retrieved " + file_name)

# Define function to retrieve hourly reanalysis
def retrieve_era5_slv_hour(region):
    """Queue and download hourly ERA5 data for one region, one file per year.

    Every entry of `years` is handed to request_era5_slv_hour on a thread
    pool of up to 10 workers, and the function returns once all submitted
    requests have finished.

    Args:
        region: region code; must be a key of `area`.

    Raises:
        AssertionError: if `region` is not a key of `area`.
    """
    # Check the region up front so that a bad name neither creates the folder
    # below nor triggers the "Could not retrieve" message of the request
    assert region in area, f"region not one of: {*[*area],}"
    # Make sure the {region}_era5-slv_hour download folder exists
    download_dir = Path("data_raw/{region}_era5-slv_hour".format(region=region))
    download_dir.mkdir(parents=True, exist_ok=True)
    # ECMWF CDS only allows 1 year per request for hourly data, so submit one
    # request per year with at most 10 running in parallel
    with ThreadPoolExecutor(max_workers=10) as pool:
        for single_year in years:
            pool.submit(request_era5_slv_hour, region, single_year)

In [None]:
# Retrieve hourly reanalysis data for Central America
# (one request, and one output file, per year in `years`)
retrieve_era5_slv_hour("ca")

In [None]:
# Retrieve hourly reanalysis data for South America
# (one request, and one output file, per year in `years`)
retrieve_era5_slv_hour("sa")

In [None]:
# Retrieve hourly reanalysis data for Western Australia
# (one request, and one output file, per year in `years`)
retrieve_era5_slv_hour("wa")

# Download GLASS data

First check that the naming conventions for the file urls are still up to date here: http://www.glass.umd.edu/Overview.html

Of the different components in the naming convention, the product_version and production_date are the most likely to change, so check these by browsing the downloads page here: http://www.glass.umd.edu/Download.html

Note that the production_date may vary between years within the same dataset

In [None]:
# Define function to retrieve GLASS data
# Check on server that the url components are up to date (especially product_version)
def retrieve_glass_8_day(variable, data_source, year_start, year_end, production_date):
    """Download the global 8-day GLASS files for one variable and data source.

    For every year from year_start to year_end (inclusive) and every 8th day
    of year from 1 to 361, one .hdf file is downloaded into
    data_raw/global_glass-<variable>-<data_source>_8-day/. Files that already
    exist locally are skipped. A failed download is reported with print and
    the loop moves on to the next file.

    Args:
        variable: "lai" or "fapar".
        data_source: "modis" or "avhrr".
        year_start: first year to download.
        year_end: last year to download (inclusive).
        production_date: production date component of the file names on the
            server, e.g. "2022010"; the same value is used for every file
            requested by this call.

    Raises:
        AssertionError: if variable or data_source is not one of the above.
    """
    # Define dictionaries according to GLASS dataset names (user may need to update these)
    variable_number = {"lai": "01", "fapar": "09"}
    data_source_number = {"modis": "01", "avhrr": "02"}
    product_version = {"modis": "V60", "avhrr": "V40"}
    dl_dir_path = {"modis": "MODIS/0.05D", "avhrr": "AVHRR"}
    # Assert arguments are valid so we don't unnecessarily create a folder in the next part
    # And so we don't unnecessarily trigger the exception message
    assert variable in variable_number, f"variable not one of: {*[*variable_number],}"
    assert data_source in data_source_number, f"data_source not one of: {*[*data_source_number],}"
    # Create global_glass-{variable}-{data_source}_8-day folder to download data into
    # (if it doesn't already exist)
    file_path = "data_raw/global_glass-{variable}-{data_source}_8-day".format(
        variable=variable, data_source=data_source)
    Path(file_path).mkdir(parents=True, exist_ok=True)
    # Download data from the correct url, and to the correct file name
    for year in range(year_start, year_end + 1):
        for day in range(1, 361 + 1, 8):
            file_name = file_path + "/global_glass-{variable}-{data_source}_8-day_{year}-{day:03}.hdf".format(
                variable=variable, data_source=data_source, year=year, day=day)
            if Path(file_name).exists():
                print(file_name + " already exists")
                continue
            dl_url = ("http://www.glass.umd.edu/{variable}/{dl_dir_path}/{year}/GLASS{variable_number}" +
                      "B{data_source_number}.{product_version}.A{year}{day:03}.{production_date}.hdf").format(
                variable=variable.upper(), dl_dir_path=dl_dir_path[data_source], year=year, day=day,
                variable_number=variable_number[variable], data_source_number=data_source_number[data_source],
                product_version=product_version[data_source], production_date=production_date)
            try:
                wget.download(dl_url, file_name)
            except Exception as err:
                # Catch Exception rather than everything, so that interrupting
                # the kernel (KeyboardInterrupt) actually stops this long loop
                print("Could not retrieve " + file_name + " (" + repr(err) + ")" +
                      "; check on server if there is missing data " +
                      "for the given date, and/or if the product_version and production_date for that " +
                      "file is correct")
            else:
                print("Retrieved " + file_name)

In [None]:
# Retrieve LAI data derived from MODIS
# Check on server for the correct production_date to enter for each year
# (each call covers the run of years that share one production_date)
retrieve_glass_8_day(
    variable="lai", data_source="modis",
    year_start=2000, year_end=2019, production_date="2022010")
retrieve_glass_8_day(
    variable="lai", data_source="modis",
    year_start=2020, year_end=2021, production_date="2022138")

In [None]:
# Retrieve LAI data derived from AVHRR
# Check on server for the correct production_date to enter for each year
# (each call covers the run of years that share one production_date)
retrieve_glass_8_day(
    variable="lai", data_source="avhrr",
    year_start=1981, year_end=2017, production_date="2019353")
retrieve_glass_8_day(
    variable="lai", data_source="avhrr",
    year_start=2018, year_end=2018, production_date="2019358")

In [None]:
# Retrieve FAPAR data derived from MODIS
# Check on server for the correct production_date to enter for each year
retrieve_glass_8_day(
    variable="fapar", data_source="modis",
    year_start=2000, year_end=2020, production_date="2022092")

In [None]:
# Retrieve FAPAR data derived from AVHRR
# Check on server for the correct production_date to enter for each year
# (each call covers the run of years that share one production_date)
retrieve_glass_8_day(
    variable="fapar", data_source="avhrr",
    year_start=1982, year_end=2015, production_date="2019353")
retrieve_glass_8_day(
    variable="fapar", data_source="avhrr",
    year_start=2016, year_end=2018, production_date="2019358")

# Download other data

## BoM hourly observation data

Request from http://www.bom.gov.au/catalogue/data-feeds.shtml

In [None]:
# Create wa_bom_hour folder to download data into
# (if it doesn't already exist)
# This cell only creates the folder; it does not download any data
Path("data_raw/wa_bom_hour").mkdir(parents=True, exist_ok=True)

## BoM minutely observation data

Request from http://www.bom.gov.au/catalogue/data-feeds.shtml

In [None]:
# Create wa_bom_minute folder to download data into
# (if it doesn't already exist)
# This cell only creates the folder; it does not download any data
Path("data_raw/wa_bom_minute").mkdir(parents=True, exist_ok=True)

## Bunny Fence Experiment (2005-2007) data

Request from https://www.eol.ucar.edu/field_projects/bufex

In [None]:
# Create wa_bufex folder to download data into
# (if it doesn't already exist)
# This cell only creates the folder; it does not download any data
Path("data_raw/wa_bufex").mkdir(parents=True, exist_ok=True)