# Setup

Note: Downloading the data below requires around XXX GB of storage (TODO: fill in the actual size) and may take several days because requests are queued in the ECMWF Climate Data Store (CDS).

In [None]:
# Import libraries
from pathlib import Path
import cdsapi
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Make sure the top-level data_raw download folder exists
# (no error if it is already there)
Path("data_raw").mkdir(exist_ok=True, parents=True)

In [None]:
# Open CDS API client for ERA5 downloads.
# This single client `c` is reused by the retrieve calls further down.
# NOTE(review): presumably needs CDS API credentials configured beforehand
# (see the setup link in the "Download ERA5 data" section) - confirm.
c = cdsapi.Client()

In [None]:
# Bounding box of each region to retrieve data for, keyed by region code,
# in NWSE order: [North, West, South, East]
area = dict(
    ca=[17, -91, 7, -81],     # Central America
    sa=[0, -65, -15, -30],    # South America
    wa=[-30, 113, -35, 123],  # Western Australia
)

In [None]:
# Years to retrieve data for: 1980 through 2021 inclusive, as strings
years = [str(year) for year in range(1980, 2022)]

In [None]:
# Define function to retrieve monthly averaged reanalysis by hour of day
def retrieve_era5_slv_month_hour(region):
    """Download ERA5 monthly averaged reanalysis by hour of day for one region.

    The request covers every year in the module-level ``years`` list, all 12
    months and all 24 hours of the day, limited to the bounding box
    ``area[region]``. The NetCDF result is written to
    ``data_raw/{region}_era5-slv_month-hour_{first year}-{last year}.nc``.
    If that file already exists, nothing is downloaded.

    Args:
        region: Key into the module-level ``area`` dict (e.g. "ca").

    Returns:
        None. A status line is printed instead.
    """
    file_name = 'data_raw/{region}_era5-slv_month-hour_{yearstart}-{yearend}.nc'.format(
        region=region, yearstart=years[0], yearend=years[-1])
    target = Path(file_name)
    if target.exists():
        print(file_name, "already exists")
        return

    request = {
        'product_type': 'monthly_averaged_reanalysis_by_hour_of_day',
        'variable': [
            '100m_u_component_of_wind', '100m_v_component_of_wind', '10m_u_component_of_wind',
            '10m_v_component_of_wind', '2m_temperature', 'mean_sea_level_pressure',
            'surface_latent_heat_flux', 'surface_sensible_heat_flux',
        ],
        'year': years,
        # '01' .. '12'
        'month': ['{:02d}'.format(month) for month in range(1, 13)],
        # '00:00' .. '23:00'
        'time': ['{:02d}:00'.format(hour) for hour in range(24)],
        'format': 'netcdf',
        'area': area[region],
    }

    # Download under a temporary name and rename only once the retrieve call
    # has returned, so an interrupted transfer cannot leave a truncated file
    # that later runs would mistake for a finished download.
    target.parent.mkdir(parents=True, exist_ok=True)
    partial = Path(file_name + '.part')
    c.retrieve(
        'reanalysis-era5-single-levels-monthly-means',
        request,
        str(partial))
    partial.replace(target)
    print("Retrieved", file_name)

In [None]:
# Define function to retrieve hourly reanalysis (to be used with ThreadPoolExecutor)
def retrieve_era5_slv_hour(region, year):
    """Download one year of hourly ERA5 single-level reanalysis for one region.

    The request covers all months, days 01-31 and all 24 hours of ``year``,
    limited to the bounding box ``area[region]``. The NetCDF result is written
    to ``data_raw/{region}_era5-slv_hour/{region}_era5-slv_hour_{year}.nc``.
    If that file already exists, nothing is downloaded.

    The status is returned rather than printed so that the caller (the
    ThreadPoolExecutor cells) can print it as each request completes.

    Args:
        region: Key into the module-level ``area`` dict (e.g. "ca").
        year: Year to retrieve, as used in the request and in the file name.

    Returns:
        A status string: either "<file> already exists" or "Retrieved <file>".
    """
    file_name = 'data_raw/{region}_era5-slv_hour/{region}_era5-slv_hour_{year}.nc'.format(region=region, year=year)
    target = Path(file_name)
    if target.exists():
        return(file_name + " already exists")

    request = {
        'product_type': 'reanalysis',
        'variable': [
            '100m_u_component_of_wind', '100m_v_component_of_wind', '10m_u_component_of_wind',
            '10m_v_component_of_wind', '2m_temperature', 'mean_sea_level_pressure',
            'surface_latent_heat_flux', 'surface_sensible_heat_flux',
        ],
        'year': year,
        # '01' .. '12'
        'month': ['{:02d}'.format(month) for month in range(1, 13)],
        # '01' .. '31'
        'day': ['{:02d}'.format(day) for day in range(1, 32)],
        # '00:00' .. '23:00'
        'time': ['{:02d}:00'.format(hour) for hour in range(24)],
        'area': area[region],
        'format': 'netcdf',
    }

    # Download under a temporary name and rename only once the retrieve call
    # has returned, so a failed or interrupted transfer cannot leave a
    # truncated file that later runs would report as "already exists".
    target.parent.mkdir(parents=True, exist_ok=True)
    partial = Path(file_name + '.part')
    c.retrieve(
        'reanalysis-era5-single-levels',
        request,
        str(partial))
    partial.replace(target)
    return("Retrieved " + file_name)

# Download ERA5 data

First, set up the ECMWF CDS API by following the instructions here: https://confluence.ecmwf.int/display/CKB/How+to+download+ERA5#HowtodownloadERA5-4-DownloadERA5familydatathroughtheCDSAPI

## Static data

### Global data

In [None]:
# Make sure the global_era5-slv_static download folder exists
# (no error if it is already there)
Path("data_raw", "global_era5-slv_static").mkdir(parents=True, exist_ok=True)

In [None]:
# Retrieve the ERA5 geopotential field (static data, used for topography).
# No 'area' is given in the request; a single month/time is requested.
c.retrieve(
    'reanalysis-era5-single-levels-monthly-means',
    dict(
        product_type='monthly_averaged_reanalysis',
        variable='geopotential',
        year='2022',
        month='01',
        time='00:00',
        format='netcdf',
    ),
    target='data_raw/global_era5-slv_static/geopotential.nc',
)

In [None]:
# Retrieve the ERA5 land-sea mask (static data).
# No 'area' is given in the request; a single month/time is requested.
c.retrieve(
    'reanalysis-era5-single-levels-monthly-means',
    dict(
        product_type='monthly_averaged_reanalysis',
        variable='land_sea_mask',
        year='2022',
        month='01',
        time='00:00',
        format='netcdf',
    ),
    target='data_raw/global_era5-slv_static/land-sea-mask.nc',
)

## Monthly averaged reanalysis by hour of day, on single levels

### Central America data

In [None]:
# Monthly-averaged-by-hour download for the "ca" (Central America) region
retrieve_era5_slv_month_hour(region="ca")

### South America data

In [None]:
# Monthly-averaged-by-hour download for the "sa" (South America) region
retrieve_era5_slv_month_hour(region="sa")

### Western Australia data

In [None]:
# Monthly-averaged-by-hour download for the "wa" (Western Australia) region
retrieve_era5_slv_month_hour(region="wa")

## Hourly reanalysis, on single levels

### Central America data

In [None]:
# Make sure the ca_era5-slv_hour download folder exists
# (no error if it is already there)
Path("data_raw", "ca_era5-slv_hour").mkdir(parents=True, exist_ok=True)

In [None]:
# Run up to 10 parallel retrieve requests (can only retrieve 1 year per request)
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its year so a failure can be reported precisely
    futures = {
        executor.submit(retrieve_era5_slv_hour, "ca", year): year for year in years
    }
    for f in as_completed(futures):
        try:
            print(f.result())
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) still interrupts the cell, and report the
        # year and error instead of hiding them.
        except Exception as err:
            print("Could not retrieve", "ca", futures[f], "-", repr(err))

### South America data

In [None]:
# Make sure the sa_era5-slv_hour download folder exists
# (no error if it is already there)
Path("data_raw", "sa_era5-slv_hour").mkdir(parents=True, exist_ok=True)

In [None]:
# Run up to 10 parallel retrieve requests (can only retrieve 1 year per request)
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its year so a failure can be reported precisely
    futures = {
        executor.submit(retrieve_era5_slv_hour, "sa", year): year for year in years
    }
    for f in as_completed(futures):
        try:
            print(f.result())
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) still interrupts the cell, and report the
        # year and error instead of hiding them.
        except Exception as err:
            print("Could not retrieve", "sa", futures[f], "-", repr(err))

### Western Australia data

In [None]:
# Make sure the wa_era5-slv_hour download folder exists
# (no error if it is already there)
Path("data_raw", "wa_era5-slv_hour").mkdir(parents=True, exist_ok=True)

In [None]:
# Run up to 10 parallel retrieve requests (can only retrieve 1 year per request)
with ThreadPoolExecutor(max_workers=10) as executor:
    # Map each future back to its year so a failure can be reported precisely
    futures = {
        executor.submit(retrieve_era5_slv_hour, "wa", year): year for year in years
    }
    for f in as_completed(futures):
        try:
            print(f.result())
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) still interrupts the cell, and report the
        # year and error instead of hiding them.
        except Exception as err:
            print("Could not retrieve", "wa", futures[f], "-", repr(err))

# Download GLASS LAI data

In [None]:
# GLASS LAI, AVHRR 0.05 deg data (1981-2008)

# Make sure the global_glass-lai-avhrr_month download folder exists
# (no error if it is already there)
Path("data_raw", "global_glass-lai-avhrr_month").mkdir(parents=True, exist_ok=True)



In [None]:
# GLASS LAI, MODIS 0.05 deg data (2000-2021)

# Make sure the global_glass-lai-modis_month download folder exists
# (no error if it is already there)
Path("data_raw", "global_glass-lai-modis_month").mkdir(parents=True, exist_ok=True)



# Download other data

## BoM hourly observation data

Request the data from http://www.bom.gov.au/catalogue/data-feeds.shtml

In [None]:
# Make sure the wa_bom_hour download folder exists
# (no error if it is already there)
Path("data_raw", "wa_bom_hour").mkdir(parents=True, exist_ok=True)

## BoM minutely observation data

Request the data from http://www.bom.gov.au/catalogue/data-feeds.shtml

In [None]:
# Make sure the wa_bom_minute download folder exists
# (no error if it is already there)
Path("data_raw", "wa_bom_minute").mkdir(parents=True, exist_ok=True)

## Bunny Fence Experiment (2005-2007) data

Request the data from https://www.eol.ucar.edu/field_projects/bufex

In [None]:
# Make sure the wa_bufex download folder exists
# (no error if it is already there)
Path("data_raw", "wa_bufex").mkdir(parents=True, exist_ok=True)

# To delete after

In [None]:
# This cell used to repeat, value for value, the request that
# retrieve_era5_slv_month_hour (defined in the Setup section) builds for the
# "ca" region with years 1980-2021, writing to the same
# data_raw/ca_era5-slv_month-hour_1980-2021.nc file. Call the helper instead
# of duplicating the request; the helper also skips the download when the
# file already exists.
retrieve_era5_slv_month_hour("ca")

In [None]:
# This cell used to repeat, value for value, the request that
# retrieve_era5_slv_month_hour (defined in the Setup section) builds for the
# "sa" region with years 1980-2021, writing to the same
# data_raw/sa_era5-slv_month-hour_1980-2021.nc file. Call the helper instead
# of duplicating the request; the helper also skips the download when the
# file already exists.
retrieve_era5_slv_month_hour("sa")

In [None]:
# This cell used to repeat, value for value, the request that
# retrieve_era5_slv_month_hour (defined in the Setup section) builds for the
# "wa" region with years 1980-2021, writing to the same
# data_raw/wa_era5-slv_month-hour_1980-2021.nc file. Call the helper instead
# of duplicating the request; the helper also skips the download when the
# file already exists.
retrieve_era5_slv_month_hour("wa")