In [None]:
# Name: Wenqi Wang
# Github username: acse-ww721

In [13]:
# Basic setting for Jupyter_notebook to import utils
import os
import sys

# Resolve the project root (two levels above the notebook's working directory)
# so that project-local packages such as `utils` can be imported.
notebook_path = os.path.abspath("")
project_root = os.path.abspath(os.path.join(notebook_path, "../../"))

# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

In [14]:
import cdsapi
import threading
import time
import os
from tqdm import tqdm
from utils import folder_utils
from concurrent.futures import ThreadPoolExecutor  # thread pool module
# from data_era5_t850 import data_year, data_month, data_day, data_time, area_uk

In [15]:
# Output location settings; the download helpers pass these to
# folder_utils.create_folder to decide where files are written.
country = ["GB"]
data_folder, data_category, output_folder = "data", "raw_data", "ERA5_DATA"


In [16]:
# Dataset identifier passed to the CDS client, and the variable(s) of interest
dataset = "reanalysis-era5-single-levels"
variable_list = ["2m_temperature"]

In [17]:
# Time and area selection used to build the download requests.
# All time components are zero-padded strings.

# Target years: 1979 through 2022 inclusive
data_year = [str(year_number) for year_number in range(1979, 2023)]

# Target months: "01" .. "12"
data_month = [f"{month_number:02d}" for month_number in range(1, 13)]

# Target days: "01" .. "31" (the full range, regardless of month length)
data_day = [f"{day_number:02d}" for day_number in range(1, 32)]

# Target times (UTC): every full hour, "00:00" .. "23:00"
data_time = [f"{hour:02d}:00" for hour in range(24)]

# Bounding box for the UK
# NOTE(review): presumably ordered North, West, South, East as the CDS API expects - confirm
area_uk = [58, -7, 50, 2]

In [None]:
def time_select(df, date_column, start_date, end_date):
    """Return the rows of ``df`` whose date lies in ``[start_date, end_date]``.

    Args:
        df: pandas DataFrame containing ``date_column``.
        date_column: name of the column holding dates in ``%Y%m%d`` form
            (for example ``"20200131"``).
        start_date: inclusive lower bound, comparable with datetimes.
        end_date: inclusive upper bound, comparable with datetimes.

    Returns:
        A new DataFrame with ``date_column`` converted to datetimes and only
        the rows inside the range. The input ``df`` is left untouched.
    """
    # Imported here because the notebook's import cell does not provide `pd`;
    # without this the function fails with NameError on its first call.
    import pandas as pd

    # Work on a copy so the caller's frame is not silently modified.
    converted = df.copy()
    converted[date_column] = pd.to_datetime(converted[date_column], format="%Y%m%d")

    in_range = (converted[date_column] >= start_date) & (
        converted[date_column] <= end_date
    )
    return converted[in_range]


# Example
# filtered_noaa_df = time_select(processed_df_noaa, "DATE", start_date, end_date)
# filtered_asos_df = time_select(processed_df_asos, "date", start_date, end_date)


def is_leap_year(year):
    """Tell whether ``year`` (an int or a numeric string) is a leap year.

    Years divisible by 400 are leap years, other century years are not, and
    any remaining year is a leap year when divisible by 4.
    """
    numeric_year = int(year)  # accept "2020" as well as 2020
    if numeric_year % 400 == 0:
        return True
    if numeric_year % 100 == 0:
        return False
    return numeric_year % 4 == 0


def days_check(year, month):
    """List the valid days of a month as zero-padded strings.

    Args:
        year: the year, as an int or numeric string (e.g. ``"2020"``).
        month: the month, as an int or numeric string; both ``"02"`` and
            ``2`` are accepted.

    Returns:
        ``["01", "02", ...]`` up to the last day of that month, with leap
        years taken into account. An unrecognised month (non-numeric or
        outside 1-12) yields an empty list.

    Raises:
        ValueError: if ``year`` cannot be converted to an integer.
    """
    import calendar  # stdlib; kept local so the block needs no extra import cell

    year_number = int(year)

    # Normalise the month so that 2, "2" and "02" all mean February; previously
    # anything but the exact zero-padded string silently produced no days.
    try:
        month_number = int(month)
    except (TypeError, ValueError):
        return []
    if not 1 <= month_number <= 12:
        return []

    # monthrange returns (weekday of the first day, number of days in month)
    days_in_month = calendar.monthrange(year_number, month_number)[1]
    return [f"{day:02d}" for day in range(1, days_in_month + 1)]

In [None]:

def get_current_directory():
    """Return an absolute directory path for the running code.

    When ``__file__`` is defined (plain Python file) this is the directory
    containing that file; otherwise (e.g. a Jupyter notebook) it is the
    absolute form of the empty path, i.e. the current working directory.
    """
    has_file = "__file__" in globals()
    # dirname("") is "", which abspath resolves against the working directory
    base_directory = os.path.dirname(__file__) if has_file else os.path.dirname("")
    return os.path.abspath(base_directory)


def find_folder(c, data_folder, data_category, output_folder):
    """Build (without creating) the output folder path for ``c``.

    The result is
    ``<parent of current directory>/<data_folder>/<data_category>/<output_folder>/<c>_<output_folder>``.

    Args:
        c: country identifier; it is formatted into the folder name with an
            f-string, so a list is rendered through its ``repr``.
        data_folder: top-level data directory name.
        data_category: sub-directory name under ``data_folder``.
        output_folder: dataset directory name, also used as the name suffix.
    """
    # The project root is taken to be one level above the current directory
    parent_directory = os.path.abspath(
        os.path.join(get_current_directory(), "..", ".")
    )
    return os.path.join(
        parent_directory,
        data_folder,
        data_category,
        output_folder,
        f"{c}_{output_folder}",
    )


def create_folder(c, data_folder, data_category, output_folder):
    """Ensure the output folder for ``c`` exists and return its path.

    Args:
        c: country identifier, forwarded to ``find_folder``.
        data_folder: top-level data directory name.
        data_category: sub-directory name under ``data_folder``.
        output_folder: dataset directory name.

    Returns:
        The folder path computed by ``find_folder``. A message saying whether
        the folder was created or was already present is printed.
    """
    folder_path = find_folder(c, data_folder, data_category, output_folder)

    # Check up front: makedirs(exist_ok=True) succeeds silently for an existing
    # directory, so it cannot be used to tell "created" from "already there".
    if os.path.isdir(folder_path):
        print(f"Folder '{folder_path}' already exists.")
        return folder_path

    try:
        os.makedirs(folder_path, exist_ok=True)
        print(f"Folder '{folder_path}' created successfully.")
    except FileExistsError:
        # Only reachable when the path exists but is not a directory
        print(f"Path '{folder_path}' exists but is not a directory.")

    return folder_path


In [18]:
def era5_get_data_single_level(c, dataset, variable_list, year):
    """Download one year of ERA5 single-level data into a NetCDF file.

    Args:
        c: CDS API client exposing ``retrieve(dataset, request, target)``.
        dataset: name of the dataset to query.
        variable_list: variable name(s) to request.
        year: the year to download, e.g. ``"1979"``.

    The request covers every entry of the notebook-level ``data_month``,
    ``data_day`` and ``data_time`` lists over ``area_uk``, and the result is
    written to ``era5_single_level_<year>.nc`` inside the folder returned by
    ``folder_utils.create_folder``. Any failure is reported with ``print``
    instead of being raised, so a batch of downloads can carry on.
    """
    # Built before the try block so the error message below can always use it,
    # even when the failure happens while preparing the output folder.
    output_filename = f"era5_single_level_{year}.nc"
    try:
        output_directory = folder_utils.create_folder(
            country, data_folder, data_category, output_folder
        )
        output_filepath = os.path.join(output_directory, output_filename)
        c.retrieve(
            dataset,
            {
                "product_type": "reanalysis",
                "format": "netcdf",
                # Honour the caller's selection instead of a hard-coded variable
                "variable": variable_list,
                "year": year,
                "month": data_month,
                "day": data_day,
                "time": data_time,
                "area": area_uk,  # the UK range
            },
            output_filepath,
        )

        print(f"{output_filename} done!")

    except Exception as e:
        print(f"Error downloading {output_filename}: {e}\n")

In [None]:
def era5_get_data_single_level_st(c, dataset, variable_list, year, month):
    """Download one month of ERA5 single-level data into a NetCDF file.

    Args:
        c: CDS API client exposing ``retrieve(dataset, request, target)``.
        dataset: name of the dataset to query.
        variable_list: variable name(s) to request.
        year: the year to download, e.g. ``"1979"``.
        month: the month to download, e.g. ``"02"``.

    Only the days that exist in the given month (see ``days_check``) are
    requested, for every entry of ``data_time`` over ``area_uk``. The result
    is written to ``era5_single_level_<year>_<month>.nc`` inside the folder
    returned by ``folder_utils.create_folder``. Any failure is reported with
    ``print`` instead of being raised.
    """
    # The month is part of the name: with a year-only name every monthly
    # download would overwrite the previous one. Built before the try block so
    # the error message below can always use it.
    output_filename = f"era5_single_level_{year}_{month}.nc"
    try:
        output_directory = folder_utils.create_folder(
            country, data_folder, data_category, output_folder
        )
        output_filepath = os.path.join(output_directory, output_filename)
        c.retrieve(
            dataset,
            {
                "product_type": "reanalysis",
                "format": "netcdf",
                # Honour the caller's selection instead of a hard-coded variable
                "variable": variable_list,
                "year": year,
                "month": month,
                "day": days_check(year, month),
                "time": data_time,
                "area": area_uk,  # the UK range
            },
            output_filepath,
        )

        print(f"{output_filename} done!")

    except Exception as e:
        print(f"Error downloading {output_filename}: {e}\n")


In [19]:
def thread_function(year, month):
    """Download the data for one (year, month) pair and print how long it took.

    Args:
        year: the year to download, e.g. ``"1979"``.
        month: the month to download, e.g. ``"02"``.
    """
    c = cdsapi.Client()  # Initialize client within the thread

    start_time = time.perf_counter()
    # The per-month downloader is the one that accepts `month`; the yearly
    # variant takes four arguments and raised TypeError when called with five.
    era5_get_data_single_level_st(
        c,
        dataset,
        variable_list,
        year,
        month,
    )
    run_time = time.perf_counter() - start_time
    print(f"Download time: {run_time:.3f} s")


In [None]:
# Sequential download: one request per (year, month) pair, years outermost
for i, k in ((year, month) for year in data_year for month in data_month):
    thread_function(i, k)

In [None]:
# Concurrent download: a pool of 8 worker threads handles every (year, month)
# pair. NOTE: the sequential loop cell above requests the same data; run only
# one of the two cells.
with ThreadPoolExecutor(max_workers=8) as executor:
    # Submit one task per pair and remember which pair each future belongs to,
    # so a failed task can be reported instead of disappearing silently.
    futures = {
        executor.submit(thread_function, year, month): (year, month)
        for year in data_year
        for month in data_month
    }

    # Waiting on each future advances the progress bar as work completes and
    # surfaces any exception raised inside the worker.
    for future, (year, month) in tqdm(futures.items(), total=len(futures)):
        try:
            future.result()
        except Exception as e:
            print(f"Download task {year}-{month} failed: {e}")