# Set Up: Jupyter Notebook

In [1]:
# Import the required libraries
import requests
import pandas as pd

# ----------------- #

# ingester3 // Check the installed version (shell command; can also be run in a terminal):
! pip show ingester3 | grep Version 

# Pandas extensions
from ingester3.extensions import *

# Set up for cache.
from ingester3.scratch import cache_manager

Version: 2.1.0


# Download: Climate Data from CRU and ERA5 data sets
- Reasoning for variable choice
    - Document with more extensive reasoning for why a variable should or should not be included in bruno-gbl's repository: https://github.com/bruno-gbl/climate-data-country/blob/main/docs/Variables%20Overview.md
    - Document also includes information on the variables used in Garret's script to ingest climate variables at the pgm level.
    - Document also includes more detailed information about the specific coding of the different variables.
- CRU vs. ERA5 variables:
    - In this script, each climate data variable is either from the CRU or the ERA5 climate data set. Both are available through the same World Bank Climate Data portal.
    - However, their APIs and the time spans of available data differ slightly. Therefore, they are treated slightly differently before being merged into one dataset.
- Data Specifications on WorldBank Climate Data Platform:
    - CRU data set > timeseries > monthly > 1901-2022 > mean > model "ts4.07"
    - ERA5 data set > timeseries > monthly > 1950-2023 > mean > model "era5" > model label "x0.25"

#### Define Function: Download CRU data

In [3]:
# The API Structure:
# https://cckpapi.worldbank.org/cckp/v1/cru-x0.5_timeseries_{variable_name}_timeseries_monthly_1901-2022_mean_historical_cru_ts4.07_mean/all_countries?_format=json

# Function downloading the climate data from the World Bank Climate Knowledge Portal and turning it into a pandas DataFrame
def download_cru_data(cru_variable, timeout=120):
    """Download one CRU (ts4.07) monthly time series for all countries.

    The series is requested from the World Bank Climate Knowledge Portal API
    (monthly means, 1901-2022) and flattened into a long-format DataFrame with
    one row per (country, date) pair.

    Parameters
    ----------
    cru_variable : str
        Variable code that is inserted into the API URL, e.g. "tas".
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection cannot
        block the notebook indefinitely. Defaults to 120 seconds.

    Returns
    -------
    pandas.DataFrame
        Columns "Country", "Date" and "Value". "Country" holds the top-level
        keys of the API's "data" mapping and "Date" the keys of each nested
        mapping, exactly as delivered by the API. The three columns are
        present even when the API returns no rows.

    Raises
    ------
    RuntimeError
        If the API does not answer with HTTP 200, or if the JSON payload does
        not contain a "data" mapping.
    """
    # Define the URL for the JSON file
    url_climate_data = f"https://cckpapi.worldbank.org/cckp/v1/cru-x0.5_timeseries_{cru_variable}_timeseries_monthly_1901-2022_mean_historical_cru_ts4.07_mean/all_countries?_format=json"

    # Fetch the JSON data using a GET request
    response_climate_data = requests.get(url_climate_data, timeout=timeout)

    # Fail loudly here instead of continuing with a missing payload, which
    # would only surface later as an unrelated-looking TypeError.
    if response_climate_data.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch data for CRU variable '{cru_variable}': "
            f"HTTP {response_climate_data.status_code} from {url_climate_data}"
        )

    climate_data_json = response_climate_data.json()
    climate_data_json_only_data = (
        climate_data_json.get("data") if isinstance(climate_data_json, dict) else None
    )
    if not isinstance(climate_data_json_only_data, dict):
        raise RuntimeError(
            f"Unexpected response for CRU variable '{cru_variable}': "
            f"no 'data' mapping in the JSON returned by {url_climate_data}"
        )

    # Flatten the nested {country: {date: value}} structure into one record per observation
    rows = [
        {"Country": country, "Date": date, "Value": value}
        for country, timeseries in climate_data_json_only_data.items()
        for date, value in timeseries.items()
    ]

    # Passing the columns explicitly keeps the schema stable for an empty result
    return pd.DataFrame(rows, columns=["Country", "Date", "Value"])


#### Download: CRU variables that should definitely be adopted.

In [4]:
# Each call below fetches one CRU variable for all countries via download_cru_data
# and binds the resulting DataFrame to "<variable code>_cru_df".
# NOTE(review): "G:" presumably marks what Garret's pgm-level script already covers — confirm.

# tas // Average Mean Surface Air Temperature // G: Only relative temperature variables
tas_cru_df = download_cru_data("tas")

# tasmax // Average Maximum Surface Air Temperature // G: Only relative temperature variables
tasmax_cru_df = download_cru_data("tasmax")

# tasmin // Average Minimum Surface Air Temperature // G: Only relative temperature variables
tasmin_cru_df = download_cru_data("tasmin")


#### Define Function: Download ERA5 data

In [5]:
# The API Structure:
# https://cckpapi.worldbank.org/cckp/v1/era5-x0.25_timeseries_{variable_name}_timeseries_monthly_1950-2023_mean_historical_era5_x0.25_mean/all_countries?_format=json

# Function downloading the climate data from the World Bank Climate Knowledge Portal and turning it into a pandas DataFrame
def download_era5_data(era5_variable, timeout=120):
    """Download one ERA5 (x0.25) monthly time series for all countries.

    The series is requested from the World Bank Climate Knowledge Portal API
    (monthly means, 1950-2023) and flattened into a long-format DataFrame with
    one row per (country, date) pair.

    Parameters
    ----------
    era5_variable : str
        Variable code that is inserted into the API URL, e.g. "hd35".
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection cannot
        block the notebook indefinitely. Defaults to 120 seconds.

    Returns
    -------
    pandas.DataFrame
        Columns "Country", "Date" and "Value". "Country" holds the top-level
        keys of the API's "data" mapping and "Date" the keys of each nested
        mapping, exactly as delivered by the API. The three columns are
        present even when the API returns no rows.

    Raises
    ------
    RuntimeError
        If the API does not answer with HTTP 200, or if the JSON payload does
        not contain a "data" mapping.
    """
    # Define the URL for the JSON file
    url_climate_data = f"https://cckpapi.worldbank.org/cckp/v1/era5-x0.25_timeseries_{era5_variable}_timeseries_monthly_1950-2023_mean_historical_era5_x0.25_mean/all_countries?_format=json"

    # Fetch the JSON data using a GET request
    response_climate_data = requests.get(url_climate_data, timeout=timeout)

    # Fail loudly here instead of continuing with a missing payload, which
    # would only surface later as an unrelated-looking TypeError.
    if response_climate_data.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch data for ERA5 variable '{era5_variable}': "
            f"HTTP {response_climate_data.status_code} from {url_climate_data}"
        )

    climate_data_json = response_climate_data.json()
    climate_data_json_only_data = (
        climate_data_json.get("data") if isinstance(climate_data_json, dict) else None
    )
    if not isinstance(climate_data_json_only_data, dict):
        raise RuntimeError(
            f"Unexpected response for ERA5 variable '{era5_variable}': "
            f"no 'data' mapping in the JSON returned by {url_climate_data}"
        )

    # Flatten the nested {country: {date: value}} structure into one record per observation
    rows = [
        {"Country": country, "Date": date, "Value": value}
        for country, timeseries in climate_data_json_only_data.items()
        for date, value in timeseries.items()
    ]

    # Passing the columns explicitly keeps the schema stable for an empty result
    return pd.DataFrame(rows, columns=["Country", "Date", "Value"])

#### Download: ERA5 variables that should definitely be adopted.

In [6]:
# Each call below fetches one ERA5 variable for all countries via download_era5_data
# and binds the resulting DataFrame to "<variable code>_era5_df".

# cdd65 // Cooling Degree Days (ref-65°F)
cdd65_era5_df = download_era5_data("cdd65")

# hd35 // Number of Hot Days (Tmax > 35°C)
hd35_era5_df = download_era5_data("hd35")

# hd40 // Number of Hot Days (Tmax > 40°C)
hd40_era5_df = download_era5_data("hd40")

# hd42 // Number of Hot Days (Tmax > 42°C)
hd42_era5_df = download_era5_data("hd42")

# hdd65 // Heating Degree Days (ref-65°F)
hdd65_era5_df = download_era5_data("hdd65")

# hi35 // Number of Days with Heat Index > 35°C
hi35_era5_df = download_era5_data("hi35")

# hi37 // Number of Days with Heat Index > 37°C
hi37_era5_df = download_era5_data("hi37")

# hurs // Relative Humidity
hurs_era5_df = download_era5_data("hurs")

# prpercnt // Precipitation Percent Change
prpercnt_era5_df = download_era5_data("prpercnt")

# rx1day // Average Largest 1-Day Precipitation
rx1day_era5_df = download_era5_data("rx1day")

# rx5day // Average Largest 5-Day Cumulative Precipitation
rx5day_era5_df = download_era5_data("rx5day")

# tnn // Minimum of Daily Min-Temperature
tnn_era5_df = download_era5_data("tnn")

# txx // Maximum of Daily Max-Temperature
txx_era5_df = download_era5_data("txx")


#### Download: ERA5 variables that should probably be adopted.

In [7]:
# Each call below fetches one ERA5 variable for all countries via download_era5_data
# and binds the resulting DataFrame to "<variable code>_era5_df".
# NOTE(review): "G:" presumably marks what Garret's pgm-level script already covers — confirm.

# pr // Precipitation // G: "Total wet day precipitation" // Similar, but not identical, to the existing "pr" variable // This variable is average precipitation over a given time // Potentially also includes smaller amounts of rain below 1mm
pr_era5_df = download_era5_data("pr")

# hd30 // Number of Hot Days (Tmax > 30°C) // G: - // Overlapping with existing hd35, hd40, hd42
hd30_era5_df = download_era5_data("hd30")

# hd50 // Number of Hot Days (Tmax > 50°C) // G: - // Only daily maximum temperature per month/year.
hd50_era5_df = download_era5_data("hd50")

# hi39 // Number of Days with Heat Index > 39°C // Did not include variable that was available
hi39_era5_df = download_era5_data("hi39")

# hi41 // Number of Days with Heat Index > 41°C // G: - // Did not include variable that was available
hi41_era5_df = download_era5_data("hi41")

# r50mm // Number of Days with Precipitation >50mm // G: - // Only "Very heavy precipitation days" = days with more than 20mm
r50mm_era5_df = download_era5_data("r50mm")

# r95ptot // Precipitation amount during wettest days // G: - // Did not include identical variable "Very wet day precipitation"
r95ptot_era5_df = download_era5_data("r95ptot")

# tr23 // Number of Tropical Nights (T-min > 23°C) // [Very similar to "tr"]
tr23_era5_df = download_era5_data("tr23")

# tr26 // Number of Tropical Nights (T-min > 26°C) // [Very similar to "tr"]
tr26_era5_df = download_era5_data("tr26")

# tr29 // Number of Tropical Nights (T-min > 29°C) // [Very similar to "tr"]
tr29_era5_df = download_era5_data("tr29")

# tr32 // Number of Tropical Nights (T-min > 32°C) // [Very similar to "tr"]
tr32_era5_df = download_era5_data("tr32")


#### Download: ERA5 variables that should probably NOT be adopted.

In [8]:
# Each call below fetches one ERA5 variable for all countries via download_era5_data
# and binds the resulting DataFrame to "<variable code>_era5_df".
# NOTE(review): "G:" presumably marks what Garret's pgm-level script already covers — confirm.

# fd // Number of Frost Days (Tmin < 0°C) // G: Almost identical variable // G: count variable // Here: Average over time, i.e. data period. Smoothed-out, long-term perspective of Frost Days; less interesting for forecasting.
fd_era5_df = download_era5_data("fd")

# id // Number of Ice Days (Tmax < 0°C) // G: Almost identical variable // G: count variable // Here: Average over time, i.e. data period. Smoothed-out, long-term perspective of Ice Days; less interesting for forecasting.
id_era5_df = download_era5_data("id")


# Data Wrangling: Preparing for ingestion

#### Turning all variable-specific dataframes into one dictionary of dataframes
- Easier filtering for which variables to include in the ingestion.
- Simplified code.

In [None]:
# Create a dictionary of DataFrames keyed by variable name. The keys are used later
# (in merge_datasets_from_dict) to rename each frame's "Value" column to its variable name.
# To exclude a variable from the ingestion, remove its entry here.
dataframes_dict = {
    # CRU variables that should definitely be adopted.
    "tas": tas_cru_df,
    "tasmax": tasmax_cru_df,
    "tasmin": tasmin_cru_df,
    # ERA5 variables that should definitely be adopted.
    "cdd65": cdd65_era5_df,
    "hd35": hd35_era5_df,
    "hd40": hd40_era5_df,
    "hd42": hd42_era5_df,
    "hdd65": hdd65_era5_df,
    "hi35": hi35_era5_df,
    "hi37": hi37_era5_df,
    "hurs": hurs_era5_df,
    "prpercnt": prpercnt_era5_df,
    "rx1day": rx1day_era5_df,
    "rx5day": rx5day_era5_df,
    "tnn": tnn_era5_df,
    "txx": txx_era5_df,
    # ERA5 variables that should probably be adopted.
    "pr": pr_era5_df,
    "hd30": hd30_era5_df,
    "hd50": hd50_era5_df,
    "hi39": hi39_era5_df,
    "hi41": hi41_era5_df,
    "r50mm": r50mm_era5_df,
    "r95ptot": r95ptot_era5_df,
    "tr23": tr23_era5_df,
    "tr26": tr26_era5_df,
    "tr29": tr29_era5_df,
    "tr32": tr32_era5_df,
    # ERA5 variables that should probably NOT be adopted.
    "fd": fd_era5_df,
    "id": id_era5_df,
}

# Remove the standalone "<variable>_cru_df" / "<variable>_era5_df" names from the
# notebook namespace so that the dictionary is the only handle on the data.
# pop(..., None) drops a name when it exists and does nothing when it does not.
for key in dataframes_dict:
    globals().pop(key + "_cru_df", None)
    globals().pop(key + "_era5_df", None)

# The loop variable is not needed afterwards either
del key

#### Bring all dataframes in the dictionary to the same time period: Start from January 1990
- Convert the "Date" column to a datetime object
- Filter the data to only include data from January 1990 onwards

In [10]:
# Create a function to filter the time period to only include data from January 1990 onwards
def filter_timeseries_1990(df):
    """Return the rows of ``df`` dated January 1990 or later, with "Date" as datetime.

    The input DataFrame is left unmodified: the converted "Date" column is
    written to a new frame rather than assigned back onto the caller's object.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Date" column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New DataFrame with the same columns and the original index labels of
        the retained rows; "Date" holds the parsed datetime values.
    """
    # Convert once and reuse the result for both the new column and the row mask
    dates = pd.to_datetime(df["Date"])
    # assign() builds a new frame, so the caller's DataFrame keeps its original "Date" values
    return df.assign(Date=dates).loc[dates >= pd.Timestamp("1990-01-01")]

# Restrict every DataFrame in the dictionary to January 1990 onwards (keys stay unchanged)
dataframes_dict = {
    var_name: filter_timeseries_1990(frame)
    for var_name, frame in dataframes_dict.items()
}

#### Turn all variables into one dataframe

In [11]:
# Function to merge dataframes from a dictionary into a single DataFrame
def merge_datasets_from_dict(dataframes):
    # Initialize an empty DataFrame to start merging
    merged_df = pd.DataFrame()

    # Iterate through the dictionary
    for var_name, df in dataframes.items():
        # Check if the DataFrame has the expected structure
        if set(df.columns) != {"Country", "Date", "Value"}:
            raise ValueError(f"DataFrame for '{var_name}' does not have the expected columns: {df.columns}")
        
        # Rename the 'Value' column to the variable name
        df = df.rename(columns={"Value": var_name})

        # Merge with the existing DataFrame
        if merged_df.empty:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on=["Country", "Date"], how="outer")
    
    return merged_df

# Merge all per-variable DataFrames into one wide DataFrame:
# one column per key of dataframes_dict, outer-joined on "Country" and "Date".
merged_df = merge_datasets_from_dict(dataframes_dict)

## Data Wrangling: Turn merged data set of all climate variables into a suitable VIEWS format
- Change Country Codes
- Change Month Codes
- Commands based on ingester3


In [12]:
# Option 1: Both at the same time
# Potential issues:
#     The c_id is computed at the month of the datetime_col.
#     The transformation of the date variable might not work: the dates in the data sets have no "day" element. In that case, that would have to be addressed beforehand.
#     (The "Date" column has already been passed through pd.to_datetime in filter_timeseries_1990.)
# NOTE(review): the recorded run of this cell ended with "KeyError: 0"; which of the two ingester3 calls raised it is not visible here — confirm before relying on the result.

# Plain rebinding: climate_variables_df refers to the same object as merged_df (no copy is made).
climate_variables_df = merged_df
# ingester3 `cm` accessor — presumably derives the VIEWS month id from the 'Date' column; TODO confirm against the ingester3 documentation.
climate_variables_df = pd.DataFrame.cm.from_datetime(climate_variables_df, datetime_col='Date')
# ingester3 `cm` accessor — presumably maps the codes in 'Country' to VIEWS country ids; month_col=None is passed explicitly, verify that this is the intended argument.
climate_variables_df = pd.DataFrame.cm.from_iso(climate_variables_df, iso_col='Country', month_col=None) # TODO: looks like a problem when calling the VIEWS database. Fix tomorrow.


KeyError: 0