# Set Up: Jupyter Notebook

In [1]:
# Import the required libraries
import requests
import pandas as pd
import pycountry

# ----------------- #

# Pandas extensions
from ingester3.extensions import *

# Set up cache manager
from ingester3.scratch import cache_manager

In [2]:
# Check version of ingester3
import re
import importlib.metadata


def version_tuple(version_string):
    """Return the leading numeric components of a version string as a tuple of ints.

    Tuples compare numerically, whereas the raw strings compare character by
    character and would rank '1.10.0' below '1.8.1'. Parsing stops at the first
    dot-separated component that does not start with a digit; a trailing
    non-numeric suffix is ignored, so '1.8.1rc1' yields (1, 8, 1).
    """
    components = []
    for part in version_string.split("."):
        digits = re.match(r"\d+", part)
        if digits is None:
            break
        components.append(int(digits.group()))
    return tuple(components)


# importlib.metadata replaces the deprecated pkg_resources lookup
try:
    installed_version = importlib.metadata.version('ingester3')
except importlib.metadata.PackageNotFoundError:
    installed_version = None

if installed_version is None:
    print("ingester3 is not installed. Run:\npip install ingester3\nto install it")
elif version_tuple(installed_version) >= version_tuple('1.8.1'):
    print (f"You are running version {installed_version} which is consistent with the documentation")
else:
    print (f"""
You are running an obsolete version ({installed_version}). Run:
pip install ingester3 --upgrade 
to upgrade""")

del installed_version

You are running version 2.1.0 which is consistent with the documentation


  from pkg_resources import get_distribution


# Download: Climate Data from CRU and ERA5 data sets
- Reasoning for variable choice
    - Document with more extensive reasoning for why a variable should or should not be included is in bruno-gbl's repository: https://github.com/bruno-gbl/climate-data-country/blob/main/docs/Variables%20Overview.md
    - Document includes information on the variables used in Garret's script to ingest climate variables at the pgm level.
    - Document includes more detailed information about the specific coding of the different variables.
- CRU vs. ERA5 variables:
    - In this script, each climate data variable is either from the CRU or ERA5 climate data set. Both are available through the same World Bank Climate Data portal ("Climate Change Knowledge Portal" - CCKP).
    - However, the structure of the API and the time-span of the two data sets differ slightly. Therefore, they are treated slightly differently before being merged into one dataset.
- Data Download Platform
    - https://climateknowledgeportal.worldbank.org/download-data
    - A user account needs to be created to access the APIs through the platform. However, the APIs themselves are not user-specific. This script can be run without having an account on the platform.
- Data Specifications on the WorldBank Climate Data Platform CCKP:
    - CRU data set > timeseries > monthly > 1901-2022 > mean > model "ts4.07"
    - ERA5 data set > timeseries > monthly > 1950-2023 > mean > model "era5" > model label "x0.25"

#### Define Function: Download CRU data

In [3]:
# The API Structure for CRU data is as follows:
# https://cckpapi.worldbank.org/cckp/v1/cru-x0.5_timeseries_{variable_name}_timeseries_monthly_1901-2022_mean_historical_cru_ts4.07_mean/all_countries?_format=json

# Example for the variable "tas": API to download JSON file. Can be studied using a normal browser.
# https://cckpapi.worldbank.org/cckp/v1/cru-x0.5_timeseries_tas_timeseries_monthly_1901-2022_mean_historical_cru_ts4.07_mean/all_countries?_format=json


# Function downloading the climate data from the World Bank Climate Knowledge Portal and turning it from a JSON file into a pandas DataFrame
def download_cru_data(cru_variable, timeout=120):
    """Download one CRU variable for all countries from the CCKP API as a long DataFrame.

    Parameters
    ----------
    cru_variable : str
        Variable code inserted into the CRU endpoint URL (for example "tas").
    timeout : float or tuple, optional
        Passed on to ``requests.get`` so that a stalled connection raises
        instead of blocking the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per (outer key, inner key) pair of the payload's "data" entry,
        with the columns "Country", "Date" and "Value". The columns are present
        even when the payload contains no rows.

    Raises
    ------
    requests.HTTPError
        If the API answers with a status code other than 200.
    ValueError
        If the JSON payload has no dictionary under the key "data".
    """
    # Define the URL for the JSON file
    url_climate_data = f"https://cckpapi.worldbank.org/cckp/v1/cru-x0.5_timeseries_{cru_variable}_timeseries_monthly_1901-2022_mean_historical_cru_ts4.07_mean/all_countries?_format=json"

    # Fetch the JSON data using a GET request
    response_climate_data = requests.get(url_climate_data, timeout=timeout)

    # Stop right here on a failed request; continuing with no payload would only
    # surface later as an unrelated TypeError.
    if response_climate_data.status_code != 200:
        raise requests.HTTPError(
            f"Failed to fetch data for CRU variable '{cru_variable}' "
            f"(HTTP status {response_climate_data.status_code}).",
            response=response_climate_data,
        )

    # Parse the JSON response and pick the nested "data" mapping
    climate_data_json = response_climate_data.json()
    climate_data_json_only_data = (
        climate_data_json.get("data") if isinstance(climate_data_json, dict) else None
    )
    if not isinstance(climate_data_json_only_data, dict):
        raise ValueError(
            f"Response for CRU variable '{cru_variable}' does not contain a 'data' dictionary."
        )

    # Flatten the nested structure into a list of dictionaries
    rows = [
        {"Country": country, "Date": date, "Value": value}
        for country, timeseries in climate_data_json_only_data.items()
        for date, value in timeseries.items()
    ]

    # Naming the columns explicitly keeps the schema intact for an empty payload
    return pd.DataFrame(rows, columns=["Country", "Date", "Value"])


#### Download: CRU variables that should definitely be adopted.

In [4]:
# Average mean surface air temperature (tas).
# Garret's pgm-level climate data ingestion only has relative temperature variables.
tas_cru_df = download_cru_data(cru_variable="tas")

# Average maximum surface air temperature (tasmax).
# Garret's pgm-level climate data ingestion only has relative temperature variables.
tasmax_cru_df = download_cru_data(cru_variable="tasmax")

# Average minimum surface air temperature (tasmin).
# Garret's pgm-level climate data ingestion only has relative temperature variables.
tasmin_cru_df = download_cru_data(cru_variable="tasmin")


#### Define Function: Download ERA5 data

In [5]:
# The API Structure:
# https://cckpapi.worldbank.org/cckp/v1/era5-x0.25_timeseries_{variable_name}_timeseries_monthly_1950-2023_mean_historical_era5_x0.25_mean/all_countries?_format=json

# Example for the variable "cdd65": API to download JSON file. Can be studied using a normal browser.
# https://cckpapi.worldbank.org/cckp/v1/era5-x0.25_timeseries_cdd65_timeseries_monthly_1950-2023_mean_historical_era5_x0.25_mean/all_countries?_format=json


# Function downloading the climate data from the World Bank Climate Knowledge Portal and turning it into a pandas DataFrame
def download_era5_data(era5_variable, timeout=120):
    """Download one ERA5 variable for all countries from the CCKP API as a long DataFrame.

    Parameters
    ----------
    era5_variable : str
        Variable code inserted into the ERA5 endpoint URL (for example "cdd65").
    timeout : float or tuple, optional
        Passed on to ``requests.get`` so that a stalled connection raises
        instead of blocking the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per (outer key, inner key) pair of the payload's "data" entry,
        with the columns "Country", "Date" and "Value". The columns are present
        even when the payload contains no rows.

    Raises
    ------
    requests.HTTPError
        If the API answers with a status code other than 200.
    ValueError
        If the JSON payload has no dictionary under the key "data".
    """
    # Define the URL for the JSON file
    url_climate_data = f"https://cckpapi.worldbank.org/cckp/v1/era5-x0.25_timeseries_{era5_variable}_timeseries_monthly_1950-2023_mean_historical_era5_x0.25_mean/all_countries?_format=json"

    # Fetch the JSON data using a GET request
    response_climate_data = requests.get(url_climate_data, timeout=timeout)

    # Stop right here on a failed request; continuing with no payload would only
    # surface later as an unrelated TypeError.
    if response_climate_data.status_code != 200:
        raise requests.HTTPError(
            f"Failed to fetch data for ERA5 variable '{era5_variable}' "
            f"(HTTP status {response_climate_data.status_code}).",
            response=response_climate_data,
        )

    # Parse the JSON response and pick the nested "data" mapping
    climate_data_json = response_climate_data.json()
    climate_data_json_only_data = (
        climate_data_json.get("data") if isinstance(climate_data_json, dict) else None
    )
    if not isinstance(climate_data_json_only_data, dict):
        raise ValueError(
            f"Response for ERA5 variable '{era5_variable}' does not contain a 'data' dictionary."
        )

    # Flatten the nested structure into a list of dictionaries
    rows = [
        {"Country": country, "Date": date, "Value": value}
        for country, timeseries in climate_data_json_only_data.items()
        for date, value in timeseries.items()
    ]

    # Naming the columns explicitly keeps the schema intact for an empty payload
    return pd.DataFrame(rows, columns=["Country", "Date", "Value"])

#### Download: ERA5 variables that should definitely be adopted.

In [6]:
# Cooling degree days, reference 65°F (cdd65)
cdd65_era5_df = download_era5_data(era5_variable="cdd65")

# Number of hot days with Tmax > 35°C (hd35)
hd35_era5_df = download_era5_data(era5_variable="hd35")

# Number of hot days with Tmax > 40°C (hd40)
hd40_era5_df = download_era5_data(era5_variable="hd40")

# Number of hot days with Tmax > 42°C (hd42)
hd42_era5_df = download_era5_data(era5_variable="hd42")

# Heating degree days, reference 65°F (hdd65)
hdd65_era5_df = download_era5_data(era5_variable="hdd65")

# Number of days with heat index > 35°C (hi35)
hi35_era5_df = download_era5_data(era5_variable="hi35")

# Number of days with heat index > 37°C (hi37)
hi37_era5_df = download_era5_data(era5_variable="hi37")

# Relative humidity (hurs)
hurs_era5_df = download_era5_data(era5_variable="hurs")

# Precipitation percent change (prpercnt)
prpercnt_era5_df = download_era5_data(era5_variable="prpercnt")

# Average largest 1-day precipitation (rx1day)
rx1day_era5_df = download_era5_data(era5_variable="rx1day")

# Average largest 5-day cumulative precipitation (rx5day)
rx5day_era5_df = download_era5_data(era5_variable="rx5day")

# Minimum of daily min-temperature (tnn)
tnn_era5_df = download_era5_data(era5_variable="tnn")

# Maximum of daily max-temperature (txx)
txx_era5_df = download_era5_data(era5_variable="txx")


#### Download: ERA5 variables that should probably be adopted.

In [7]:
# Precipitation (pr).
# Garret (pgm-level climate data ingestion) has "Total wet day precipitation", the
# annually summed precipitation on days with precipitation > 1mm. The two variables are
# similar but not identical: "pr" from ERA5 measures the average precipitation over a
# given time and potentially also includes smaller variation of rain below 1mm. It could
# therefore be useful to include, despite not adding a lot of new knowledge.
pr_era5_df = download_era5_data(era5_variable="pr")

# Number of hot days with Tmax > 30°C (hd30).
# Garret (pgm-level climate data ingestion): no counterpart. Overlaps with the previously
# loaded hd35, hd40 and hd42; does no harm, but likely does not add much new information.
hd30_era5_df = download_era5_data(era5_variable="hd30")

# Number of hot days with Tmax > 50°C (hd50).
# Garret (pgm-level climate data ingestion): no counterpart. Overlaps with the previously
# loaded hd35, hd40 and hd42; does no harm, but likely does not add much new information.
hd50_era5_df = download_era5_data(era5_variable="hd50")

# Number of days with heat index > 39°C (hi39).
# Garret chose not to include an identical variable that was available on the data
# platform he used. Therefore, this variable could potentially be useful to include.
hi39_era5_df = download_era5_data(era5_variable="hi39")

# Number of days with heat index > 41°C (hi41).
# Garret chose not to include an identical variable that was available on the data
# platform he used. Therefore, this variable could potentially be useful to include.
hi41_era5_df = download_era5_data(era5_variable="hi41")

# Number of days with precipitation > 50mm (r50mm).
# Garret only has "Very heavy precipitation days" (days with more than 20mm). Slight
# overlap, but "r50mm" could potentially be useful to include, despite not adding a lot
# of new knowledge.
r50mm_era5_df = download_era5_data(era5_variable="r50mm")

# Precipitation amount during wettest days (r95ptot).
# Garret chose not to include an identical variable, "Very wet day precipitation", that
# was available on the data platform he used. Therefore, this variable could potentially
# be useful to include.
r95ptot_era5_df = download_era5_data(era5_variable="r95ptot")

# Number of tropical nights with T-min > 23°C (tr23).
# Garret has a similar variable "tr", the number of nights > 20°C. This variable is
# slightly different and would add new information to the dataset, despite being similar.
tr23_era5_df = download_era5_data(era5_variable="tr23")

# Number of tropical nights with T-min > 26°C (tr26). See reasoning for tr23.
tr26_era5_df = download_era5_data(era5_variable="tr26")

# Number of tropical nights with T-min > 29°C (tr29). See reasoning for tr23.
tr29_era5_df = download_era5_data(era5_variable="tr29")

# Number of tropical nights with T-min > 32°C (tr32). See reasoning for tr23.
tr32_era5_df = download_era5_data(era5_variable="tr32")


#### Download: ERA5 variables that should probably NOT be adopted.

In [8]:
# Number of frost days, Tmin < 0°C (fd).
# Garret (pgm-level climate data ingestion) has an almost identical variable, coded as a
# count. Here it is an average over time, i.e. over the data period: a smoothed-out,
# long-term perspective of frost days. Less interesting for forecasting, so it should
# not be included.
fd_era5_df = download_era5_data(era5_variable="fd")

# Number of ice days, Tmax < 0°C (id).
# Garret (pgm-level climate data ingestion) has an almost identical variable, coded as a
# count. Here it is an average over time, i.e. over the data period: a smoothed-out,
# long-term perspective of ice days. Less interesting for forecasting, so it should
# not be included.
id_era5_df = download_era5_data(era5_variable="id")


# Data Wrangling: Merge all variable-specific dataframes

#### Turning all variable-specific dataframes into one dictionary of dataframes
- Easier filtering for which variables to include in the ingestion.
- Simplified code.

In [9]:
# Collect every downloaded frame under its final column label ("climate_<variable>"),
# so the value column can be named after the variable when the frames are merged.
dataframes_dict = {
    # CRU variables that should definitely be adopted.
    "climate_tas": tas_cru_df,
    "climate_tasmax": tasmax_cru_df,
    "climate_tasmin": tasmin_cru_df,
    # ERA5 variables that should definitely be adopted.
    "climate_cdd65": cdd65_era5_df,
    "climate_hd35": hd35_era5_df,
    "climate_hd40": hd40_era5_df,
    "climate_hd42": hd42_era5_df,
    "climate_hdd65": hdd65_era5_df,
    "climate_hi35": hi35_era5_df,
    "climate_hi37": hi37_era5_df,
    "climate_hurs": hurs_era5_df,
    "climate_prpercnt": prpercnt_era5_df,
    "climate_rx1day": rx1day_era5_df,
    "climate_rx5day": rx5day_era5_df,
    "climate_tnn": tnn_era5_df,
    "climate_txx": txx_era5_df,
    # ERA5 variables that should probably be adopted.
    "climate_pr": pr_era5_df,
    "climate_hd30": hd30_era5_df,
    "climate_hd50": hd50_era5_df,
    "climate_hi39": hi39_era5_df,
    "climate_hi41": hi41_era5_df,
    "climate_r50mm": r50mm_era5_df,
    "climate_r95ptot": r95ptot_era5_df,
    "climate_tr23": tr23_era5_df,
    "climate_tr26": tr26_era5_df,
    "climate_tr29": tr29_era5_df,
    "climate_tr32": tr32_era5_df,
    # ERA5 variables that should probably NOT be adopted.
    "climate_fd": fd_era5_df,
    "climate_id": id_era5_df,
}

# Remove the standalone per-variable frames from the notebook namespace to avoid
# confusion; each label may exist with a "_cru_df" or an "_era5_df" suffix.
for label in dataframes_dict:
    short_name = label.replace("climate_", "")
    for suffix in ("_cru_df", "_era5_df"):
        globals().pop(short_name + suffix, None)

# Drop the loop helpers as well once the cleanup is done
del label, short_name, suffix

#### Bring all dataframes in the dictionary to the same time period: Start from January 1990
- Convert the "Date" column to a datetime object
- Filter the data to only include data from January 1990 onwards

In [10]:
# Create a function to filter the time period to only include data from January 1990 onwards
def filter_timeseries_1990(df, start_date="1990-01-01"):
    """Return the rows of ``df`` dated on or after ``start_date``, with "Date" as datetime.

    The input frame is left untouched: the datetime conversion happens on a
    copy, so the function can be re-applied to its own output (for example when
    the notebook cell is run twice) without assigning into a slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Date" column that ``pandas.to_datetime`` can parse.
    start_date : str or datetime-like, optional
        First date to keep (inclusive). Defaults to January 1990.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and original index labels, restricted
        to rows whose "Date" is greater than or equal to ``start_date``.
    """
    # assign() builds a new frame, so the caller's "Date" column keeps its dtype
    converted = df.assign(Date=pd.to_datetime(df["Date"]))
    # Filter the DataFrame to only include data from the start date onwards
    return converted.loc[converted["Date"] >= pd.Timestamp(start_date)]

# Replace every frame in the dictionary by its date-filtered counterpart
dataframes_dict = {
    label: filter_timeseries_1990(frame)
    for label, frame in dataframes_dict.items()
}

#### Merge all variables from the dictionary into one dataframe

In [11]:
# Function to merge dataframes from a dictionary into a single DataFrame
def merge_datasets_from_dict(dataframes):
    # Initialize an empty DataFrame to start merging
    merged_df = pd.DataFrame()

    # Iterate through the dictionary
    for var_name, df in dataframes.items():
        # Check if the DataFrame has the expected structure
        if set(df.columns) != {"Country", "Date", "Value"}:
            raise ValueError(f"DataFrame for '{var_name}' does not have the expected columns: {df.columns}")
        
        # Rename the 'Value' column to the variable name
        df = df.rename(columns={"Value": var_name})

        # Merge with the existing DataFrame
        if merged_df.empty:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on=["Country", "Date"], how="outer")
    
    return merged_df

# Merge all per-variable frames directly into the final wide table
climate_variables_df = merge_datasets_from_dict(dataframes_dict)

# Clean up workspace: the dictionary of single-variable frames is no longer needed
del dataframes_dict

# Data Wrangling: Turn merged data set into a suitable VIEWS format
- Add Month ID
- Add Country ID
- Control for changes in country codes over time, like Sudan.


#### Wrangling: Inserting month_id column
- Create month_id column based on the existing DateTime column "Date".

In [12]:
# Adapt the data to VIEWS format using the ingester3 pandas extensions (`cm` accessor).
# The month identifier is derived from the datetime column "Date"; later cells refer to
# the resulting column as "month_id".
climate_variables_df = pd.DataFrame.cm.from_datetime(climate_variables_df, datetime_col='Date')

#### Wrangling: Inserting c_id column
- Problem
    - The CCKP data only provides historical climate data within current geographical borders. If a country had territorial changes within our data period (1990-now), we only have accurate data for the territory in the most recent definition.
    - Example: Climate data for country Sudan (SDN) is for historical climate within today's borders of Sudan, i.e. northern Sudan.
    - Proof: https://climateknowledgeportal.worldbank.org/country/sudan
    - This can be confusing, as the pre-2011 Sudan, i.e. "whole" Sudan, had the same iso3-code "SDN".
- Code options within ingester3:
    - pd.DataFrame.cm.from_iso(df=climate_variables_df_c_id_1, iso_col='Country', month_col= None ) 
        - => Assumes the ISO3-codes apply to the current geographical configurations
    - pd.DataFrame.cm.from_iso(df=climate_variables_df_c_id_1, iso_col='Country', month_col='month_id') 
        - => Assumes the ISO3-codes apply to the geographical configurations at a given month.
        - => Generally misleading. ISO3-code SDN before 2011 was used for all of Sudan. The climate data at hand however does not.
        - => As a result, the month column should ideally not be specified so that the ingester3 code uses the most up-to-date interpretations of the ISO3-Codes.
        - => Alternatively, this approach can be used, if afterwards all rows with older c_id-interpretations of the ISO3-Codes are filtered out.
- Consequence: Potential approaches
    - Approach A: Specify month_col as "month_id". Then filter all countries to only keep the rows with the most recent c_id, if there are more than one.
        - Dataframe: climate_variables_df_A
        - => Accurate data. 
        - => Output: Only country-months that exist in the VIEWS database. 
        - => Example: All pre-2011 data on historical climate in northern Sudan (c_id 245) is filtered out.
        - => Con: Code is longer and potentially more prone to errors. 
    - Approach B: Specify month_col as "None".
        - Dataframe: climate_variables_df_B
        - => Also accurate data.
        - => Less error prone, as it only uses ingester3 functions.
        - => Output: Produces country-months that are not useful to VIEWS, but likely also not harmful.
        - => Example: "SDN" is correctly coded as c_id "245" for the entire data period. "245" represents the current Sudan, i.e. northern Sudan. This is coherent with the CCKP climate data at hand, which for every time point of SDN represents historical climate data for northern Sudan. However, c_id 245 only exists in the VIEWS data base from July 2011 onwards. So, every row of the c_id 245 (northern Sudan) from January 1990 to June 2011 is obsolete.

- **Open Question for Jim and Mihai**: Is it a problem, if we have obsolete data, like pre-2011 (northern) Sudan/SDN/245 data, even though the c_id 245 did not exist before 2011? Does ingester3 just filter them out?
    - If yes: Go with Approach B.
    - If no: Go with Approach A.
- **Second Question/Idea to try in January**
    - Maybe the obsolete data in Approach B can be filtered out, by soft-validating not on the basis of the ISO3-Code, but on the c_id variable?

#### Wrangling: Inserting c_id column // Approach A

#### Approach A: Step 1
- Soft-validate and filter for country-months that exist in VIEWS, based on month_id and ISO3-code.
- Create month-specific c_id, by specifying month_col='month_id'.
- Note: After this step, this data is wrongly coded and NEEDS to be filtered in a second step.
- Explanation (see also above)
    - The output after step 1 of approach A is wrong, as the month-specific coding of the c_id means that ISO3-Codes that changed meaning within our data period (1990-now), are coded in the meaning of their specific month. This wrong data needs to be filtered out in a step 2.
    - Example: ISO3-code SDN before 2011 was used for all of Sudan. However, the climate data at hand for "SDN" always refers to the modern northern Sudan borders. Therefore, the pre July 2011 coding of "SDN" as c_id "59" (whole Sudan) and not "245" (northern Sudan) is incorrect.

In [13]:
# Soft validation: adds a "valid_id" flag that is True only for country-months
# that match the VIEWS database.
climate_variables_df_A = pd.DataFrame.cm.soft_validate_iso(
    climate_variables_df,
    iso_col="Country",
    month_col="month_id",
)

# Keep only the country-months flagged as valid
climate_variables_df_A = climate_variables_df_A.loc[
    climate_variables_df_A["valid_id"].eq(True)
]

# Create the month-specific c_id column
climate_variables_df_A = pd.DataFrame.cm.from_iso(
    df=climate_variables_df_A,
    iso_col="Country",
    month_col="month_id",
)


#### Approach A: Step 2

- Filter the dataset to keep only the rows with the latest/most up-to-date 'c_id' for each 'Country'.
- If an ISO3-code represents different geographical areas at different points in time, we only keep the most recent.
- This ensures that climate data from a country's geographical area today is not used as data for a country with an identical ISO3-code but a different geographical area.

- Obvious example: Sudan (SDN)
    - In the climate data at hand, the ISO3-Code "SDN" is used to label historical data for the region of today's (northern) Sudan.
    - For July 2011 onwards, the month-specific ingester3 coding of c_id codes this correctly as c_id "245".
    - However, for pre-July 2011, the month-specific ingester3 coding of c_id codes the country-months as having the c_id "59", which represents all of Sudan.
    - The problem stems from Sudan changing shape in 2011, but keeping the same ISO3-Code "SDN".
    - Therefore, the following lines of code filter and delete any rows with the ISO3-Code "SDN" that have the wrong c_id "59" assigned to them.
    - We are left with only rows of northern Sudan (c_id 245) from July 2011 onwards, which marks the creation of the new state Sudan, that incidentally kept using the ISO3-Code "SDN".

In [14]:
# Sort the data by 'Country', 'c_id', and 'month_id' to give the result a deterministic row order
climate_variables_df_A = climate_variables_df_A.sort_values(by=['Country', 'c_id', 'month_id'], ascending=[True, True, True])

# Identify the latest 'c_id' for each 'Country': the c_id of the row with the highest
# month_id. Taking the last row after sorting by c_id would instead pick the numerically
# largest c_id, which is only the latest one if ids happen to grow over time.
latest_cid_per_country = (
    climate_variables_df_A
    .sort_values(by=['Country', 'month_id'])
    .drop_duplicates(subset='Country', keep='last')
    .set_index('Country')['c_id']
)

# Filter the dataset to keep only the rows whose 'c_id' equals the latest 'c_id' of their
# own 'Country' (a per-country comparison rather than membership in the pooled id list)
climate_variables_df_A = climate_variables_df_A[
    climate_variables_df_A['c_id'] == climate_variables_df_A['Country'].map(latest_cid_per_country)
]

# Drop unnecessary objects
del latest_cid_per_country

### Wrangling: Inserting c_id column // Approach B
- Soft-validate and filter for country-months that exist in VIEWS, based on month_id and ISO3-code.
- Create c_id based on the most up-to-date meaning of the ISO3-Codes, by specifying month_col= None.

- Problem: For countries with ISO3-Codes that had territorial changes in the data period, this creates obsolete data.
    - Soft validation does not filter them out, because the ISO3-Code *does* exist. 
        - **Idea**: Maybe one can soft-validate not on the basis of the ISO3-Code, but on the newly created c_id variable?
    - The obsolete data is not wrong, but just not useful to Views.

- Example: Sudan
    - Using this approach, the ISO3-Code "SDN" is correctly interpreted as representing northern Sudan for all data points. The c_id for modern, northern Sudan is 245.
    - However, in the VIEWS data base, the c_id 245 only exists from July 2011 onwards.
    - Therefore, all datapoints from January 1990 to June 2011 are not useful for VIEWS and could create problems during the ingestion.


- Further cases to which this applies
    - SRB/ Serbia: Independence of Kosovo in 2008. Territory changed, but Serbian ISO3-Code remained identical.
    - ETH/ Ethiopia: Independence of Eritrea in 1993. Territory changed, but Ethiopian ISO3-Code remained identical.
    - IDN/ Indonesia: Independence of East-Timor in 2002. Territory changed, but Indonesian ISO3-Code remained identical.


In [15]:
# Soft validation: adds a "valid_id" flag that is True only for country-months
# that match the VIEWS database.
climate_variables_df_B = pd.DataFrame.cm.soft_validate_iso(
    climate_variables_df,
    iso_col="Country",
    month_col="month_id",
)

# Keep only the country-months flagged as valid
climate_variables_df_B = climate_variables_df_B.loc[
    climate_variables_df_B["valid_id"].eq(True)
]

# Create the c_id column without a month column (month_col=None)
climate_variables_df_B = pd.DataFrame.cm.from_iso(
    df=climate_variables_df_B,
    iso_col="Country",
    month_col=None,
)


#### [Experimentation] Trying to soft-validate on c_id variable to filter out obsolete data in Approach B.
- Does not seem to work. Potential arguments in "iso_col" seem to be limited to ISO-Codes.
- Output: Every row is interpreted as invalid.

In [16]:
# Experiment: pass the "c_id" column as iso_col to test whether soft validation can
# flag the obsolete country-months produced by Approach B.
# NOTE(review): per the notes above this cell, every row came back as invalid, so iso_col
# presumably only accepts ISO codes — confirm against the ingester3 documentation.
climate_variables_df_B2 = pd.DataFrame.cm.soft_validate_iso(climate_variables_df_B, iso_col='c_id', month_col='month_id')

--------------------------

# Proof: Country-level data always follows current geographical borders.

In [17]:
# # Checking precipitation, as there is probably a bigger difference between the two countries.

# # Filter data for Sudan pre- and post-July 2011
# sudan_pre_july_2011 = climate_variables_df_A[climate_variables_df_A['c_id'] == 59]
# sudan_post_july_2011 = climate_variables_df_A[climate_variables_df_A['c_id'] == 245]
# south_sudan_post_july_2011 = climate_variables_df_A[climate_variables_df_A['c_id'] == 246]

# # Calculate summary statistics for the variable 'climate_pr'
# summary_pre = sudan_pre_july_2011['climate_pr'].describe()
# summary_post = sudan_post_july_2011['climate_pr'].describe()
# summary_post_ssd = south_sudan_post_july_2011['climate_pr'].describe()

# # Print the summaries
# print("Summary statistics for 'climate_pr' before July 2011 (c_id = 59):\n", summary_pre)
# print("\nSummary statistics for 'climate_pr' after July 2011 (c_id = 245):\n", summary_post)

# # Optional: Check if the summaries are identical
# identical = summary_pre.equals(summary_post)
# print("\nAre the summary statistics for Sudan identical?", identical)

# # Print the summary statistics for South Sudan
# print("\nSummary statistics for 'climate_pr' for SOUTH SUDAN after July 2011 (c_id = 246):\n", summary_post_ssd)


In [18]:
# # Clean up environment after tests
# del sudan_pre_july_2011
# del sudan_post_july_2011
# del south_sudan_post_july_2011

# del summary_pre
# del summary_post
# del summary_post_ssd

# del identical

#### Conclusion: Precipitation in the different territorial variations of Sudan
#### Ergo: We have country-level data only for current geographical borders
- The precipitation for Sudan is very similar in both periods.
- The precipitation is much higher in South Sudan in post-2011 compared to the post-2011 numbers from (North) Sudan.

- This suggests that the SDN data is always exclusively for Northern Sudan, and that the data for South Sudan is separate.

- Therefore, previous geographic configurations for countries, which have independent c_id's in VIEWS, are not covered by this data.
- This suggests: The data is always structured along the current geographic specifications of the world's countries today.

--------

# Brainstorm: Further problems with the data

- Question: Which countries were dropped from the data during the soft validation and filtering process?


In [19]:
# ISO-3 codes present before the soft validation and filtering
distinct_countries = climate_variables_df["Country"].unique()

# ISO-3 codes still present afterwards (Approach B)
distinct_countries_B = climate_variables_df_B["Country"].unique()

# Codes that appear in the first set but are missing from the second one
countries_lost = [
    iso_code
    for iso_code in distinct_countries
    if iso_code not in distinct_countries_B
]

# -----------------

# Function to get country name from ISO-3 code
def get_country_name(iso3):
    """Look up the pycountry name for an ISO-3 code.

    Falls back to the text "Invalid code: <iso3>" when the lookup or the
    access to the ``name`` attribute raises an AttributeError.
    """
    try:
        country = pycountry.countries.get(alpha_3=iso3)
        country_name = country.name
    except AttributeError:
        country_name = f"Invalid code: {iso3}"
    return country_name

# Convert the ISO-3 codes in countries_lost to "<code>: <country name>" lines
dropped_lines = [f"{code}: {get_country_name(code)}" for code in countries_lost]

# Print the results: the header followed by one line per dropped country
print("\n".join(["Countries dropped from the dataset:", *dropped_lines]))

# Delete obsolete objects. Building the lines in a comprehension leaves no loop
# variables behind, so the cleanup also works when no country was dropped
# (deleting never-bound loop variables would raise a NameError in that case).
del distinct_countries
del distinct_countries_B
del countries_lost
del dropped_lines


Countries dropped from the dataset:
ABW: Aruba
AIA: Anguilla
ALA: Åland Islands
ASM: American Samoa
ATF: French Southern Territories
BES: Bonaire, Sint Eustatius and Saba
BLM: Saint Barthélemy
BMU: Bermuda
BVT: Bouvet Island
CCK: Cocos (Keeling) Islands
COK: Cook Islands
CUW: Curaçao
CXR: Christmas Island
CYM: Cayman Islands
FRO: Faroe Islands
GGY: Guernsey
GIB: Gibraltar
GLP: Guadeloupe
GRL: Greenland
GUF: French Guiana
GUM: Guam
HKG: Hong Kong
HMD: Heard Island and McDonald Islands
IMN: Isle of Man
IOT: British Indian Ocean Territory
JEY: Jersey
KSV: Invalid code: KSV
MAC: Macao
MAF: Saint Martin (French part)
MNP: Northern Mariana Islands
MSR: Montserrat
MTQ: Martinique
MYT: Mayotte
NCL: New Caledonia
NFK: Norfolk Island
NIU: Niue
PCN: Pitcairn
PRI: Puerto Rico
PSE: Palestine, State of
PYF: French Polynesia
REU: Réunion
SHN: Saint Helena, Ascension and Tristan da Cunha
SJM: Svalbard and Jan Mayen
SPM: Saint Pierre and Miquelon
SXM: Sint Maarten (Dutch part)
TCA: Turks and Caicos Isl

#### Answer: Categories of countries dropped

- Autonomous regions and islands: 
    - They are coded as independent countries in the CCKP climate data
    - Problem: By omitting them, we lose all data on their climate and misrepresent the climate of the administering countries, as the data from the territories is not integrated into the administering countries' data.
    - At the same time, there is no elegant way to merge the data of the autonomous regions with their administering countries.
    - Examples
        - Åland is not coded as part of Finland. By omitting it, we lose all data on Åland, as the Åland data is not integrated into the Finland data. This also biases the data we have on Finland, as we lack the data on Åland.
        - ABW, Aruba: Netherlands
        - AIA, Anguilla: UK
        - ASM, American Samoa: USA
        - ATF, French Southern Territories: France
        - BVT, Bouvet Island: Norway
        - FRO, Faroe Islands: Denmark
        - GRL, Greenland: Denmark
        - MAC, Macao: China
        - HKG, Hong Kong: China
        - ...

- Disputed and unrecognized territories
    - KSV/XKX, Kosovo
    - PSE, Palestine

- Small states
    - VAT, Vatican