In [1]:
import pandas as pd

## Cost of living

In [2]:
# Load the 2024 cost-of-living / purchasing-power index CSV.
cost_of_living_csv = "data/Cost of living - purchasing power index 2024.csv"
cost_of_living_df = pd.read_csv(cost_of_living_csv)

In [3]:
# Remove the columns that are not needed and give the country column the
# name shared by the other datasets in this notebook.
cost_of_living_df = cost_of_living_df.drop(
    columns=["Rank", "Ø Monthly income (USD)"]
).rename(columns={"Country/Region": "Country Name"})

In [4]:
# Remove asterisk markers from the country names, then trim whitespace.
# The pattern is a literal "*", so regex=False is passed explicitly: the
# result then no longer depends on the pandas version's default for `regex`
# ("*" on its own is not a valid regular expression).
cost_of_living_df["Country Name"] = (
    cost_of_living_df["Country Name"]
    .str.replace("*", "", regex=False)
    .str.strip()
)

In [5]:
# Every row of this dataset refers to 2024; add the year as a join key.
cost_of_living_df = cost_of_living_df.assign(Year=2024)

### Normalization Step: Add ISO-alpha3 Code

In [6]:
# Load the country-code reference table; fall back to the parent directory
# when the relative "data/" folder is not found.
try:
    unsd_df = pd.read_csv("data/country_codes.csv", sep=",")
except FileNotFoundError:
    unsd_df = pd.read_csv("../data/country_codes.csv", sep=",")

# Create mappings
unsd_df["Country"] = unsd_df["Country"].str.strip()
name_to_iso3 = unsd_df.set_index("Country")["ISO-alpha3 Code"].to_dict()

# M49 -> ISO3 lookup. This dataset is matched by name, so the lookup is not
# used below; it is still built so the cell defines the same names as before.
m49_to_iso3 = {}
for raw_m49, iso3 in zip(unsd_df["M49 Code"], unsd_df["ISO-alpha3 Code"]):
    try:
        m49_to_iso3[int(raw_m49)] = iso3
    except (ValueError, TypeError):
        # Missing or non-numeric M49 code: skip the row.
        continue

# Apply mapping.
# Without the source column the check below would fail with a bare KeyError,
# so fail here with a message that names the column actually required.
if "Country Name" not in cost_of_living_df.columns:
    raise KeyError("Source column 'Country Name' not found in cost_of_living_df.")

cost_of_living_df["Country Code"] = (
    cost_of_living_df["Country Name"].str.strip().map(name_to_iso3)
)

# Check results: list the names that could not be matched to an ISO code.
missing_iso = cost_of_living_df["Country Code"].isna().sum()
if missing_iso > 0:
    print(f"  {missing_iso} rows have missing ISO-alpha3 codes")
    print(cost_of_living_df[cost_of_living_df["Country Code"].isna()][["Country Name"]])

  2 rows have missing ISO-alpha3 codes
       Country Name
10   Virgin Islands
115          Kosovo


In [7]:
# Show the cleaned cost-of-living frame for a visual check.
cost_of_living_df

Unnamed: 0,Country Name,Cost index,Purchasing power index,Year,Country Code
0,Bermuda,144.1,116.3,2024,BMU
1,Cayman Islands,140.1,52.7,2024,CYM
2,Turks and Caicos Islands,131.7,31.5,2024,TCA
3,Switzerland,125.8,91.2,2024,CHE
4,Barbados,124.4,22.7,2024,BRB
...,...,...,...,...,...
170,Myanmar,23.9,6.1,2024,MMR
171,India,23.8,13.3,2024,IND
172,Pakistan,23.3,7.3,2024,PAK
173,Palestine,21.4,17.2,2024,PSE


## Food Inflation

In [8]:
# Load the raw food price inflation CSV.
food_inflation_csv = "data/Food price inflation - FAO_CP_23014.csv"
food_inflation_df = pd.read_csv(food_inflation_csv)

In [9]:
# Keep observations from 2015 onwards. The first four characters of
# TIME_PERIOD are parsed as the calendar year.
first_year = 2015
period_year = food_inflation_df["TIME_PERIOD"].str[:4].astype(int)
food_inflation_df = food_inflation_df.loc[period_year >= first_year].reset_index(
    drop=True
)

In [10]:
# Exclude the rows whose area label is one of these aggregates.
aggregate_labels = ["South Asia", "World", "North America"]
is_aggregate = food_inflation_df["REF_AREA_LABEL"].isin(aggregate_labels)
food_inflation_df = food_inflation_df[~is_aggregate]

In [11]:
# Columns with at most one distinct value carry no information: drop them,
# then give the observation column a descriptive name.
distinct_counts = food_inflation_df.nunique()
constant_columns = distinct_counts.loc[distinct_counts.le(1)].index
food_inflation_df = food_inflation_df.drop(columns=constant_columns).rename(
    columns={"OBS_VALUE": "Food price inflation index"}
)

In [12]:
# Align the key column names with the other datasets.
food_inflation_df = food_inflation_df.rename(
    columns={
        "REF_AREA": "Country Code",
        "REF_AREA_LABEL": "Country Name",
        "TIME_PERIOD": "Year",
    }
)

In [13]:
# Average the observations to one value per country code and calendar year.
# The year is taken from the first four characters of the period label.
food_inflation_yearly = (
    food_inflation_df.assign(
        Year_Int=food_inflation_df["Year"].astype(str).str[:4].astype(int)
    )
    .groupby(["Country Code", "Year_Int"])["Food price inflation index"]
    .mean()
    .reset_index()
    .rename(
        columns={
            "Year_Int": "Year",
            "Food price inflation index": "Food Price Inflation (Mean)",
        }
    )
)

# From here on the yearly aggregate replaces the raw frame.
food_inflation_df = food_inflation_yearly
food_inflation_df

Unnamed: 0,Country Code,Year,Food Price Inflation (Mean)
0,ABW,2015,2.452048
1,ABW,2016,-0.544981
2,ABW,2017,-3.029560
3,ABW,2018,7.129486
4,ABW,2019,10.606806
...,...,...,...
2188,ZWE,2021,156.989101
2189,ZWE,2022,144.820835
2190,ZWE,2023,115.154714
2191,ZWE,2024,98.607873


## GDP per Capita


In [14]:
# Load the raw GDP per capita CSV.
gdp_csv = "data/GDP per Capita in USD - GDP per capita.csv"
gdp_df = pd.read_csv(gdp_csv)

In [15]:
# Drop every column that holds at most one distinct value.
distinct_counts = gdp_df.nunique()
constant_columns = distinct_counts.loc[distinct_counts.le(1)].index
gdp_df = gdp_df.drop(columns=constant_columns)

In [16]:
# Keep observations from 2015 onwards.
first_year = 2015
is_recent = gdp_df["TIME_PERIOD"] >= first_year
gdp_df = gdp_df.loc[is_recent].reset_index(drop=True)

In [17]:
# Align the column names with the other datasets.
gdp_df = gdp_df.rename(
    columns={
        "REF_AREA": "Country Code",
        "REF_AREA_LABEL": "Country Name",
        "TIME_PERIOD": "Year",
        "OBS_VALUE": "GDP per capita (current US$)",
    }
)
gdp_df

Unnamed: 0,Country Code,Country Name,Year,GDP per capita (current US$)
0,AFE,Africa Eastern and Southern,2015,1479.615260
1,AFW,Africa Western and Central,2015,1860.727694
2,ARB,Arab World,2015,6262.041685
3,CSS,Caribbean small states,2015,14402.472580
4,CEB,Central Electricity Board (CEB),2015,12596.947510
...,...,...,...,...
2535,SOM,Somalia,2024,636.983478
2536,MDG,Madagascar,2024,544.997303
2537,CAF,Central African Republic,2024,516.170424
2538,MWI,Malawi,2024,508.371273


## Chicken Import/Export

In [18]:
# Load the raw chicken/turkey import-export quantity CSV.
import_export_csv = "data/Chicken-Turkey Import-Export quant.csv"
import_export_df = pd.read_csv(import_export_csv)

In [19]:
# Drop every column that holds at most one distinct value.
distinct_counts = import_export_df.nunique()
constant_columns = distinct_counts.loc[distinct_counts.le(1)].index
import_export_df = import_export_df.drop(columns=constant_columns)

In [20]:
# Code and flag columns are not needed for the analysis.
unused_columns = [
    "Element Code",
    "Item Code (CPC)",
    "Year Code",
    "Flag",
    "Flag Description",
]
import_export_df = import_export_df.drop(columns=unused_columns)

In [21]:
# Keep only the "Chickens" item; the Item column is then constant and is dropped.
is_chicken = import_export_df["Item"] == "Chickens"
import_export_df = (
    import_export_df.loc[is_chicken].reset_index(drop=True).drop(columns=["Item"])
)

In [22]:
# Use the country column name shared by the other datasets.
import_export_df = import_export_df.rename(columns={"Area": "Country Name"})
import_export_df

Unnamed: 0,Area Code (M49),Country Name,Element,Year,Value
0,4,Afghanistan,Import quantity,2015,2111
1,4,Afghanistan,Export quantity,2015,8
2,4,Afghanistan,Import quantity,2016,15005
3,4,Afghanistan,Import quantity,2017,18970
4,4,Afghanistan,Import quantity,2018,12297
...,...,...,...,...,...
2536,716,Zimbabwe,Export quantity,2021,269
2537,716,Zimbabwe,Import quantity,2022,363
2538,716,Zimbabwe,Export quantity,2022,308
2539,716,Zimbabwe,Import quantity,2023,348


### Take 2023 data and apply it for 2024

In [23]:
# Last Observation Carried Forward: reuse the 2023 rows as 2024 values.
# The copy only happens while no 2024 rows exist, which keeps the cell
# idempotent: re-running it would otherwise append the 2024 rows again.
if not (import_export_df["Year"] == 2024).any():
    rows_to_copy = import_export_df[import_export_df["Year"] == 2023].copy()
    rows_to_copy["Year"] = 2024
    import_export_df = pd.concat([import_export_df, rows_to_copy], ignore_index=True)

### Normalization Step: Add ISO-alpha3 Code

In [24]:
# 1. Load UNSD Methodology for mapping (fall back to the parent directory
#    when the relative "data/" folder is not found).
try:
    unsd_df = pd.read_csv("data/country_codes.csv", sep=",")
except FileNotFoundError:
    unsd_df = pd.read_csv("../data/country_codes.csv", sep=",")

# 2. Create mappings
unsd_df["Country"] = unsd_df["Country"].astype(str).str.strip()
name_to_iso3 = unsd_df.set_index("Country")["ISO-alpha3 Code"].to_dict()

m49_to_iso3 = {}
for raw_m49, iso3 in zip(unsd_df["M49 Code"], unsd_df["ISO-alpha3 Code"]):
    try:
        m49_to_iso3[int(raw_m49)] = iso3
    except (ValueError, TypeError):
        # Missing or non-numeric M49 code: skip the row.
        continue


def get_iso(val):
    """Return the ISO-alpha3 code for an M49 value, or None if it cannot be resolved."""
    try:
        return m49_to_iso3.get(int(val), None)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are treated as "no code"; anything else
        # is a real error and should surface.
        return None


# 3. Apply mapping
print("Applying ISO-alpha3 normalization...")
# Without the source column the check below would fail with a bare KeyError,
# so fail here with an explicit message instead of printing a warning first.
if "Area Code (M49)" not in import_export_df.columns:
    raise KeyError("Source column 'Area Code (M49)' not found in import_export_df.")

import_export_df["Country Code"] = import_export_df["Area Code (M49)"].apply(get_iso)
print("  Mapped M49 codes to 'Country Code'.")

# Check results: show a sample of the M49 codes that could not be mapped.
missing_iso = import_export_df["Country Code"].isna().sum()
if missing_iso > 0:
    print(f"  Warning: {missing_iso} rows have missing ISO-alpha3 codes.")
    print(
        import_export_df[import_export_df["Country Code"].isna()][
            ["Area Code (M49)"]
        ].head()
    )

Applying ISO-alpha3 normalization...
  Mapped M49 codes to 'Country Code'.
     Area Code (M49)
417              159
418              159
419              159
420              159
421              159


In [25]:
# Reshape from one row per (country, year, element) to one row per
# (country, year) with a column per element. "first" keeps the first value
# found for each cell.
import_export_df = (
    import_export_df.pivot_table(
        index=["Country Code", "Year"],
        columns="Element",
        values="Value",
        aggfunc="first",
    )
    .rename_axis(columns=None)  # remove the leftover "Element" axis label
    .reset_index()
    .rename(columns={"Import quantity": "Import", "Export quantity": "Export"})
)

import_export_df

Unnamed: 0,Country Code,Year,Export,Import
0,AFG,2015,8.0,2111.0
1,AFG,2016,,15005.0
2,AFG,2017,,18970.0
3,AFG,2018,,12297.0
4,AFG,2019,1.0,9067.0
...,...,...,...,...
1840,ZWE,2020,248.0,280.0
1841,ZWE,2021,269.0,313.0
1842,ZWE,2022,308.0,363.0
1843,ZWE,2023,309.0,348.0


## Logistics Performance Index

In [26]:
# Load the Logistics Performance Index CSV.
lpi_csv = "data/LPI_2014_to_2023.csv"
lpi_df = pd.read_csv(lpi_csv)

In [27]:
# Use the country column name shared by the other datasets.
lpi_df = lpi_df.rename(columns={"Economy": "Country Name"})

### Take 2023 data and apply it for 2024

In [28]:
# Last Observation Carried Forward
# There is no 2024 data, so the 2023 values are reused. This seems acceptable
# for this indicator because logistics infrastructure does not change quickly.
# The copy only happens while no 2024 rows exist, which keeps the cell
# idempotent: re-running it would otherwise append duplicate 2024 rows, and
# duplicate (country, year) keys would multiply rows in the later merge.
if not (lpi_df["Year"] == 2024).any():
    rows_to_copy = lpi_df[lpi_df["Year"] == 2023].copy()
    rows_to_copy["Year"] = 2024
    lpi_df = pd.concat([lpi_df, rows_to_copy], ignore_index=True)

### Normalization Step: Add ISO-alpha3 Code

In [29]:
# Load the country-code reference table; fall back to the parent directory
# when the relative "data/" folder is not found.
try:
    unsd_df = pd.read_csv("data/country_codes.csv", sep=",")
except FileNotFoundError:
    unsd_df = pd.read_csv("../data/country_codes.csv", sep=",")

# Create mappings
unsd_df["Country"] = unsd_df["Country"].str.strip()
name_to_iso3 = unsd_df.set_index("Country")["ISO-alpha3 Code"].to_dict()

# M49 -> ISO3 lookup. This dataset is matched by name, so the lookup is not
# used below; it is still built so the cell defines the same names as before.
m49_to_iso3 = {}
for raw_m49, iso3 in zip(unsd_df["M49 Code"], unsd_df["ISO-alpha3 Code"]):
    try:
        m49_to_iso3[int(raw_m49)] = iso3
    except (ValueError, TypeError):
        # Missing or non-numeric M49 code: skip the row.
        continue

# Apply mapping.
# Without the source column the check below would fail with a bare KeyError,
# so fail here with a message that names the column actually required.
if "Country Name" not in lpi_df.columns:
    raise KeyError("Source column 'Country Name' not found in lpi_df.")

lpi_df["Country Code"] = lpi_df["Country Name"].str.strip().map(name_to_iso3)

# Check results: list the names that could not be matched to an ISO code.
missing_iso = lpi_df["Country Code"].isna().sum()
if missing_iso > 0:
    print(f"  {missing_iso} rows have missing ISO-alpha3 codes")
    print(lpi_df[lpi_df["Country Code"].isna()][["Country Name"]])

In [30]:
# Show the LPI frame with the carried-forward 2024 rows and ISO codes.
lpi_df

Unnamed: 0,Country Name,LPI Score,Year,Country Code
0,Singapore,4.3,2023,SGP
1,Finland,4.2,2023,FIN
2,Denmark,4.1,2023,DNK
3,Germany,4.1,2023,DEU
4,Netherlands,4.1,2023,NLD
...,...,...,...,...
753,Cameroon,2.1,2024,CMR
754,Haiti,2.1,2024,HTI
755,Somalia,2.0,2024,SOM
756,Afghanistan,1.9,2024,AFG


## Share of Organic Agricultural land

In [31]:
# Load the raw organic agricultural land CSV.
land_csv = "data/Share of Organic Agricultural land.csv"
land_df = pd.read_csv(land_csv)

In [32]:
# Code and flag columns are not needed for the analysis.
unused_columns = ["Element Code", "Item Code", "Year Code", "Flag", "Flag Description"]
land_df = land_df.drop(columns=unused_columns)

In [33]:
# Keep only the rows of the "Share in Agricultural land" element.
is_share_row = land_df["Element"] == "Share in Agricultural land"
land_df = land_df.loc[is_share_row].reset_index(drop=True)

In [34]:
# Give the value column a descriptive name and use the shared country column name.
land_df = land_df.rename(
    columns={
        "Value": "Share of Organic Agricultural land (%)",
        "Area": "Country Name",
    }
)

In [35]:
# Drop every column that holds at most one distinct value, then order the
# rows by country name.
distinct_counts = land_df.nunique()
constant_columns = distinct_counts.loc[distinct_counts.le(1)].index
land_df = land_df.drop(columns=constant_columns).sort_values(by="Country Name")

### Take 2023 data and apply it for 2024

In [36]:
# Last Observation Carried Forward: reuse the 2023 rows as 2024 values.
# The copy only happens while no 2024 rows exist, which keeps the cell
# idempotent: re-running it would otherwise append duplicate 2024 rows, and
# duplicate (country, year) keys would multiply rows in the later merge.
if not (land_df["Year"] == 2024).any():
    rows_to_copy = land_df[land_df["Year"] == 2023].copy()
    rows_to_copy["Year"] = 2024
    land_df = pd.concat([land_df, rows_to_copy], ignore_index=True)

### Normalization Step: Add ISO-alpha3 Code

In [37]:
# 1. Load UNSD Methodology for mapping (fall back to the parent directory
#    when the relative "data/" folder is not found).
try:
    unsd_df = pd.read_csv("data/country_codes.csv", sep=",")
except FileNotFoundError:
    unsd_df = pd.read_csv("../data/country_codes.csv", sep=",")

# 2. Create mappings
unsd_df["Country"] = unsd_df["Country"].astype(str).str.strip()
name_to_iso3 = unsd_df.set_index("Country")["ISO-alpha3 Code"].to_dict()

m49_to_iso3 = {}
for raw_m49, iso3 in zip(unsd_df["M49 Code"], unsd_df["ISO-alpha3 Code"]):
    try:
        m49_to_iso3[int(raw_m49)] = iso3
    except (ValueError, TypeError):
        # Missing or non-numeric M49 code: skip the row.
        continue


def get_iso(val):
    """Return the ISO-alpha3 code for an M49 value, or None if it cannot be resolved."""
    try:
        return m49_to_iso3.get(int(val), None)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are treated as "no code"; anything else
        # is a real error and should surface.
        return None


# 3. Apply mapping
print("Applying ISO-alpha3 normalization...")
# Without the source column the check below would fail with a bare KeyError,
# so fail here with an explicit message instead of printing a warning first.
if "Area Code (M49)" not in land_df.columns:
    raise KeyError("Source column 'Area Code (M49)' not found in land_df.")

land_df["Country Code"] = land_df["Area Code (M49)"].apply(get_iso)
print("  Mapped M49 codes to 'Country Code'.")

# Check results: show a sample of the M49 codes that could not be mapped.
missing_iso = land_df["Country Code"].isna().sum()
if missing_iso > 0:
    print(f"  Warning: {missing_iso} rows have missing ISO-alpha3 codes.")
    print(land_df[land_df["Country Code"].isna()][["Area Code (M49)"]].head())

Applying ISO-alpha3 normalization...
  Mapped M49 codes to 'Country Code'.
     Area Code (M49)
60                64
61                64
62                64
63                64
105              830


In [38]:
# The M49 code is no longer needed once the ISO code has been added.
land_df = land_df.drop(columns="Area Code (M49)")
land_df

Unnamed: 0,Country Name,Year,Share of Organic Agricultural land (%),Country Code
0,Afghanistan,2020,0.00,AFG
1,Afghanistan,2023,0.00,AFG
2,Afghanistan,2022,0.00,AFG
3,Afghanistan,2021,0.00,AFG
4,Albania,2022,0.06,ALB
...,...,...,...,...
839,Vanuatu,2024,3.44,VUT
840,Venezuela (Bolivarian Republic of),2024,0.01,VEN
841,Viet Nam,2024,1.42,VNM
842,Zambia,2024,0.01,ZMB


## Politics

In [39]:
# Load the raw political indicators CSV.
politics_csv = "data/Political 2015-2023.csv"
politics_df = pd.read_csv(politics_csv)

In [40]:
# Keep the first occurrence of each fully identical row.
is_repeated_row = politics_df.duplicated()
politics_df = politics_df[~is_repeated_row]

In [41]:
# Drop the series code column and cut off the last three rows.
# NOTE(review): presumably those rows are non-data footer lines left after
# de-duplication; confirm against the raw CSV.
politics_df = politics_df.drop(columns=["Series Code"]).iloc[:-3]

In [42]:
# Turn the per-year columns into rows: one row per country, series and year.
id_vars = ["Country Name", "Country Code", "Series Name"]
# Column labels look like "2015 [YR2015]" and run from 2015 to 2023 inclusive.
year_cols = [f"{year} [YR{year}]" for year in range(2015, 2024)]

politics_df = pd.melt(
    politics_df,
    id_vars=id_vars,
    value_vars=year_cols,
    var_name="Year",
    value_name="Value",
)

In [43]:
# Reduce the year label to its four-digit prefix and parse the values as
# numbers; anything that cannot be parsed becomes NaN.
politics_df = politics_df.assign(
    Year=politics_df["Year"].str[:4],
    Value=pd.to_numeric(politics_df["Value"], errors="coerce"),
)

In [44]:
# One column per series, one row per country and year. "mean" is the
# pivot_table default aggregation, spelled out here.
politics_df = pd.pivot_table(
    politics_df,
    index=["Country Name", "Country Code", "Year"],
    columns="Series Name",
    values="Value",
    aggfunc="mean",
)

In [45]:
# Turn the index levels back into columns and remove the leftover
# "Series Name" label from the column axis.
politics_df = politics_df.reset_index().rename_axis(columns=None)

### Take 2023 data and apply it for 2024

In [46]:
# Last Observation Carried Forward
# The source only covers 2015-2023, so the 2023 values are reused for 2024
# as an approximation.

# Year still holds strings at this point. Cast it first so the column does
# not end up mixing "2023"-style strings with the integer 2024.
politics_df["Year"] = politics_df["Year"].astype(int)

# The copy only happens while no 2024 rows exist, which keeps the cell
# idempotent: re-running it would otherwise append duplicate 2024 rows.
if not (politics_df["Year"] == 2024).any():
    rows_to_copy = politics_df[politics_df["Year"] == 2023].copy()
    rows_to_copy["Year"] = 2024
    politics_df = pd.concat([politics_df, rows_to_copy], ignore_index=True)
politics_df

Unnamed: 0,Country Name,Country Code,Year,Control of Corruption: Percentile Rank,Government Effectiveness: Percentile Rank,Political Stability and Absence of Violence/Terrorism: Percentile Rank,Regulatory Quality: Percentile Rank,Rule of Law: Percentile Rank
0,Afghanistan,AFG,2015,5.714286,6.666667,0.952381,13.809524,4.285714
1,Afghanistan,AFG,2016,3.809524,7.619048,0.952381,7.142857,5.714286
2,Afghanistan,AFG,2017,3.809524,7.142857,0.476190,7.142857,5.238095
3,Afghanistan,AFG,2018,4.761905,7.142857,0.471698,10.476191,4.285714
4,Afghanistan,AFG,2019,5.714286,6.666667,0.943396,10.476191,4.285714
...,...,...,...,...,...,...,...,...
2119,Virgin Islands (U.S.),VIR,2024,53.301888,73.113205,63.033176,89.622643,81.603775
2120,West Bank and Gaza,PSE,2024,26.415094,10.377358,7.109005,41.981133,25.943396
2121,"Yemen, Rep.",YEM,2024,1.886792,0.471698,0.947867,3.301887,1.886792
2122,Zambia,ZMB,2024,36.792454,27.830189,52.132702,33.490566,31.603773


## Unemployment

In [47]:
# Load the raw unemployment CSV.
unemployment_csv = "data/Unemployment 2015-2024.csv"
unemployment_df = pd.read_csv(unemployment_csv)

In [48]:
# Drop every column that holds at most one distinct value.
distinct_counts = unemployment_df.nunique()
constant_columns = distinct_counts.loc[distinct_counts.le(1)].index
unemployment_df = unemployment_df.drop(columns=constant_columns)

In [49]:
# Turn the per-year columns into rows. Every column other than the two
# identifier columns is melted, which is what melt does when value_vars is
# left out.
id_vars = ["Country Name", "Country Code"]

unemployment_df_melted = unemployment_df.melt(
    id_vars=id_vars,
    var_name="Year",
    value_name="Unemployment (% of total labor force)",
).astype({"Year": int})

unemployment_df = unemployment_df_melted
unemployment_df

Unnamed: 0,Country Name,Country Code,Year,Unemployment (% of total labor force)
0,Aruba,ABW,2015,
1,Africa Eastern and Southern,AFE,2015,7.036357
2,Afghanistan,AFG,2015,9.052000
3,Africa Western and Central,AFW,2015,4.164467
4,Angola,AGO,2015,16.490000
...,...,...,...,...
2655,Kosovo,XKX,2024,
2656,"Yemen, Rep.",YEM,2024,17.086000
2657,South Africa,ZAF,2024,33.168000
2658,Zambia,ZMB,2024,5.961000


## Share of urban population

In [50]:
# Load the raw urban population share CSV.
urban_pop_csv = "data/Urban population (% of total population).csv"
urban_pop_df = pd.read_csv(urban_pop_csv)

In [51]:
# Drop every column that holds at most one distinct value.
distinct_counts = urban_pop_df.nunique()
constant_columns = distinct_counts.loc[distinct_counts.le(1)].index
urban_pop_df = urban_pop_df.drop(columns=constant_columns)

In [52]:
# Turn the per-year columns into rows. Every column other than the two
# identifier columns is melted, which is what melt does when value_vars is
# left out.
id_vars = ["Country Name", "Country Code"]

urban_pop_df = urban_pop_df.melt(
    id_vars=id_vars,
    var_name="Year",
    value_name="Urban population (% of total population)",
).astype({"Year": int})

urban_pop_df

Unnamed: 0,Country Name,Country Code,Year,Urban population (% of total population)
0,Aruba,ABW,2020,43.697000
1,Africa Eastern and Southern,AFE,2020,36.884034
2,Afghanistan,AFG,2020,26.026000
3,Africa Western and Central,AFW,2020,47.931021
4,Angola,AGO,2020,66.825000
...,...,...,...,...
1325,Kosovo,XKX,2024,
1326,"Yemen, Rep.",YEM,2024,40.477000
1327,South Africa,ZAF,2024,69.298000
1328,Zambia,ZMB,2024,46.914000


## Merge dataframes

In [55]:
# Frames to combine. Each one contributes its value columns, joined on
# (Country Code, Year).
dfs_to_merge = [
    gdp_df[["Country Code", "Year", "GDP per capita (current US$)"]],
    food_inflation_df,
    import_export_df,
    cost_of_living_df[["Country Code", "Year", "Cost index", "Purchasing power index"]],
    lpi_df[["Country Code", "Year", "LPI Score"]],
    politics_df,
    land_df[["Country Code", "Year", "Share of Organic Agricultural land (%)"]],
    unemployment_df[["Country Code", "Year", "Unemployment (% of total labor force)"]],
    urban_pop_df[["Country Code", "Year", "Urban population (% of total population)"]],
]

merge_keys = ["Country Code", "Year"]
# Descriptive name columns are left out of the merge; a single
# "Country Name" column is added once at the end.
name_columns = ["Country Name", "Country", "Country/Region"]


def _prepare_for_merge(frame):
    """Return a copy of ``frame`` that is ready to be joined on the merge keys.

    Rows without a country code are dropped, ``Year`` is cast to int and the
    descriptive name columns are removed.
    """
    prepared = frame.dropna(subset=["Country Code"]).copy()
    # Plain column assignment replaces the column, so the dtype really becomes
    # int. ``prepared.loc[:, "Year"] = ...`` writes into the existing column
    # and can leave an object-dtype column as it is.
    prepared["Year"] = prepared["Year"].astype(int)
    return prepared.drop(columns=[c for c in prepared.columns if c in name_columns])


# Outer joins keep every (country, year) pair seen in any of the datasets.
final_df = _prepare_for_merge(dfs_to_merge[0])
for df_next in dfs_to_merge[1:]:
    final_df = pd.merge(
        final_df, _prepare_for_merge(df_next), on=merge_keys, how="outer"
    )

final_df = final_df.sort_values(by=["Country Code", "Year"])

# Add Country Name.
# unsd_df is the country-code table loaded by the normalization cells above;
# codes that are not in that table get a missing name.
iso3_to_name = unsd_df.set_index("ISO-alpha3 Code")["Country"].to_dict()
final_df["Country Name"] = final_df["Country Code"].map(iso3_to_name)

# Reorder columns: identifiers first, indicators after.
leading_cols = ["Country Name", "Country Code", "Year"]
final_df = final_df[leading_cols + [c for c in final_df.columns if c not in leading_cols]]

# Add Region Information (best effort: skipped when the file is missing).
# Only the file read is guarded, so other errors are not hidden.
try:
    df_region = pd.read_csv("data/region_codes.csv")
except FileNotFoundError:
    df_region = None
    print("Warning: data/region_codes.csv not found. Skipping region merge.")

if df_region is not None:
    # One row per country code with the two region levels.
    df_region_clean = (
        df_region[["alpha-3", "region", "sub-region"]]
        .rename(
            columns={
                "alpha-3": "Country Code",
                "region": "Region",
                "sub-region": "Sub-region",
            }
        )
        .drop_duplicates(subset=["Country Code"])
    )

    final_df = pd.merge(final_df, df_region_clean, on="Country Code", how="left")

    # Place Region and Sub-region directly after Country Code.
    cols = [c for c in final_df.columns if c not in ("Region", "Sub-region")]
    insert_at = cols.index("Country Code") + 1
    cols[insert_at:insert_at] = ["Region", "Sub-region"]
    final_df = final_df[cols]

final_df.to_csv("merged_data_cleaned.csv", index=False)

In [54]:
# Show the 2024 rows of the merged frame.
final_df[final_df["Year"] == 2024]

Unnamed: 0,Country Name,Country Code,Region,Sub-region,Year,GDP per capita (current US$),Food Price Inflation (Mean),Export,Import,Cost index,Purchasing power index,LPI Score,Control of Corruption: Percentile Rank,Government Effectiveness: Percentile Rank,Political Stability and Absence of Violence/Terrorism: Percentile Rank,Regulatory Quality: Percentile Rank,Rule of Law: Percentile Rank,Share of Organic Agricultural land (%),Unemployment (% of total labor force),Urban population (% of total population)
9,Aruba,ABW,Americas,Latin America and the Caribbean,2024,,2.564054,,,84.5,42.6,,75.471695,78.301888,97.630333,80.188683,87.264153,,,44.474000
20,,AFE,,,2024,1567.635839,,,,,,,,,,,,,7.772654,38.949114
31,Afghanistan,AFG,Asia,Southern Asia,2024,,-10.612025,,9272.0,,,1.9,13.679245,1.415094,1.421801,9.433962,5.188679,0.00,13.295000,27.265000
42,,AFW,,,2024,1284.154441,,,,,,,,,,,,,3.218313,50.280934
53,Angola,AGO,Africa,Sub-Saharan Africa,2024,2122.083690,31.975465,,1266.0,35.7,7.4,2.1,29.245283,14.622642,32.227489,22.641510,14.622642,,14.464000,69.281000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3079,,XKX,,,2024,7299.434981,,,,,,,48.584908,47.641510,34.597157,41.037735,41.509434,,,
3090,Yemen,YEM,Asia,Western Asia,2024,,,,1409.0,,,2.2,1.886792,0.471698,0.947867,3.301887,1.886792,,17.086000,40.477000
3101,South Africa,ZAF,Africa,Sub-Saharan Africa,2024,6253.371582,4.713374,4797.0,457.0,42.0,17.4,3.7,45.754719,40.566036,20.853081,44.339622,54.245281,0.04,33.168000,69.298000
3113,Zambia,ZMB,Africa,Sub-Saharan Africa,2024,1235.084665,16.671439,2277.0,335.0,28.9,5.2,,36.792454,27.830189,52.132702,33.490566,31.603773,0.01,5.961000,46.914000
