In [1]:
import polars as pl


In [2]:
# Load both source tables. infer_schema_length=0 disables dtype inference,
# so every column is read as a string and values are kept exactly as written
# in the CSV files.
regions_df = pl.read_csv(
    "../data/continents2.csv",
    infer_schema_length=0,
)
main_df = pl.read_csv(
    "../data/WorldSustainabilityDataset.csv",
    infer_schema_length=0,
)

In [3]:
# Display the column names of the regions lookup table.
regions_df.columns

['name',
 'alpha-2',
 'alpha-3',
 'country-code',
 'iso_3166-2',
 'region',
 'sub-region',
 'intermediate-region',
 'region-code',
 'sub-region-code',
 'intermediate-region-code']

In [4]:
# Display the column names of the sustainability dataset.
main_df.columns

['Country Name',
 'Country Code',
 'Year',
 'Access to electricity (% of population) - EG.ELC.ACCS.ZS',
 'Adjusted net national income per capita (annual % growth) - NY.ADJ.NNTY.PC.KD.ZG',
 'Adjusted net savings, excluding particulate emission damage (% of GNI) - NY.ADJ.SVNX.GN.ZS',
 'Adjusted savings: carbon dioxide damage (% of GNI) - NY.ADJ.DCO2.GN.ZS',
 'Adjusted savings: natural resources depletion (% of GNI) - NY.ADJ.DRES.GN.ZS',
 'Adjusted savings: net forest depletion (% of GNI) - NY.ADJ.DFOR.GN.ZS',
 'Adjusted savings: particulate emission damage (% of GNI) - NY.ADJ.DPEM.GN.ZS',
 'Automated teller machines (ATMs) (per 100,000 adults) - FB.ATM.TOTL.P5',
 'Broad money (% of GDP) - FM.LBL.BMNY.GD.ZS',
 'Children out of school (% of primary school age) - SE.PRM.UNER.ZS',
 'Compulsory education, duration (years) - SE.COM.DURS',
 'Cost of business start-up procedures, female (% of GNI per capita) - IC.REG.COST.PC.FE.ZS',
 'Cost of business start-up procedures, male (% of GNI per cap

In [6]:
# First attempt: attach region information by matching the country names
# of the two tables. A left join keeps every row of main_df; rows without a
# matching name get nulls in the region columns.
merged = main_df.join(regions_df, left_on="Country Name", right_on="name", how="left")

In [7]:
# Rows whose region columns stayed null found no partner in regions_df.
unmatched_rows = merged.filter(pl.col("sub-region").is_null())

# Collapse them to the distinct country names that failed to match.
not_matched = unmatched_rows["Country Name"].unique().to_list()

print(len(not_matched))
not_matched

23


['Kyrgyz Republic',
 'Egypt, Arab Rep.',
 'Macao SAR, China',
 'Syrian Arab Republic',
 'Russian Federation',
 'Bosnia and Herzegovina',
 'Hong Kong SAR, China',
 'Slovak Republic',
 "Cote d'Ivoire",
 'Guinea-Bissau',
 'St. Vincent and the Grenadines',
 'Bahamas, The',
 'Lao PDR',
 'Iran, Islamic Rep.',
 'West Bank and Gaza',
 'North Macedonia',
 'Korea, Rep.',
 'Congo, Dem. Rep.',
 "Korea, Dem. People's Rep.",
 'St. Lucia',
 'Venezuela, RB',
 'Gambia, The',
 'Congo, Rep.']

### 23 countries could not be matched by name; let's investigate

In [8]:
# Look up how the regions table spells one of the unmatched countries.
regions_df.filter(pl.col("name").str.contains("Kyrgyz"))

name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
str,str,str,str,str,str,str,str,str,str,str
"""Kyrgyzstan""","""KG""","""KGZ""","""417""","""ISO 3166-2:KG""","""Asia""","""Central Asia""",,"""142""","""143""",


In [9]:
# Trial join on the three-letter country code instead of the country name.
joined_on_code = main_df.join(
    regions_df, left_on="Country Code", right_on="alpha-3", how="left"
)

# List the countries that still have no region after this join.
joined_on_code.filter(pl.col("sub-region").is_null())["Country Name"].unique().to_list()

[]

### Every row matches on the ISO alpha-3 code, so join on `Country Code` = `alpha-3` instead

In [10]:
# Final join: attach the region columns to every row of main_df by matching
# its "Country Code" against the "alpha-3" column of regions_df.
merged = main_df.join(
    regions_df,
    left_on="Country Code",
    right_on="alpha-3",
    how="left",
)

# This result is written to disk later, so guard against a silent fan-out:
# a left join only keeps one output row per input row when each code occurs
# at most once in regions_df.
assert len(merged) == len(main_df), (
    "join on alpha-3 changed the row count; check regions_df for duplicate codes"
)

In [11]:
# Preview five countries from the joined table (one row per country).
# maintain_order=True makes the de-duplicated frame deterministic and the
# fixed seed pins the sample, so a fresh "Restart & Run All" shows the same rows.
merged.unique(subset="Country Name", maintain_order=True).sample(5, seed=42)

Country Name,Country Code,Year,Access to electricity (% of population) - EG.ELC.ACCS.ZS,Adjusted net national income per capita (annual % growth) - NY.ADJ.NNTY.PC.KD.ZG,"Adjusted net savings, excluding particulate emission damage (% of GNI) - NY.ADJ.SVNX.GN.ZS",Adjusted savings: carbon dioxide damage (% of GNI) - NY.ADJ.DCO2.GN.ZS,Adjusted savings: natural resources depletion (% of GNI) - NY.ADJ.DRES.GN.ZS,Adjusted savings: net forest depletion (% of GNI) - NY.ADJ.DFOR.GN.ZS,Adjusted savings: particulate emission damage (% of GNI) - NY.ADJ.DPEM.GN.ZS,"Automated teller machines (ATMs) (per 100,000 adults) - FB.ATM.TOTL.P5",Broad money (% of GDP) - FM.LBL.BMNY.GD.ZS,Children out of school (% of primary school age) - SE.PRM.UNER.ZS,"Compulsory education, duration (years) - SE.COM.DURS","Cost of business start-up procedures, female (% of GNI per capita) - IC.REG.COST.PC.FE.ZS","Cost of business start-up procedures, male (% of GNI per capita) - IC.REG.COST.PC.MA.ZS",Exports of goods and services (% of GDP) - NE.EXP.GNFS.ZS,Final consumption expenditure (% of GDP) - NE.CON.TOTL.ZS,GDP (current US$) - NY.GDP.MKTP.CD,GDP per capita (current US$) - NY.GDP.PCAP.CD,General government final consumption expenditure (% of GDP) - NE.CON.GOVT.ZS,Gross national expenditure (% of GDP) - NE.DAB.TOTL.ZS,Gross savings (% of GDP) - NY.GNS.ICTR.ZS,Imports of goods and services (% of GDP) - NE.IMP.GNFS.ZS,"Inflation, consumer prices (annual %) - FP.CPI.TOTL.ZG","Primary completion rate, total (% of relevant age group) - SE.PRM.CMPT.ZS",Proportion of seats held by women in national parliaments (%) - SG.GEN.PARL.ZS,"Pupil-teacher ratio, primary - SE.PRM.ENRL.TC.ZS",Renewable electricity output (% of total electricity output) - EG.ELC.RNEW.ZS,Renewable energy consumption (% of total final energy consumption) - EG.FEC.RNEW.ZS,"School enrollment, preprimary (% gross) - SE.PRE.ENRR","School enrollment, primary (% gross) - SE.PRM.ENRR","School enrollment, secondary (% gross) - SE.SEC.ENRR",Trade 
(% of GDP) - NE.TRD.GNFS.ZS,Women Business and the Law Index Score (scale 1-100) - SG.LAW.INDX,Prevalence of undernourishment (%) - SN_ITK_DEFC - 2.1.1,Proportion of population below international poverty line (%) - SI_POV_DAY1 - 1.1.1,Proportion of population covered by at least a 2G mobile network (%) - IT_MOB_2GNTWK - 9.c.1,Proportion of population covered by at least a 3G mobile network (%) - IT_MOB_3GNTWK - 9.c.1,Proportion of population using basic drinking water services (%) - SP_ACS_BSRVH2O - 1.4.1,"Unemployment rate, male (%) - SL_TLF_UEM - 8.5.2","Unemployment rate, women (%) - SL_TLF_UEM - 8.5.2","Annual production-based emissions of carbon dioxide (CO2), measured in million tonnes",Continent,Gini index (World Bank estimate) - SI.POV.GINI,Income Classification (World Bank Definition),Individuals using the Internet (% of population) - IT.NET.USER.ZS,"Life expectancy at birth, total (years) - SP.DYN.LE00.IN","Population, total - SP.POP.TOTL",Regime Type (RoW Measure Definition),Rural population (% of total population) - SP.RUR.TOTL.ZS,Total natural resources rents (% of GDP) - NY.GDP.TOTL.RT.ZS,Urban population (% of total population) - SP.URB.TOTL.IN.ZS,World Regions (UN SDG Definition),name,alpha-2,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Ecuador""","""ECU""","""2000""","""93.53726196""","""-6.074535102""","""2.279974297""","""2.249033532""","""13.32870086""","""0""","""0.326183809""",,"""20.7136997""","""2.931010008""","""10""",,,"""32.12753529""","""73.93090772""","""18327764882""","""1445.279324""","""9.350397446""","""95.20962873""","""26.76513796""","""27.33716401""","""96.09411369""","""97.24127197""","""14.63414634""","""23.25134""","""71.70184697""","""19.42849922""","""63.34661102""","""111.4111023""","""57.70922852""","""59.4646993""","""75.625""",,"""28.4""",,,,,,"""20.563""","""South America""","""56.4""","""Lower-middle i…",,,,"""Electoral Demo…","""39.701""",,,"""Latin America …","""Ecuador""","""EC""","""218""","""ISO 3166-2:EC""","""Americas""","""Latin America …","""South America""","""19""","""419""","""5"""
"""St. Lucia""","""LCA""","""2000""",,,,"""0.771249762""","""0""","""0""","""0.360523435""",,"""59.11584512""","""8.366000175""","""10""",,,,,"""932592592.6""","""5950.047485""",,,,,"""3.710786087""",,"""11.11111111""","""23.23281""","""0""","""24.05290031""","""63.58243942""","""103.4909363""","""73.9931488""",,"""80.625""",,,,,"""90""","""12.6""","""20.8""","""0.348""","""North America""",,"""Upper-middle i…",,,,,"""72.226""",,,"""Latin America …","""Saint Lucia""","""LC""","""662""","""ISO 3166-2:LC""","""Americas""","""Latin America …","""Caribbean""","""19""","""419""","""29"""
"""Trinidad and T…","""TTO""","""2000""","""91.29""",,,"""5.881107308""","""11.29115152""","""0""","""0.289780793""",,"""45.87028023""","""0.628719985""","""6""",,,,,"""8154338233""","""6435.134212""",,,,,"""3.555413197""","""93.35761261""",,"""20.75006""","""0.366367467""","""0.761300027""","""60.19070816""","""104.9102402""",,,"""63.125""",,,,,"""92""","""10.2""","""15.1""","""24.191""","""North America""",,"""Upper-middle i…",,,,"""Electoral Demo…","""44.095""",,,"""Latin America …","""Trinidad and T…","""TT""","""780""","""ISO 3166-2:TT""","""Americas""","""Latin America …","""Caribbean""","""19""","""419""","""29"""
"""Macao SAR, Chi…","""MAC""","""2000""","""100""",,,"""0.488837289""",,,,,"""156.1878846""","""10.48552036""","""10""",,,"""89.26226342""","""58.85706929""","""6774193548""","""15835.99101""","""12.45194872""","""69.2361456""",,"""58.5002483""","""-1.605719644""","""98.33943939""",,"""29.95057""","""0""","""0.177499995""","""93.94873047""","""104.0669403""","""83.08939362""","""147.7625117""",,,,"""99.99""",,,,,"""1.63""","""Asia""",,"""High income""",,,,,"""0""",,,"""Eastern and So…","""Macao""","""MO""","""446""","""ISO 3166-2:MO""","""Asia""","""Eastern Asia""",,"""142""","""30""",
"""United States""","""USA""","""2000""","""100""","""2.792936204""","""8.840900501""","""1.018018888""","""0.679954011""","""0""","""0.124382217""",,"""68.50483165""",,"""12""",,,"""10.69272396""","""79.98304416""","""10252300000000…","""36334.90878""","""14.02601""","""103.6581925""","""20.84075305""","""14.35091126""","""3.376857271""",,,,"""8.205996942""","""5.429699898""",,"""100.7299576""","""94.04963684""","""25.04363522""","""83.75""",,"""0.7""","""99""",,,"""3.9""","""4.1""","""5998.07""","""North America""","""40.1""","""High income""",,,,"""Liberal Democr…","""20.943""",,,"""Europe and Nor…","""United States""","""US""","""840""","""ISO 3166-2:US""","""Americas""","""Northern Ameri…",,"""19""","""21""",


In [12]:
# Save the joined table (sustainability data plus region columns) as CSV.
merged.write_csv("../data/sustainability_w_regions.csv")