In [1]:
import numpy as np 
import pandas as pd
import re

In [2]:
def clean_text(text):
    """Collapse every run of whitespace in a string to one space and trim the ends.

    Values that are not ``str`` instances are handed back untouched.
    """
    if not isinstance(text, str):
        return text
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

In [3]:
def summary(df):
    """Print the shape of ``df`` and return a per-column overview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``'data type'``,
        ``'#missing'`` (null count), ``'%missing'`` (null count divided by
        the number of rows, i.e. a fraction rather than a percentage) and
        ``'#unique'`` (distinct non-null values).
    """
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    # Count nulls once and reuse the result for both columns.
    missing = df.isnull().sum().values
    summ['#missing'] = missing
    summ['%missing'] = missing / len(df)
    summ['#unique'] = df.nunique().values
    return summ

In [4]:
file_path = '../metadata/hierarchy_topic_by_sheet.xlsx'
xls = pd.ExcelFile(file_path)

# One DataFrame per sheet, keyed by the lower-cased first word of the sheet name.
dfs = {}

for sheet_name in xls.sheet_names:
    first_word = sheet_name.split()[0].lower()
    # Two sheets that start with the same word would otherwise overwrite each
    # other silently and one topic would disappear from the analysis.
    if first_word in dfs:
        raise ValueError(
            f"Sheet '{sheet_name}' maps to key '{first_word}', "
            "which is already used by another sheet"
        )
    df = pd.read_excel(xls, sheet_name=sheet_name)
    dfs[first_word] = df

for name, df in dfs.items():
    print(f"{name}")

economic
education
environment
financial
gender
health
infrastructure
poverty
private
public
social
trade


In [5]:
dfs['education']

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SE.PRM.PRS5.ZS,"Persistence to grade 5, total (% of cohort)",Education: Efficiency,Education,Efficiency,,
1,SE.PRM.GINT.ZS,Gross intake ratio in first grade of primary e...,Education: Efficiency,Education,Efficiency,,
2,SE.PRM.NINT.ZS,Net intake rate in grade 1 (% of official scho...,Education: Efficiency,Education,Efficiency,,
3,SE.PRM.OENR.MA.ZS,"Over-age students, primary, male (% of male en...",Education: Efficiency,Education,Efficiency,,
4,SE.PRM.PRS5.MA.ZS,"Persistence to grade 5, male (% of cohort)",Education: Efficiency,Education,Efficiency,,
...,...,...,...,...,...,...,...
151,SE.ENR.PRIM.FM.ZS,"School enrollment, primary (gross), gender par...",Education: Participation,Education,Participation,,
152,SE.PRM.NENR,"School enrollment, primary (% net)",Education: Participation,Education,Participation,,
153,SE.PRE.ENRR.MA,"School enrollment, preprimary, male (% gross)",Education: Participation,Education,Participation,,
154,SE.PRM.PRIV.ZS,"School enrollment, primary, private (% of tota...",Education: Participation,Education,Participation,,


In [6]:
example_education = dfs['education'].copy()

In [7]:
example_education['code3'] = example_education['Code'].str.split('.').str[:3].str.join('.')

In [8]:
grouped_counts = example_education['code3'].value_counts()
grouped_counts.head()

code3
SE.TER.CUAT    12
SE.SEC.CUAT     9
SE.LPV.PRIM     9
SE.SEC.TCAQ     9
SE.SEC.ENRL     9
Name: count, dtype: int64

In [9]:

with pd.option_context('display.max_colwidth', 200):

    display(example_education[example_education['code3'] == 'SE.TER.CUAT'])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,code3
68,SE.TER.CUAT.BA.FE.ZS,"Educational attainment, at least Bachelor's or equivalent, population 25+, female (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
69,SE.TER.CUAT.DO.FE.ZS,"Educational attainment, Doctoral or equivalent, population 25+, female (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
70,SE.TER.CUAT.BA.MA.ZS,"Educational attainment, at least Bachelor's or equivalent, population 25+, male (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
71,SE.TER.CUAT.DO.MA.ZS,"Educational attainment, Doctoral or equivalent, population 25+, male (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
73,SE.TER.CUAT.ST.FE.ZS,"Educational attainment, at least completed short-cycle tertiary, population 25+, female (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
76,SE.TER.CUAT.ST.MA.ZS,"Educational attainment, at least completed short-cycle tertiary, population 25+, male (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
77,SE.TER.CUAT.MS.ZS,"Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
78,SE.TER.CUAT.BA.ZS,"Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
80,SE.TER.CUAT.DO.ZS,"Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
81,SE.TER.CUAT.MS.MA.ZS,"Educational attainment, at least Master's or equivalent, population 25+, male (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT


In this example of education outcomes, we observe that indicator codes beginning with **SE.TER.CUAT** can be further subdivided to populate relevant subtopics.  

To achieve this, we will:  
1. **Read the WDI Indicators file** to examine the full list of codes and their corresponding descriptions.  
2. **Identify patterns** within the codes that allow for classification into subtopics.  
3. **Extract meaningful subtopic information** from the code structure and descriptions.  

By implementing this approach, we aim to enhance the granularity of our dataset and improve analytical insights into education outcomes.  

In [10]:
wdi_codes = pd.read_excel('../metadata/WDI_Indicators.xlsx', sheet_name="Coding")

In [11]:
wdi_codes

Unnamed: 0,Topic,Topic description,General subject,General subject description,Specific subject,Specific subject description,Extensions,Extensions description
0,AG,Agriculture,ACS,Access,0003,Age 0-3,05,2005 (PPP)
1,BG,Balance of payments: gross,ADJ,Adjusted savings,0004,Age 0-4,10,Decile
2,BM,"Balance of payments: imports, payments (credit)",ADM,Admission,0014,Age 0-14,14,Age 0-14
3,BN,Balance of payments: net,ADO,Adolescent,0324,Age 0-24,20,Quintile
4,BX,"Balance of payments: exports, receipts (debit)",ADT,Adult,0306,Age 3-6,90,% changes since 1990
...,...,...,...,...,...,...,...,...
1057,,,,,XOKA,Excluding official capital transfers,,
1058,,,,,XPND,Expenditure,,
1059,,,,,XPRT,Exports,,
1060,,,,,XTHR,Other manufactures (trade),,


### Understanding the Code Structure using `wdi_codes`

Based on our analysis, we have identified a general pattern in the structure of the codes formatted as **XX.XX.XX...**. While this does not apply to all cases, the common breakdown is as follows:

1. **Topic Column**: Corresponds to the `Topic` and `SubTopic1` of the dataset as the values for that column are **name : name**.
2. **General subject Column**: Represents the `SubTopic2`.
3. **Specific subject Column**: Defines the `SubTopic3`.
4. **Extension (the rest)**: Any additional components following the third part can be considered **Extensions**, which provide further granularity or variations within the specific subject. Note that we don't have a column for that yet; we will find a way to add extension information if it turns out to be important to our work.

By leveraging this structure, we can systematically extract hierarchical information from the codes and use it for improved classification and mapping in our analysis.


We will delete the `Topic` and `Topic description` columns, as the topic metadata is already filled in.

In [12]:
# Reassign instead of dropping in place, and tolerate columns that are already
# gone, so that re-running this cell does not raise a KeyError.
wdi_codes = wdi_codes.drop(columns=['Topic', 'Topic description'], errors='ignore')
wdi_codes.head()

Unnamed: 0,General subject,General subject description,Specific subject,Specific subject description,Extensions,Extensions description
0,ACS,Access,3,Age 0-3,5,2005 (PPP)
1,ADJ,Adjusted savings,4,Age 0-4,10,Decile
2,ADM,Admission,14,Age 0-14,14,Age 0-14
3,ADO,Adolescent,324,Age 0-24,20,Quintile
4,ADT,Adult,306,Age 3-6,90,% changes since 1990


In [13]:
# Report codes that occur more than once inside each code column
# (missing values are ignored).
for col in ['General subject', 'Specific subject', 'Extensions']:
    non_null = wdi_codes[col].dropna()
    redundant = non_null[non_null.duplicated()]
    if redundant.empty:
        print(f"No redundant values in column '{col}'.")
        continue
    print(f"Redundant values in column '{col}':")
    print(redundant)


No redundant values in column 'General subject'.
No redundant values in column 'Specific subject'.
No redundant values in column 'Extensions'.


In [14]:
# The three code columns are independent lookup lists that merely share a
# sheet, so a code reused by two columns can sit on different rows. Check both
# the same-row case and the whole-column overlap.
code_columns = ['General subject', 'Specific subject', 'Extensions']
general, specific, extension = (wdi_codes[col] for col in code_columns)

# Same-row matches, computed as a mask so wdi_codes is never modified.
# NaN never equals NaN; the notna() guards additionally rule out None == None.
has_match = (
    (general.notna() & ((general == specific) | (general == extension)))
    | (specific.notna() & (specific == extension))
)

matching_rows = wdi_codes[has_match]
print("Rows with matching values between columns:")
print(matching_rows[code_columns])

# Codes shared by two columns anywhere in the table (row position ignored).
for position, left in enumerate(code_columns):
    for right in code_columns[position + 1:]:
        shared = set(wdi_codes[left].dropna()) & set(wdi_codes[right].dropna())
        if shared:
            print(f"Codes present in both '{left}' and '{right}':")
            # key=str keeps the sort well-defined if a column mixes numbers and text.
            print(sorted(shared, key=str))
        else:
            print(f"No codes shared between '{left}' and '{right}'.")


Rows with matching values between columns:
Empty DataFrame
Columns: [General subject, Specific subject, Extensions]
Index: []


In [15]:
# df_melted = pd.melt(wdi_codes, 
#                     value_vars=['General subject', 'Specific subject', 'Extensions'], 
#                     var_name='Key_Type', value_name='Key')

# df_melted['Value'] = pd.melt(wdi_codes, 
#                              value_vars=['General subject description', 'Specific subject description', 'Extensions description'], 
#                              var_name='Value_Type', value_name='Value')['Value']

# df_melted.dropna(subset=['Key', 'Value'], how='any', inplace=True)


In [16]:
# df_melted

## Testing in the example of education `example_education`

In [17]:
print(example_education['SubTopic2'].notna().any())
print(example_education['SubTopic3'].notna().any())

False
False


# This is the main function we will be using to assign the subtopics

In [18]:
def _lookup_description(df2, key_column, description_column, key):
    """Return the description stored next to ``key`` in ``df2``, or None if absent or blank."""
    matches = df2.loc[df2[key_column] == key, description_column].values
    if matches.size == 0:
        return None
    description = matches[0]
    return None if pd.isna(description) else description


def _merge_subtopic(existing, candidate):
    """Combine the value already in a subtopic cell with the WDI-derived candidate."""
    if isinstance(existing, list):
        # Already merged by an earlier run: only append a genuinely new candidate.
        if candidate is None or candidate in existing:
            return existing
        return existing + [candidate]
    if pd.isna(existing):
        return candidate
    if candidate is None or existing == candidate:
        # Nothing new to add; never pair an existing label with None.
        return existing
    return [existing, candidate]


def extract_and_match(row, df2):
    """Fill ``SubTopic2`` / ``SubTopic3`` of one indicator row from its code.

    The second and third dot-separated segments of ``row['Code']`` are looked
    up in ``df2`` ('General subject' and 'Specific subject' respectively).
    Codes with fewer than three segments, or a non-string code, yield no
    descriptions.

    For each subtopic:
    * an empty cell receives the description (or None when none was found);
    * a cell whose value differs from the description becomes the list
      ``[existing, description]`` so that the conflict can be reviewed later;
    * a cell is left untouched when no description was found or it already
      matches.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'Code', 'SubTopic2' and 'SubTopic3'.
    df2 : pandas.DataFrame
        Must provide 'General subject', 'General subject description',
        'Specific subject' and 'Specific subject description'.

    Returns
    -------
    pandas.Series
        A modified copy of ``row``; the input is not changed.
    """
    code = row['Code']
    parts = code.split('.') if isinstance(code, str) else []

    if len(parts) < 3:
        first_desc, second_desc = None, None
    else:
        # NOTE(review): segments are compared as strings; this assumes df2
        # stores codes such as '0003' as text - confirm how the sheet is read.
        first_desc = _lookup_description(
            df2, 'General subject', 'General subject description', parts[1]
        )
        second_desc = _lookup_description(
            df2, 'Specific subject', 'Specific subject description', parts[2]
        )

    # Work on a copy: mutating the object handed over by DataFrame.apply is unsafe.
    row = row.copy()
    row['SubTopic2'] = _merge_subtopic(row['SubTopic2'], first_desc)
    row['SubTopic3'] = _merge_subtopic(row['SubTopic3'], second_desc)
    return row



In [19]:
example_education = example_education.apply(extract_and_match, axis=1, df2=wdi_codes)

In [20]:
example_education

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,code3
0,SE.PRM.PRS5.ZS,"Persistence to grade 5, total (% of cohort)",Education: Efficiency,Education,Efficiency,Primary education,Persistence to grade 5,SE.PRM.PRS5
1,SE.PRM.GINT.ZS,Gross intake ratio in first grade of primary e...,Education: Efficiency,Education,Efficiency,Primary education,Gross intake,SE.PRM.GINT
2,SE.PRM.NINT.ZS,Net intake rate in grade 1 (% of official scho...,Education: Efficiency,Education,Efficiency,Primary education,Net intake,SE.PRM.NINT
3,SE.PRM.OENR.MA.ZS,"Over-age students, primary, male (% of male en...",Education: Efficiency,Education,Efficiency,Primary education,Over-age enrollment,SE.PRM.OENR
4,SE.PRM.PRS5.MA.ZS,"Persistence to grade 5, male (% of cohort)",Education: Efficiency,Education,Efficiency,Primary education,Persistence to grade 5,SE.PRM.PRS5
...,...,...,...,...,...,...,...,...
151,SE.ENR.PRIM.FM.ZS,"School enrollment, primary (gross), gender par...",Education: Participation,Education,Participation,Enrollment,Primary education,SE.ENR.PRIM
152,SE.PRM.NENR,"School enrollment, primary (% net)",Education: Participation,Education,Participation,Primary education,Net enrollment,SE.PRM.NENR
153,SE.PRE.ENRR.MA,"School enrollment, preprimary, male (% gross)",Education: Participation,Education,Participation,Preprimary education,Enrolment rate,SE.PRE.ENRR
154,SE.PRM.PRIV.ZS,"School enrollment, primary, private (% of tota...",Education: Participation,Education,Participation,Primary education,Private,SE.PRM.PRIV


In [21]:
summary(example_education).style.background_gradient(cmap='Blues')

data shape: (156, 8)


Unnamed: 0,data type,#missing,%missing,#unique
Code,object,0,0.0,156
Indicator Name,object,0,0.0,156
General Topic,object,0,0.0,4
Topic,object,0,0.0,1
SubTopic1,object,0,0.0,4
SubTopic2,object,9,0.057692,8
SubTopic3,object,0,0.0,30
code3,object,0,0.0,54


In [22]:
with pd.option_context('display.max_colwidth', 200):

    display(example_education[example_education['SubTopic2'].isna()])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,code3
75,SE.LPV.PRIM.SD,Primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
89,SE.LPV.PRIM.LD,Pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
94,SE.LPV.PRIM.MA,Learning poverty: Share of Male Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
95,SE.LPV.PRIM.FE,Learning poverty: Share of Female Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
101,SE.LPV.PRIM,Learning poverty: Share of Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
104,SE.LPV.PRIM.LD.FE,Female pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
105,SE.LPV.PRIM.SD.FE,Female primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
106,SE.LPV.PRIM.SD.MA,Male primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
109,SE.LPV.PRIM.LD.MA,Male pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM


In [23]:
# The rows left without SubTopic2 are the SE.LPV.* indicators, whose names read
# "Learning poverty: ..." - label them accordingly.
example_education['SubTopic2'] = example_education['SubTopic2'].fillna('Learning poverty')
summary(example_education).style.background_gradient(cmap='Blues')

data shape: (156, 8)


Unnamed: 0,data type,#missing,%missing,#unique
Code,object,0,0.0,156
Indicator Name,object,0,0.0,156
General Topic,object,0,0.0,4
Topic,object,0,0.0,1
SubTopic1,object,0,0.0,4
SubTopic2,object,0,0.0,9
SubTopic3,object,0,0.0,30
code3,object,0,0.0,54


## In this example
As we can see, we were able to extend the granularity and hierarchy of the education metadata.

There are missing values for `SubTopic2` (the `SE.LPV.*` learning-poverty indicators shown above), and after further research, we impute these with a single descriptive label for the **LPV** code segment, as done in the `fillna` cell above.

For now, this will be done manually for each sheet.

## We apply our function to the entire df

In [24]:
# Enrich every topic sheet with the SubTopic2 / SubTopic3 values derived from
# the WDI coding table, replacing each entry of dfs with the enriched frame.
for key, df in dfs.items():
    dfs[key] = df = df.apply(extract_and_match, axis=1, df2=wdi_codes)


In [25]:
# Per-sheet count of rows that still lack a SubTopic2 / SubTopic3 value.
for key, df in dfs.items():
    missing_subtopic2, missing_subtopic3 = (
        df[column].isnull().sum() for column in ('SubTopic2', 'SubTopic3')
    )

    report = [
        f"\n{'*' * 10} {key} {'*' * 10}",
        f"Missing subtopic2: {missing_subtopic2}",
        f"Missing subtopic3: {missing_subtopic3}",
        "-" * 40,
    ]
    print("\n".join(report))


********** economic **********
Missing subtopic2: 0
Missing subtopic3: 15
----------------------------------------

********** education **********
Missing subtopic2: 9
Missing subtopic3: 0
----------------------------------------

********** environment **********
Missing subtopic2: 43
Missing subtopic3: 44
----------------------------------------

********** financial **********
Missing subtopic2: 0
Missing subtopic3: 0
----------------------------------------

********** gender **********
Missing subtopic2: 1
Missing subtopic3: 2
----------------------------------------

********** health **********
Missing subtopic2: 12
Missing subtopic3: 45
----------------------------------------

********** infrastructure **********
Missing subtopic2: 0
Missing subtopic3: 0
----------------------------------------

********** poverty **********
Missing subtopic2: 0
Missing subtopic3: 9
----------------------------------------

********** private **********
Missing subtopic2: 6
Missing subtopic3

### For now, I think the best way is to go through each sheet individually and handle the missing values and the differing values held as lists in the subtopic columns (using ChatGPT + the WDI documentation)

# Economic

In [26]:
economic = dfs["economic"]

**Notice that we don't create a copy, as we want the changes to be reflected in `dfs["economic"]`.**

In [27]:
with pd.option_context('display.max_colwidth', 200):

    display(economic[economic['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
269,DT.NFL.WITC.CD,"Net official flows from UN agencies, WTO-ITC (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
286,DC.DAC.LTUL.CD,"Net bilateral aid flows from DAC donors, Lithuania (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Development Assistance Committee (OECD),
304,DT.NFL.CERF.CD,"Net official flows from UN agencies, CERF (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
308,DT.NFL.UNCTAD.CD,"Net official flows from UN agencies, UNCTAD (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
311,DT.NFL.UNEP.CD,"Net official flows from UN agencies, UNEP (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
315,DT.NFL.UNID.CD,"Net official flows from UN agencies, UNIDIR (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
316,DT.NFL.UNIDO.CD,"Net official flows from UN agencies, UNIDO (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
319,DT.NFL.UNWN.CD,"Net official flows from UN agencies, UNWOMEN (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
320,DT.NFL.UNWT.CD,"Net official flows from UN agencies, UNWTO (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
323,DT.NFL.UNCV.CD,"Net official flows from UN agencies, UNCOVID (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,


In [28]:
# Labels for third code segments that the WDI coding sheet does not describe.
# Every label must agree with the indicator name of the rows it is applied to
# (see the rows with a missing SubTopic3 displayed above).
economic_mapping = {
    # Labels aligned with the indicator names shown in the preview above.
    "WITC": "WTO-ITC (International Trade Centre)",
    "LTUL": "Lithuania",
    "CERF": "Central Emergency Response Fund",
    "UNCTAD": "United Nations Conference on Trade and Development",
    "UNEP": "United Nations Environment Programme",
    "UNID": "United Nations Institute for Disarmament Research (UNIDIR)",
    "UNIDO": "United Nations Industrial Development Organization",
    "UNWN": "UN Women",
    "UNWT": "United Nations World Tourism Organization",
    "UNCV": "UNCOVID",
    # NOTE(review): the rows using the codes below are cut off in the preview,
    # so these labels are unverified - check each one against the
    # 'Indicator Name' column before relying on it.
    "SPRP": "Social Protection and Resilience Programs",
    "HUNL": "Hunger Levels (Unconfirmed)",
    "ESTL": "Estimated Labor Statistics",
    "SDGF": "Sustainable Development Goals Fund",
    "UNCD": "United Nations Conference on Disarmament",
}

# Only rows without a SubTopic3 are filled; the assignment aligns on the index.
economic.loc[economic["SubTopic3"].isna(), "SubTopic3"] = (
    economic["Code"].str.split(".").str[2].map(economic_mapping)
)


In [29]:
with pd.option_context('display.max_colwidth', 200):

    display(economic[economic['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [50]:
def get_code_prefix(code):
    """Return the first two dot-separated segments of an indicator code, re-joined with a dot."""
    leading_segments = code.split(".")[:2]
    return ".".join(leading_segments)

# Rows where the hierarchy sheet and the WDI coding disagree hold a list in
# SubTopic2 and/or SubTopic3.
subtopic2_is_list = economic['SubTopic2'].apply(lambda x: isinstance(x, list))
subtopic3_is_list = economic['SubTopic3'].apply(lambda x: isinstance(x, list))

# Copy before adding a column so the economic frame itself stays untouched.
filtered = economic[subtopic2_is_list | subtopic3_is_list].copy()
filtered["CodeGroup"] = filtered["Code"].apply(get_code_prefix)

# One group per two-segment code prefix, for manual inspection.
grouped = filtered.groupby("CodeGroup")



In [67]:
group_keys = list(grouped.groups.keys())

index = 12  

selected_group_key = group_keys[index]
selected_group_df = grouped.get_group(selected_group_key)


with pd.option_context('display.max_colwidth', 200):

    display(selected_group_df)


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,CodeGroup
4,BX.GSR.CMCP.ZS,"Communications, computer, etc. (% of service exports, BoP)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Communications, computer, etc.]",BX.GSR
5,BX.GSR.FCTY.CD,"Primary income receipts (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Factor income]",BX.GSR
13,BX.GSR.GNFS.CD,"Exports of goods and services (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Goods and (nonfactor) services]",BX.GSR
15,BX.GSR.TRAN.ZS,"Transport services (% of service exports, BoP)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Transport]",BX.GSR
17,BX.GSR.TOTL.CD,"Exports of goods, services and primary income (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Total]",BX.GSR
21,BX.GSR.INSF.ZS,"Insurance and financial services (% of service exports, BoP)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Insurance and financial services]",BX.GSR
24,BX.GSR.NFSV.CD,"Service exports (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, (Nonfactor) services]",BX.GSR
29,BX.GSR.ROYL.CD,"Charges for the use of intellectual property, receipts (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Royalty and license fees]",BX.GSR
38,BX.GSR.TRVL.ZS,"Travel services (% of service exports, BoP)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Travel]",BX.GSR
49,BX.GSR.MRCH.CD,"Goods exports (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Goods (merchandise)]",BX.GSR


### For the various examples we analyzed, we concluded that selecting the values added from the WDI (World Development Indicators) dataset provides a more accurate representation. These values offer a finer level of granularity, enhancing our ability to analyze and interpret the data effectively.

In [85]:
def select_second_item(value):
    """Resolve a ``[hierarchy value, WDI value]`` pair to a single label.

    Parameters
    ----------
    value : object
        A cell value. Lists with at least two items are treated as a pair of
        conflicting labels.

    Returns
    -------
    object
        The second item of such a list, or the first item when the second
        one is missing (None/NaN). Any other value, including shorter
        lists, is returned unchanged.
    """
    if isinstance(value, list) and len(value) > 1:
        second = value[1]
        # A missing WDI description must not erase the label that was already there.
        if second is None or (not isinstance(second, list) and pd.isna(second)):
            return value[0]
        return second
    return value

In [88]:
economic["SubTopic2"] = economic["SubTopic2"].apply(select_second_item)
economic["SubTopic3"] = economic["SubTopic3"].apply(select_second_item)

# Education

In [92]:
education = dfs['education']

In [93]:
with pd.option_context('display.max_colwidth', 200):

    display(education[education['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
75,SE.LPV.PRIM.SD,Primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education
89,SE.LPV.PRIM.LD,Pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education
94,SE.LPV.PRIM.MA,Learning poverty: Share of Male Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education
95,SE.LPV.PRIM.FE,Learning poverty: Share of Female Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education
101,SE.LPV.PRIM,Learning poverty: Share of Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education
104,SE.LPV.PRIM.LD.FE,Female pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education
105,SE.LPV.PRIM.SD.FE,Female primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education
106,SE.LPV.PRIM.SD.MA,Male primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education
109,SE.LPV.PRIM.LD.MA,Male pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education


In [94]:
# The rows left without SubTopic2 are the SE.LPV.* indicators, whose names read
# "Learning poverty: ..." - label them accordingly.
education['SubTopic2'] = education['SubTopic2'].fillna('Learning poverty')

In [96]:
with pd.option_context('display.max_colwidth', 200):

    display(education[education['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [99]:
# Flag rows whose SubTopic2 or SubTopic3 holds a list of conflicting labels.
subtopic2_is_list = education['SubTopic2'].map(lambda value: isinstance(value, list))
subtopic3_is_list = education['SubTopic3'].map(lambda value: isinstance(value, list))
has_list = subtopic2_is_list | subtopic3_is_list

education_with_lists = education[has_list]
print(education_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


# Environment 

In [102]:
environment = dfs["environment"]

In [105]:
with pd.option_context('display.max_colwidth', 200):

    display(environment[(environment['SubTopic2'].isna())|(environment['SubTopic3'].isna())])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
65,ER.H2O.FWST.ZS,Level of water stress: freshwater withdrawal as a proportion of available freshwater resources,Environment: Freshwater,Environment,Freshwater,Water,
71,EN.GHG.ALL.LU.MT.CE.AR5,Total greenhouse gas emissions including LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
72,EN.GHG.ALL.PC.CE.AR5,Total greenhouse gas emissions per capita excluding LULUCF (t CO2e/capita),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
73,EN.GHG.CO2.MT.CE.AR5,Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
74,EN.GHG.CO2.LU.DF.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Deforestation (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
75,EN.GHG.CO2.LU.OS.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Organic Soil (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
76,EN.GHG.FGAS.IP.MT.CE.AR5,Fluorinated greenhouse gases (F-gases) emissions from Industrial Processes (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
77,EN.GHG.ALL.MT.CE.AR5,Total greenhouse gas emissions excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
78,EN.GHG.CO2.ZG.AR5,Carbon dioxide (CO2) emissions (total) excluding LULUCF (% change from 1990),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
79,EN.GHG.CO2.LU.OL.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Other Land (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,


In [115]:
# environment_mapping_subtopic3 = {
#     "FWST": "freshwater withdrawal"
# }

In [123]:
environment.loc[environment["Code"] == "ER.H2O.FWST.ZS", "SubTopic3"] = "Water stress: freshwater withdrawal"


In [109]:
# Labels keyed by the third dot-separated segment of the indicator code.
environment_mapping = {
    "ALL": "All Greenhouse Gases",
    "FGAS": "Fluorinated Gases emissions",
    "CH4": "Methane emissions",
    "N2O": "Nitrous oxide emissions",
    "CO2": "Carbon dioxide emissions",
    "TOT": "Total"
}

# Fill only the rows that have no SubTopic2 yet; segments without an entry in
# the mapping stay missing.
subtopic2_missing = environment["SubTopic2"].isna()
third_segment = environment["Code"].str.split(".").str[2]
environment.loc[subtopic2_missing, "SubTopic2"] = third_segment.map(environment_mapping)


In [118]:
with pd.option_context('display.max_colwidth', 200):

    display(environment[environment['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [124]:
with pd.option_context('display.max_colwidth', 200):

    display(environment[environment['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
72,EN.GHG.ALL.PC.CE.AR5,Total greenhouse gas emissions per capita excluding LULUCF (t CO2e/capita),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,
73,EN.GHG.CO2.MT.CE.AR5,Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,
77,EN.GHG.ALL.MT.CE.AR5,Total greenhouse gas emissions excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,
78,EN.GHG.CO2.ZG.AR5,Carbon dioxide (CO2) emissions (total) excluding LULUCF (% change from 1990),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,
84,EN.GHG.TOT.ZG.AR5,Total greenhouse gas emissions excluding LULUCF (% change from 1990),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Total,
87,EN.GHG.CH4.ZG.AR5,Methane (CH4) emissions (total) excluding LULUCF (% change from 1990),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
88,EN.GHG.N2O.ZG.AR5,Nitrous oxide (N2O) emissions (total) excluding LULUCF (% change from 1990),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,
95,EN.GHG.N2O.MT.CE.AR5,Nitrous oxide (N2O) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,
97,EN.GHG.CH4.MT.CE.AR5,Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
113,EN.GHG.CO2.PC.CE.AR5,Carbon dioxide (CO2) emissions excluding LULUCF per capita (t CO2e/capita),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,


In [114]:
# Lookup from the 4th dot-separated token of an indicator code to its SubTopic3 label.
environment_mapping_subtopic3 = dict(
    LU="Including Land Use and Land-Use Change (LULUC)",
    RT="Carbon intensity of GDP",
    OS="Other Sources",
    OL="Other Land Use",
    WA="Waste",
    IC="Industrial Combustion",
    IP="Industrial Processes",
    BU="Buildings",
    AG="Agriculture",
    PI="Power Industry",
    TR="Transport",
    FE="Fugitive Emissions",
)

# Only rows whose SubTopic3 is still missing are written; the right-hand Series is
# aligned on the index, and tokens absent from the lookup map to NaN (row stays empty).
environment.loc[environment["SubTopic3"].isna(), "SubTopic3"] = (
    environment["Code"]
    .str.split(".")
    .str.get(3)
    .map(environment_mapping_subtopic3)
)


In [119]:
# Show the greenhouse-gas indicators (codes starting with EN.GHG) with text
# columns widened to 200 characters so labels are not truncated.
with pd.option_context('display.max_colwidth', 200):
    display(environment.loc[environment['Code'].str.startswith('EN.GHG')])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
71,EN.GHG.ALL.LU.MT.CE.AR5,Total greenhouse gas emissions including LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,Including Land Use and Land-Use Change (LULUC)
72,EN.GHG.ALL.PC.CE.AR5,Total greenhouse gas emissions per capita excluding LULUCF (t CO2e/capita),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,
73,EN.GHG.CO2.MT.CE.AR5,Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,
74,EN.GHG.CO2.LU.DF.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Deforestation (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC)
75,EN.GHG.CO2.LU.OS.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Organic Soil (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC)
76,EN.GHG.FGAS.IP.MT.CE.AR5,Fluorinated greenhouse gases (F-gases) emissions from Industrial Processes (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Fluorinated Gases,Industrial Processes
77,EN.GHG.ALL.MT.CE.AR5,Total greenhouse gas emissions excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,
78,EN.GHG.CO2.ZG.AR5,Carbon dioxide (CO2) emissions (total) excluding LULUCF (% change from 1990),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,
79,EN.GHG.CO2.LU.OL.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Other Land (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC)
80,EN.GHG.CO2.RT.GDP.PP.KD,Carbon intensity of GDP (kg CO2e per 2021 PPP $),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Carbon intensity of GDP


In [132]:
# Label whatever SubTopic3 values are still missing as the counterpart of the
# "Including Land Use and Land-Use Change (LULUC)" label (single space after
# "Excluding" so both labels are formatted the same way).
# NOTE(review): this fills every row whose SubTopic3 is NaN, not only EN.GHG
# codes — confirm no other environment rows are still empty at this point.
environment.loc[environment["SubTopic3"].isna(), "SubTopic3"] = "Excluding Land Use and Land-Use Change (LULUC)"


In [133]:
# Re-inspect the EN.GHG indicators with untruncated text columns.
with pd.option_context('display.max_colwidth', 200):
    display(environment.loc[environment['Code'].str.startswith('EN.GHG')])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
71,EN.GHG.ALL.LU.MT.CE.AR5,Total greenhouse gas emissions including LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,Including Land Use and Land-Use Change (LULUC)
72,EN.GHG.ALL.PC.CE.AR5,Total greenhouse gas emissions per capita excluding LULUCF (t CO2e/capita),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,Excluding Land Use and Land-Use Change (LULUC)
73,EN.GHG.CO2.MT.CE.AR5,Carbon dioxide (CO2) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Excluding Land Use and Land-Use Change (LULUC)
74,EN.GHG.CO2.LU.DF.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Deforestation (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC)
75,EN.GHG.CO2.LU.OS.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Organic Soil (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC)
76,EN.GHG.FGAS.IP.MT.CE.AR5,Fluorinated greenhouse gases (F-gases) emissions from Industrial Processes (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Fluorinated Gases,Industrial Processes
77,EN.GHG.ALL.MT.CE.AR5,Total greenhouse gas emissions excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,Excluding Land Use and Land-Use Change (LULUC)
78,EN.GHG.CO2.ZG.AR5,Carbon dioxide (CO2) emissions (total) excluding LULUCF (% change from 1990),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Excluding Land Use and Land-Use Change (LULUC)
79,EN.GHG.CO2.LU.OL.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Other Land (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC)
80,EN.GHG.CO2.RT.GDP.PP.KD,Carbon intensity of GDP (kg CO2e per 2021 PPP $),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Carbon intensity of GDP


In [131]:
# Sanity check: flag rows where SubTopic2 or SubTopic3 holds a Python list
# instead of a plain value.
has_list = environment.apply(
    lambda row: any(isinstance(row[col], list) for col in ('SubTopic2', 'SubTopic3')),
    axis=1,
)

# An empty frame printed here means no list values were found.
environment_with_lists = environment[has_list]
print(environment_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


# Financial 

In [135]:
# Bind the "financial" sheet. This is the same DataFrame object held in dfs
# (no copy), so in-place edits through this name are also visible via dfs["financial"].
financial  = dfs["financial"]

In [137]:
# List financial rows that lack SubTopic2 or SubTopic3, with wide text columns.
with pd.option_context('display.max_colwidth', 200):
    display(financial.loc[financial[['SubTopic2', 'SubTopic3']].isna().any(axis=1)])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [139]:
# Bare last expression: the notebook renders the frame as the cell output.
financial

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,FX.OWN.TOTL.SO.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
1,FB.ATM.TOTL.P5,"Automated teller machines (ATMs) (per 100,000 ...",Financial Sector: Access,Financial Sector,Access,Atmosphere (or automated teller machine),Total
2,FX.OWN.TOTL.YG.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
3,FX.OWN.TOTL.60.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
4,FX.OWN.TOTL.MA.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
5,FX.OWN.TOTL.OL.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
6,FX.OWN.TOTL.40.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
7,SI.RMT.COST.IB.ZS,Average transaction cost of sending remittance...,Financial Sector: Access,Financial Sector,Access,Remittances,Cost
8,SI.RMT.COST.OB.ZS,Average transaction cost of sending remittance...,Financial Sector: Access,Financial Sector,Access,Remittances,Cost
9,FB.CBK.BRCH.P5,"Commercial bank branches (per 100,000 adults)",Financial Sector: Access,Financial Sector,Access,Commercial bank,Bank branches


In [140]:
# Sanity check: flag rows where SubTopic2 or SubTopic3 holds a Python list
# instead of a plain value.
has_list = financial.apply(
    lambda row: any(isinstance(row[col], list) for col in ('SubTopic2', 'SubTopic3')),
    axis=1,
)

# An empty frame printed here means no list values were found.
financial_with_lists = financial[has_list]
print(financial_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


# Gender 

In [141]:
# Bind the "gender" sheet. This is the same DataFrame object held in dfs
# (no copy), so the .loc fixes below also change dfs["gender"].
gender  = dfs["gender"]

In [146]:
# Render the gender sheet with text columns widened to 200 characters
# so long indicator names are shown in full.
with pd.option_context('display.max_colwidth', 200):

    display(gender)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SP.M18.2024.FE.ZS,Women who were first married by age 18 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 18,Age 20-24
1,SP.M15.2024.FE.ZS,Women who were first married by age 15 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 15,Age 20-24
2,SG.VAW.REAS.ZS,Women who believe a husband is justified in beating his wife (any of five reasons) (%),Gender: Health,Gender,Health,Violence against women,Reason
3,SG.VAW.ARGU.ZS,Women who believe a husband is justified in beating his wife when she argues with him (%),Gender: Health,Gender,Health,Violence against women,Argue
4,SG.VAW.1549.ZS,Proportion of women subjected to physical and/or sexual violence in the last 12 months (% of ever-partnered women ages 15-49),Gender: Health,Gender,Health,Violence against women,
5,SG.VAW.BURN.ZS,Women who believe a husband is justified in beating his wife when she burns the food (%),Gender: Health,Gender,Health,Violence against women,Burns food
6,SG.VAW.GOES.ZS,Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Gender: Health,Gender,Health,Violence against women,Goes out
7,SG.VAW.NEGL.ZS,Women who believe a husband is justified in beating his wife when she neglects the children (%),Gender: Health,Gender,Health,Violence against women,Neglects children
8,SG.VAW.REFU.ZS,Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Gender: Health,Gender,Health,Violence against women,Refuse
9,SG.TIM.UWRK.MA,"Proportion of time spent on unpaid domestic and care work, male (% of 24 hour day)",Gender: Participation & access,Gender,Participation & access,Time,Unpaid work


In [142]:
# List gender rows that have no SubTopic2, with wide text columns.
with pd.option_context('display.max_colwidth', 200):
    display(gender.loc[gender['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
1,SP.M15.2024.FE.ZS,Women who were first married by age 15 (% of women ages 20-24),Gender: Agency,Gender,Agency,,Age 20-24


In [145]:
# Manual fix: give SP.M15.2024.FE.ZS a SubTopic2 that parallels the
# "Married by age 18" label used for SP.M18.2024.FE.ZS.
gender.loc[gender["Code"].eq("SP.M15.2024.FE.ZS"), "SubTopic2"] = "Married by age 15"

In [147]:
# List gender rows that have no SubTopic3, with wide text columns.
with pd.option_context('display.max_colwidth', 200):
    display(gender.loc[gender['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
4,SG.VAW.1549.ZS,Proportion of women subjected to physical and/or sexual violence in the last 12 months (% of ever-partnered women ages 15-49),Gender: Health,Gender,Health,Violence against women,
11,SG.DMK.SRCR.FN.ZS,"Women making their own informed decisions regarding sexual relations, contraceptive use and reproductive health care (% of women age 15-49)",Gender: Public life & decision making,Gender,Public life & decision making,Decision making,


In [148]:
# Manual fix: both indicators are measured for women aged 15-49 (see their
# indicator names), so that age band is used as the SubTopic3 label.
gender.loc[
    gender["Code"].isin(["SG.VAW.1549.ZS", "SG.DMK.SRCR.FN.ZS"]),
    "SubTopic3",
] = "Age 15-49"

In [149]:
# Render the gender sheet again, with text columns widened to 200 characters,
# to inspect the SubTopic2/SubTopic3 values.
with pd.option_context('display.max_colwidth', 200):

    display(gender)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SP.M18.2024.FE.ZS,Women who were first married by age 18 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 18,Age 20-24
1,SP.M15.2024.FE.ZS,Women who were first married by age 15 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 15,Age 20-24
2,SG.VAW.REAS.ZS,Women who believe a husband is justified in beating his wife (any of five reasons) (%),Gender: Health,Gender,Health,Violence against women,Reason
3,SG.VAW.ARGU.ZS,Women who believe a husband is justified in beating his wife when she argues with him (%),Gender: Health,Gender,Health,Violence against women,Argue
4,SG.VAW.1549.ZS,Proportion of women subjected to physical and/or sexual violence in the last 12 months (% of ever-partnered women ages 15-49),Gender: Health,Gender,Health,Violence against women,Age 15-49
5,SG.VAW.BURN.ZS,Women who believe a husband is justified in beating his wife when she burns the food (%),Gender: Health,Gender,Health,Violence against women,Burns food
6,SG.VAW.GOES.ZS,Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Gender: Health,Gender,Health,Violence against women,Goes out
7,SG.VAW.NEGL.ZS,Women who believe a husband is justified in beating his wife when she neglects the children (%),Gender: Health,Gender,Health,Violence against women,Neglects children
8,SG.VAW.REFU.ZS,Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Gender: Health,Gender,Health,Violence against women,Refuse
9,SG.TIM.UWRK.MA,"Proportion of time spent on unpaid domestic and care work, male (% of 24 hour day)",Gender: Participation & access,Gender,Participation & access,Time,Unpaid work


In [150]:
# Sanity check: flag rows where SubTopic2 or SubTopic3 holds a Python list
# instead of a plain value.
has_list = gender.apply(
    lambda row: any(isinstance(row[col], list) for col in ('SubTopic2', 'SubTopic3')),
    axis=1,
)

# An empty frame printed here means no list values were found.
gender_with_lists = gender[has_list]
print(gender_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


# Health  

In [151]:
# Bind the "health" sheet. This is the same DataFrame object held in dfs
# (no copy), so the .loc fixes below also change dfs["health"].
health  = dfs["health"]

In [174]:
# Render the health sheet with text columns widened to 200 characters
# so long indicator names are shown in full.
with pd.option_context('display.max_colwidth', 200):

    display(health)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SH.MLR.NETS.ZS,Use of insecticide-treated bed nets (% of under-5 population),Health: Disease prevention,Health,Disease prevention,Malaria,Insecticide-treated bed nets
1,SH.STA.SMSS.RU.ZS,"People using safely managed sanitation services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Stationary,
2,SH.IMM.MEAS,"Immunization, measles (% of children ages 12-23 months)",Health: Disease prevention,Health,Disease prevention,Immunization,Measles
3,SH.STA.SMSS.ZS,People using safely managed sanitation services (% of population),Health: Disease prevention,Health,Disease prevention,Stationary,
4,SH.H2O.SMDW.RU.ZS,"People using safely managed drinking water services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Water,
...,...,...,...,...,...,...,...
243,SH.UHC.TOTR.ZS,Proportion of population pushed or further pushed below the 60% median consumption poverty line by out-of-pocket health expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed or further pushed below),60% median consumption poverty line
244,SH.UHC.OOPC.25.ZS,Proportion of population spending more than 25% of household consumption or income on out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (spending more),25% of household consumption or income
245,SH.UHC.FBPR.ZS,Proportion of population pushed further below the 60% median consumption poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed further below),60% median consumption poverty line
246,SH.UHC.NOP1.ZS,Proportion of population pushed below the $2.15 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed below),2.15(2017 PPP) poverty line


In [170]:
# List health rows that have no SubTopic2, with wide text columns.
with pd.option_context('display.max_colwidth', 200):
    display(health.loc[health['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [158]:
# Manual fix for the UHC service coverage index: set both sub-topic columns
# at once (the two values line up with the two listed columns).
health.loc[
    health["Code"].eq("SH.UHC.SRVS.CV.XD"),
    ["SubTopic2", "SubTopic3"],
] = ["Service Coverage", "Index"]

In [167]:
# Manual SubTopic3 labels for the UHC indicators whose threshold is set by hand.
# Plain strings are assigned instead of one-element lists: a scalar is broadcast
# to however many rows match the code, while a list value has to line up with the
# selection and could leave list objects in the column (which the has_list sanity
# checks in this notebook are looking for).
health.loc[health["Code"] == "SH.UHC.NOP2.ZS", "SubTopic3"] = "3.65(2017 PPP) poverty line"
health.loc[health["Code"] == "SH.UHC.NOP1.ZS", "SubTopic3"] = "2.15(2017 PPP) poverty line"
health.loc[health["Code"] == "SH.UHC.OOPC.10.ZS", "SubTopic3"] = "10% of household consumption or income"
health.loc[health["Code"] == "SH.UHC.OOPC.25.ZS", "SubTopic3"] = "25% of household consumption or income"

In [165]:
# Assign SubTopic2 for the UHC (SH.UHC.*) indicators from a token found in the code.
# Rules are applied in the listed order; Code itself is never modified, so the
# SH.UHC prefix mask can be computed once.
uhc_rows = health['Code'].str.startswith('SH.UHC')
uhc_subtopic2_rules = [
    ('NOP1|NOP2|NOPR', "Poverty Line (pushed below)"),
    ('FBP1|FBP2|FBPR', "Poverty Line (pushed further below)"),
    ('TOT1|TOT2|TOTR', "Poverty Line (pushed or further pushed below)"),
    ('OOPC', "Poverty Line (spending more)"),
]
for code_pattern, subtopic2_label in uhc_subtopic2_rules:
    matches = uhc_rows & health['Code'].str.contains(code_pattern, regex=True)
    health.loc[matches, "SubTopic2"] = subtopic2_label

In [163]:
# Lookup from the 3rd dot-separated token of an indicator code to its SubTopic3 label.
health_subtopic3_mapping = dict(
    NOPR="60% median consumption poverty line",
    FBP1="2.15(2017 PPP) poverty line",
    FBP2="3.65(2017 PPP) poverty line",
    TOT1="2.15(2017 PPP) poverty line",
    TOT2="3.65(2017 PPP) poverty line",
    TOTR="60% median consumption poverty line",
    FBPR="60% median consumption poverty line",
)

# Only rows whose SubTopic3 is still missing are written; the right-hand Series is
# aligned on the index, and tokens absent from the lookup map to NaN (row stays empty).
health.loc[health["SubTopic3"].isna(), "SubTopic3"] = (
    health["Code"]
    .str.split(".")
    .str.get(2)
    .map(health_subtopic3_mapping)
)

In [168]:
# Preview the UHC (SH.UHC.*) rows to check the sub-topic assignments.
health.loc[health['Code'].str.startswith('SH.UHC')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
236,SH.UHC.SRVS.CV.XD,UHC service coverage index,Health: Universal Health Coverage,Health,Universal Health Coverage,Service Coverage,Index
237,SH.UHC.NOP2.ZS,Proportion of population pushed below the $3.6...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed below),3.65(2017 PPP) poverty line
238,SH.UHC.NOPR.ZS,Proportion of population pushed below the 60% ...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed below),60% median consumption poverty line
239,SH.UHC.FBP1.ZS,Proportion of population pushed further below ...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed further below),2.15(2017 PPP) poverty line
240,SH.UHC.FBP2.ZS,Proportion of population pushed further below ...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed further below),3.65(2017 PPP) poverty line
241,SH.UHC.TOT1.ZS,Proportion of population pushed or further pus...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed or further pushed below),2.15(2017 PPP) poverty line
242,SH.UHC.TOT2.ZS,Proportion of population pushed or further pus...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed or further pushed below),3.65(2017 PPP) poverty line
243,SH.UHC.TOTR.ZS,Proportion of population pushed or further pus...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed or further pushed below),60% median consumption poverty line
244,SH.UHC.OOPC.25.ZS,Proportion of population spending more than 25...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (spending more),25% of household consumption or income
245,SH.UHC.FBPR.ZS,Proportion of population pushed further below ...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed further below),60% median consumption poverty line


In [169]:
# List health rows that still have no SubTopic3, with wide text columns.
with pd.option_context('display.max_colwidth', 200):
    display(health.loc[health['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
1,SH.STA.SMSS.RU.ZS,"People using safely managed sanitation services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Stationary,
3,SH.STA.SMSS.ZS,People using safely managed sanitation services (% of population),Health: Disease prevention,Health,Disease prevention,Stationary,
4,SH.H2O.SMDW.RU.ZS,"People using safely managed drinking water services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Water,
5,SH.STA.BASS.UR.ZS,"People using at least basic sanitation services, urban (% of urban population)",Health: Disease prevention,Health,Disease prevention,Stationary,
6,SH.STA.BASS.RU.ZS,"People using at least basic sanitation services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Stationary,
7,SH.STA.BASS.ZS,People using at least basic sanitation services (% of population),Health: Disease prevention,Health,Disease prevention,Stationary,
13,SH.STA.HYGN.UR.ZS,"People with basic handwashing facilities including soap and water, urban (% of urban population)",Health: Disease prevention,Health,Disease prevention,Stationary,
14,SH.STA.HYGN.ZS,People with basic handwashing facilities including soap and water (% of population),Health: Disease prevention,Health,Disease prevention,Stationary,
16,SH.STA.SMSS.UR.ZS,"People using safely managed sanitation services, urban (% of urban population)",Health: Disease prevention,Health,Disease prevention,Stationary,
18,SH.H2O.SMDW.ZS,People using safely managed drinking water services (% of population),Health: Disease prevention,Health,Disease prevention,Water,


In [148]:
# NOTE(review): this cell repeats the gender SubTopic3 fill that already appears in
# the Gender section; it sits inside the Health section and looks like a stale copy.
# The assignment is idempotent, so running it again does not change the result.
for age_15_49_code in ("SG.VAW.1549.ZS", "SG.DMK.SRCR.FN.ZS"):
    gender.loc[gender["Code"] == age_15_49_code, "SubTopic3"] = "Age 15-49"

In [149]:
# NOTE(review): same display cell as in the Gender section, repeated inside the
# Health section — looks like a stale copy.
# Renders the gender sheet with text columns widened to 200 characters.
with pd.option_context('display.max_colwidth', 200):

    display(gender)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SP.M18.2024.FE.ZS,Women who were first married by age 18 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 18,Age 20-24
1,SP.M15.2024.FE.ZS,Women who were first married by age 15 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 15,Age 20-24
2,SG.VAW.REAS.ZS,Women who believe a husband is justified in beating his wife (any of five reasons) (%),Gender: Health,Gender,Health,Violence against women,Reason
3,SG.VAW.ARGU.ZS,Women who believe a husband is justified in beating his wife when she argues with him (%),Gender: Health,Gender,Health,Violence against women,Argue
4,SG.VAW.1549.ZS,Proportion of women subjected to physical and/or sexual violence in the last 12 months (% of ever-partnered women ages 15-49),Gender: Health,Gender,Health,Violence against women,Age 15-49
5,SG.VAW.BURN.ZS,Women who believe a husband is justified in beating his wife when she burns the food (%),Gender: Health,Gender,Health,Violence against women,Burns food
6,SG.VAW.GOES.ZS,Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Gender: Health,Gender,Health,Violence against women,Goes out
7,SG.VAW.NEGL.ZS,Women who believe a husband is justified in beating his wife when she neglects the children (%),Gender: Health,Gender,Health,Violence against women,Neglects children
8,SG.VAW.REFU.ZS,Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Gender: Health,Gender,Health,Violence against women,Refuse
9,SG.TIM.UWRK.MA,"Proportion of time spent on unpaid domestic and care work, male (% of 24 hour day)",Gender: Participation & access,Gender,Participation & access,Time,Unpaid work


In [150]:
# NOTE(review): same list sanity check as in the Gender section, repeated inside
# the Health section — looks like a stale copy.
# Flags rows where SubTopic2 or SubTopic3 holds a Python list instead of a plain value.
has_list = gender.apply(
    lambda row: any(isinstance(row[col], list) for col in ('SubTopic2', 'SubTopic3')),
    axis=1,
)

# An empty frame printed here means no list values were found.
gender_with_lists = gender[has_list]
print(gender_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []
