In [1]:
import numpy as np 
import pandas as pd
import re

In [2]:
def clean_text(text):
    if isinstance(text, str): 
        return re.sub(r'\s+', ' ', text).strip()
    return text

In [3]:
def summary(df):
    print(f'data shape: {df.shape}')  
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values 
    summ['%missing'] = df.isnull().sum().values / len(df)
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
#     summ['min'] = desc['min'].values
#     summ['max'] = desc['max'].values
    return summ

In [4]:
def select_second_item(value):
    if isinstance(value, list) and len(value) > 1:
        return value[1]  
    return value 

def select_first_item(value):
    if isinstance(value, list) and len(value) > 1:
        return value[0]  
    return value 

In [5]:
file_path = '../data/hierarchy/hierarchy_topic_by_sheet.xlsx' 
xls = pd.ExcelFile(file_path)


dfs = {}

for sheet_name in xls.sheet_names:
    first_word = sheet_name.split()[0].lower() 
    df = pd.read_excel(xls, sheet_name=sheet_name)  
    dfs[first_word] = df  


for name, df in dfs.items():
    print(f"{name}")

economic
education
environment
financial
gender
health
infrastructure
poverty
private
public
social
trade


In [6]:
dfs['education']

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SE.PRM.OENR.ZS,"Over-age students, primary (% of enrollment)",Education: Efficiency,Education,Efficiency,,
1,SE.PRM.OENR.FE.ZS,"Over-age students, primary, female (% of femal...",Education: Efficiency,Education,Efficiency,,
2,SE.PRM.OENR.MA.ZS,"Over-age students, primary, male (% of male en...",Education: Efficiency,Education,Efficiency,,
3,SE.PRM.REPT.FE.ZS,"Repeaters, primary, female (% of female enroll...",Education: Efficiency,Education,Efficiency,,
4,SE.PRM.REPT.MA.ZS,"Repeaters, primary, male (% of male enrollment)",Education: Efficiency,Education,Efficiency,,
...,...,...,...,...,...,...,...
151,SE.PRM.TENR.FE,"Adjusted net enrollment rate, primary, female ...",Education: Participation,Education,Participation,,
152,SE.PRM.ENRR,"School enrollment, primary (% gross)",Education: Participation,Education,Participation,,
153,SE.SEC.ENRR,"School enrollment, secondary (% gross)",Education: Participation,Education,Participation,,
154,SE.TER.ENRR,"School enrollment, tertiary (% gross)",Education: Participation,Education,Participation,,


In [7]:
example_education = dfs['education'].copy()

In [8]:
example_education['code3'] = example_education['Code'].str.split('.').str[:3].str.join('.')

In [9]:
grouped_counts = example_education['code3'].value_counts()
grouped_counts.head()

code3
SE.TER.CUAT    12
SE.SEC.CUAT     9
SE.LPV.PRIM     9
SE.SEC.TCAQ     9
SE.SEC.ENRL     9
Name: count, dtype: int64

In [10]:

with pd.option_context('display.max_colwidth', 200):

    display(example_education[example_education['code3'] == 'SE.TER.CUAT'])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,code3
83,SE.TER.CUAT.ST.FE.ZS,"Educational attainment, at least completed short-cycle tertiary, population 25+, female (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
84,SE.TER.CUAT.ST.MA.ZS,"Educational attainment, at least completed short-cycle tertiary, population 25+, male (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
85,SE.TER.CUAT.ST.ZS,"Educational attainment, at least completed short-cycle tertiary, population 25+, total (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
88,SE.TER.CUAT.MS.FE.ZS,"Educational attainment, at least Master's or equivalent, population 25+, female (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
89,SE.TER.CUAT.MS.MA.ZS,"Educational attainment, at least Master's or equivalent, population 25+, male (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
90,SE.TER.CUAT.MS.ZS,"Educational attainment, at least Master's or equivalent, population 25+, total (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
91,SE.TER.CUAT.DO.FE.ZS,"Educational attainment, Doctoral or equivalent, population 25+, female (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
92,SE.TER.CUAT.DO.MA.ZS,"Educational attainment, Doctoral or equivalent, population 25+, male (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
94,SE.TER.CUAT.DO.ZS,"Educational attainment, Doctoral or equivalent, population 25+, total (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT
96,SE.TER.CUAT.BA.ZS,"Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative)",Education: Outcomes,Education,Outcomes,,,SE.TER.CUAT


In this eample of education outcomes, we observe that indicator codes beginning with **SE.TER.CUAT** can be further subdivided to populate relevant subtopics.  

To achieve this, we will:  
1. **Read the WDI Indicators file** to examine the full list of codes and their corresponding descriptions.  
2. **Identify patterns** within the codes that allow for classification into subtopics.  
3. **Extract meaningful subtopic information** from the code structure and descriptions.  

By implementing this approach, we aim to enhance the granularity of our dataset and improve analytical insights into education outcomes.  

In [11]:
wdi_codes = pd.read_excel('../data/WDI_Indicators.xlsx', sheet_name="Coding")

In [12]:
wdi_codes

Unnamed: 0,Topic,Topic description,General subject,General subject description,Specific subject,Specific subject description,Extensions,Extensions description
0,AG,Agriculture,ACS,Access,0003,Age 0-3,05,2005 (PPP)
1,BG,Balance of payments: gross,ADJ,Adjusted savings,0004,Age 0-4,10,Decile
2,BM,"Balance of payments: imports, payments (credit)",ADM,Admission,0014,Age 0-14,14,Age 0-14
3,BN,Balance of payments: net,ADO,Adolescent,0324,Age 0-24,20,Quintile
4,BX,"Balance of payments: exports, receipts (debit)",ADT,Adult,0306,Age 3-6,90,% changes since 1990
...,...,...,...,...,...,...,...,...
1057,,,,,XOKA,Excluding official capital transfers,,
1058,,,,,XPND,Expenditure,,
1059,,,,,XPRT,Exports,,
1060,,,,,XTHR,Other manufactures (trade),,


### Understanding the Code Structure using `wdi_codes`

Based on our analysis, we have identified a general pattern in the structure of the codes formatted as **XX.XX.XX...**. While this does not apply to all cases, the common breakdown is as follows:

1. **Topic Column**: Corresponds to the `Topic` and `SubTopic1` of the dataset as the values for that column are **name : name**.
2. **General subject Column**: Represents the `SubTopic2`.
3. **Specific subject Column**: Defines the `SubTopic3`.
4. **Extension (the rest)**: Any additional components following the third part can be considered as **Extensions**, which provide further granularity or variations within the specific subject. Note we dont have a column for taht yet and we will find a way to add informations of extension if they are important to our work

By leveraging this structure, we can systematically extract hierarchical information from the codes and use it for improved classification and mapping in our analysis.


We will delete the topic and topic description as we already has the topic metadata filled

In [13]:
wdi_codes.drop(columns=['Topic', 'Topic description'], inplace=True)
wdi_codes.head()

Unnamed: 0,General subject,General subject description,Specific subject,Specific subject description,Extensions,Extensions description
0,ACS,Access,3,Age 0-3,5,2005 (PPP)
1,ADJ,Adjusted savings,4,Age 0-4,10,Decile
2,ADM,Admission,14,Age 0-14,14,Age 0-14
3,ADO,Adolescent,324,Age 0-24,20,Quintile
4,ADT,Adult,306,Age 3-6,90,% changes since 1990


In [14]:
for col in ['General subject', 'Specific subject', 'Extensions']:
    redundant = wdi_codes[col].dropna()[wdi_codes[col].dropna().duplicated()]
    if not redundant.empty:
        print(f"Redundant values in column '{col}':")
        print(redundant)
    else:
        print(f"No redundant values in column '{col}'.")


No redundant values in column 'General subject'.
No redundant values in column 'Specific subject'.
No redundant values in column 'Extensions'.


In [15]:
import pandas as pd

wdi_codes['has_match'] = wdi_codes.apply(
    lambda row: (
        (pd.notna(row['General subject']) and row['General subject'] in [row['Specific subject'], row['Extensions']]) or
        (pd.notna(row['Specific subject']) and row['Specific subject'] in [row['General subject'], row['Extensions']]) or
        (pd.notna(row['Extensions']) and row['Extensions'] in [row['General subject'], row['Specific subject']])
    ), axis=1
)

matching_rows = wdi_codes[wdi_codes['has_match']]
print("Rows with matching values between columns:")
print(matching_rows[['General subject', 'Specific subject', 'Extensions']])


wdi_codes.drop(columns=['has_match'], inplace=True)


Rows with matching values between columns:
Empty DataFrame
Columns: [General subject, Specific subject, Extensions]
Index: []


In [16]:
# df_melted = pd.melt(wdi_codes, 
#                     value_vars=['General subject', 'Specific subject', 'Extensions'], 
#                     var_name='Key_Type', value_name='Key')

# df_melted['Value'] = pd.melt(wdi_codes, 
#                              value_vars=['General subject description', 'Specific subject description', 'Extensions description'], 
#                              var_name='Value_Type', value_name='Value')['Value']

# df_melted.dropna(subset=['Key', 'Value'], how='any', inplace=True)


In [17]:
# df_melted

## Testing in the example of education `example_education`

In [18]:
print(example_education['SubTopic2'].notna().any())
print(example_education['SubTopic3'].notna().any())

False
False


# This is the main function we will be using to assign the subtopics

In [19]:
def extract_and_match(row, df2):

    parts = row['Code'].split('.')
    if len(parts) < 3:
        first_desc, second_desc = None, None
    else:
        first_desc = df2.loc[df2['General subject'] == parts[1], 'General subject description'].values
        second_desc = df2.loc[df2['Specific subject'] == parts[2], 'Specific subject description'].values

        first_desc = first_desc[0] if first_desc.size > 0 else None
        second_desc = second_desc[0] if second_desc.size > 0 else None


    if pd.notna(row['SubTopic2']):


        if first_desc != None and row['SubTopic2'] != first_desc: 

            row['SubTopic2'] = [row['SubTopic2'], first_desc]            
    else:

        row['SubTopic2'] = first_desc

    if pd.notna(row['SubTopic3']):

        if row['SubTopic3'] != second_desc:
            row['SubTopic3'] = [row['SubTopic3'], second_desc]
    else:

        row['SubTopic3'] = second_desc

    return row



In [20]:
example_education = example_education.apply(extract_and_match, axis=1, df2=wdi_codes)

In [21]:
example_education

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,code3
0,SE.PRM.OENR.ZS,"Over-age students, primary (% of enrollment)",Education: Efficiency,Education,Efficiency,Primary education,Over-age enrollment,SE.PRM.OENR
1,SE.PRM.OENR.FE.ZS,"Over-age students, primary, female (% of femal...",Education: Efficiency,Education,Efficiency,Primary education,Over-age enrollment,SE.PRM.OENR
2,SE.PRM.OENR.MA.ZS,"Over-age students, primary, male (% of male en...",Education: Efficiency,Education,Efficiency,Primary education,Over-age enrollment,SE.PRM.OENR
3,SE.PRM.REPT.FE.ZS,"Repeaters, primary, female (% of female enroll...",Education: Efficiency,Education,Efficiency,Primary education,Repeaters,SE.PRM.REPT
4,SE.PRM.REPT.MA.ZS,"Repeaters, primary, male (% of male enrollment)",Education: Efficiency,Education,Efficiency,Primary education,Repeaters,SE.PRM.REPT
...,...,...,...,...,...,...,...,...
151,SE.PRM.TENR.FE,"Adjusted net enrollment rate, primary, female ...",Education: Participation,Education,Participation,Primary education,Total enrollment,SE.PRM.TENR
152,SE.PRM.ENRR,"School enrollment, primary (% gross)",Education: Participation,Education,Participation,Primary education,Enrolment rate,SE.PRM.ENRR
153,SE.SEC.ENRR,"School enrollment, secondary (% gross)",Education: Participation,Education,Participation,Secondary education,Enrolment rate,SE.SEC.ENRR
154,SE.TER.ENRR,"School enrollment, tertiary (% gross)",Education: Participation,Education,Participation,Tertiary,Enrolment rate,SE.TER.ENRR


In [22]:
summary(example_education).style.background_gradient(cmap='Blues')

data shape: (156, 8)


Unnamed: 0,data type,#missing,%missing,#unique
Code,object,0,0.0,156
Indicator Name,object,0,0.0,156
General Topic,object,0,0.0,4
Topic,object,0,0.0,1
SubTopic1,object,0,0.0,4
SubTopic2,object,9,0.057692,8
SubTopic3,object,0,0.0,30
code3,object,0,0.0,54


In [23]:
with pd.option_context('display.max_colwidth', 200):

    display(example_education[example_education['SubTopic2'].isna()])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,code3
68,SE.LPV.PRIM.MA,Learning poverty: Share of Male Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
69,SE.LPV.PRIM.FE,Learning poverty: Share of Female Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
71,SE.LPV.PRIM,Learning poverty: Share of Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
73,SE.LPV.PRIM.LD.FE,Female pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
74,SE.LPV.PRIM.SD.FE,Female primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
76,SE.LPV.PRIM.LD,Pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
97,SE.LPV.PRIM.SD,Primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
106,SE.LPV.PRIM.LD.MA,Male pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM
107,SE.LPV.PRIM.SD.MA,Male primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education,SE.LPV.PRIM


In [24]:
example_education['SubTopic2'] = example_education['SubTopic2'].fillna('Low Poverty Vulnerability')
summary(example_education).style.background_gradient(cmap='Blues')

data shape: (156, 8)


Unnamed: 0,data type,#missing,%missing,#unique
Code,object,0,0.0,156
Indicator Name,object,0,0.0,156
General Topic,object,0,0.0,4
Topic,object,0,0.0,1
SubTopic1,object,0,0.0,4
SubTopic2,object,0,0.0,9
SubTopic3,object,0,0.0,30
code3,object,0,0.0,54


## In this example
As we can see, we were able to extend the granularity and hierarchy of the education metadata.

There are missing values for subtopic1, and after further research, we can impute these with the value **LPV = 'Low Poverty Vulnerability'**.

For now, this will be done manually for each sheet.

## We apply our function to the entire df

In [25]:
for key, df in dfs.items():

    df = df.apply(extract_and_match, axis=1, df2=wdi_codes)  
    dfs[key] = df  


In [26]:
for key, df in dfs.items():

    missing_subtopic2 = df['SubTopic2'].isnull().sum()
    missing_subtopic3 = df['SubTopic3'].isnull().sum()
    

    print(f"\n{'*' * 10} {key} {'*' * 10}")
    print(f"Missing subtopic2: {missing_subtopic2}")
    print(f"Missing subtopic3: {missing_subtopic3}")
    print("-" * 40)  


********** economic **********
Missing subtopic2: 0
Missing subtopic3: 15
----------------------------------------

********** education **********
Missing subtopic2: 9
Missing subtopic3: 0
----------------------------------------

********** environment **********
Missing subtopic2: 43
Missing subtopic3: 44
----------------------------------------

********** financial **********
Missing subtopic2: 0
Missing subtopic3: 0
----------------------------------------

********** gender **********
Missing subtopic2: 1
Missing subtopic3: 2
----------------------------------------

********** health **********
Missing subtopic2: 12
Missing subtopic3: 45
----------------------------------------

********** infrastructure **********
Missing subtopic2: 0
Missing subtopic3: 0
----------------------------------------

********** poverty **********
Missing subtopic2: 0
Missing subtopic3: 9
----------------------------------------

********** private **********
Missing subtopic2: 6
Missing subtopic3

## Another porblem when looking into the code description in wdi idnicators data there is some code parts that can describe both things as PO : Post (or plus other)

### This will cause a problem as the some subtiopics will have 2 descriptions seperated by (or)

In [27]:
#EXAMPLE FOR THE PROBLEM
with pd.option_context('display.max_colwidth', 200):
    display(dfs["economic"]["SubTopic2"][dfs["economic"]['SubTopic2'].str.contains(r'\(or', na=False)].iloc[0])


'Consumption (or condom)'

### For now i think the best way is to go throught each individually and handle missing and the different values in "list" subtopics (chatgpt + wdi documentation)

# Economic

In [28]:
economic = dfs["economic"]

**Notice we dont create a copy as we want the changes to reflect to dfs["economic"].**

In [29]:
with pd.option_context('display.max_colwidth', 200):

    display(economic[economic['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
284,DT.NFL.CERF.CD,"Net official flows from UN agencies, CERF (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
303,DC.DAC.LTUL.CD,"Net bilateral aid flows from DAC donors, Lithuania (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Development Assistance Committee (OECD),
309,DC.DAC.HUNL.CD,"Net bilateral aid flows from DAC donors, Hungary (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Development Assistance Committee (OECD),
315,DC.DAC.ESTL.CD,"Net bilateral aid flows from DAC donors, Estonia (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Development Assistance Committee (OECD),
321,DT.NFL.SDGF.CD,"Net official flows from UN agencies, SDGFUND (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
322,DT.NFL.SPRP.CD,"Net official flows from UN agencies, SPRP (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
324,DT.NFL.UNCD.CD,"Net official flows from UN agencies, UNCDF (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
325,DT.NFL.UNCV.CD,"Net official flows from UN agencies, UNCOVID (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
326,DT.NFL.UNCTAD.CD,"Net official flows from UN agencies, UNCTAD (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,
329,DT.NFL.UNEP.CD,"Net official flows from UN agencies, UNEP (current US$)",Economic Policy & Debt: Official development assistance,Economic Policy & Debt,Official development assistance,Net flows,


In [30]:
economic_mapping = {
    "WITC": "World Integrated Trade Solution",
    "LTUL": "Long-Term Unemployment Level",
    "CERF": "Central Emergency Response Fund",
    "UNCTAD": "United Nations Conference on Trade and Development",
    "UNEP": "United Nations Environment Programme",
    "UNID": "United Nations Industrial Development Organization",
    "UNIDO": "United Nations Industrial Development Organization",
    "UNWN": "United Nations World Network (Unconfirmed)",
    "UNCV": "United Nations Convention",
    "SPRP": "Social Protection and Resilience Programs",
    "HUNL": "Hunger Levels (Unconfirmed)",
    "ESTL": "Estimated Labor Statistics",
    "SDGF": "Sustainable Development Goals Fund",
    "UNWT": "United Nations World Tourism Organization",
    "UNCD": "United Nations Conference on Disarmament"


}

economic.loc[economic["SubTopic3"].isna(), "SubTopic3"] = (
    economic["Code"].str.split(".").str[2].map(economic_mapping)
)


In [31]:
with pd.option_context('display.max_colwidth', 200):

    display(economic[economic['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [32]:
def get_code_prefix(code):
    return ".".join(code.split(".")[:2])

filtered = economic[
    economic['SubTopic2'].apply(lambda x: isinstance(x, list)) | 
    economic['SubTopic3'].apply(lambda x: isinstance(x, list))
]

filtered = filtered.copy()
filtered.loc[:, "CodeGroup"] = filtered["Code"].apply(get_code_prefix)


grouped = filtered.groupby("CodeGroup")



In [33]:
group_keys = list(grouped.groups.keys())

index = 12  

selected_group_key = group_keys[index]
selected_group_df = grouped.get_group(selected_group_key)


with pd.option_context('display.max_colwidth', 200):

    display(selected_group_df)


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,CodeGroup
16,BX.GSR.MRCH.CD,"Goods exports (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Goods (merchandise)]",BX.GSR
17,BX.GSR.TOTL.CD,"Exports of goods, services and primary income (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Total]",BX.GSR
18,BX.GSR.GNFS.CD,"Exports of goods and services (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Goods and (nonfactor) services]",BX.GSR
20,BX.GSR.FCTY.CD,"Primary income receipts (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Factor income]",BX.GSR
22,BX.GSR.INSF.ZS,"Insurance and financial services (% of service exports, BoP)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Insurance and financial services]",BX.GSR
26,BX.GSR.NFSV.CD,"Service exports (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, (Nonfactor) services]",BX.GSR
28,BX.GSR.CMCP.ZS,"Communications, computer, etc. (% of service exports, BoP)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Communications, computer, etc.]",BX.GSR
30,BX.GSR.TRAN.ZS,"Transport services (% of service exports, BoP)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Transport]",BX.GSR
32,BX.GSR.TRVL.ZS,"Travel services (% of service exports, BoP)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Travel]",BX.GSR
34,BX.GSR.ROYL.CD,"Charges for the use of intellectual property, receipts (BoP, current US$)","Economic Policy & Debt: Balance of payments: Current account: Goods, services & income",Economic Policy & Debt,Balance of payments,"[Current account, Goods and services]","[Goods, services & income, Royalty and license fees]",BX.GSR


### For the various examples we analyzed, we concluded that selecting the values added from the WDI (World Development Indicators) dataset provides a more accurate representation. These values offer a finer level of granularity, enhancing our ability to analyze and interpret the data effectively.

In [34]:
economic["SubTopic2"] = economic["SubTopic2"].apply(select_second_item)
economic["SubTopic3"] = economic["SubTopic3"].apply(select_second_item)

## handling the (text (or text))

In [35]:

with pd.option_context('display.max_colwidth', 200):
    display(economic[economic['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
8,BN.RES.INCL.CD,"Reserves and related items (BoP, current US$)",Economic Policy & Debt: Balance of payments: Capital & financial account,Economic Policy & Debt,Balance of payments,Reserves (or residential),Including
45,FI.RES.TOTL.CD,"Total reserves (includes gold, current US$)",Economic Policy & Debt: Balance of payments: Reserves & other items,Economic Policy & Debt,Balance of payments,Reserves (or residential),Total
46,FI.RES.XGLD.CD,Total reserves minus gold (current US$),Economic Policy & Debt: Balance of payments: Reserves & other items,Economic Policy & Debt,Balance of payments,Reserves (or residential),Excluding gold
47,FI.RES.TOTL.MO,Total reserves in months of imports,Economic Policy & Debt: Balance of payments: Reserves & other items,Economic Policy & Debt,Balance of payments,Reserves (or residential),Total
49,FI.RES.TOTL.DT.ZS,Total reserves (% of total external debt),Economic Policy & Debt: Balance of payments: Reserves & other items,Economic Policy & Debt,Balance of payments,Reserves (or residential),Total
129,NE.CON.GOVT.KD.ZG,General government final consumption expenditure (annual % growth),Economic Policy & Debt: National accounts: Growth rates,Economic Policy & Debt,National accounts,Consumption (or condom),Government
130,NE.CON.PRVT.PC.KD.ZG,Household final consumption expenditure per capita growth (annual %),Economic Policy & Debt: National accounts: Growth rates,Economic Policy & Debt,National accounts,Consumption (or condom),Private
131,NE.CON.PRVT.KD.ZG,Household and NPISHs Final consumption expenditure (annual % growth),Economic Policy & Debt: National accounts: Growth rates,Economic Policy & Debt,National accounts,Consumption (or condom),Private
135,NE.CON.TOTL.KD.ZG,Final consumption expenditure (annual % growth),Economic Policy & Debt: National accounts: Growth rates,Economic Policy & Debt,National accounts,Consumption (or condom),Total
136,NV.IND.TOTL.KD.ZG,"Industry (including construction), value added (annual % growth)",Economic Policy & Debt: National accounts: Growth rates,Economic Policy & Debt,National accounts,Industry (or interest due) (or index),Total


In [36]:
economic.loc[economic['Code'].str.startswith('BN.RES') , "SubTopic2"] = "Reserves"
economic.loc[economic['Code'].str.startswith('FI.RES') , "SubTopic2"] = "Reserves"
economic.loc[economic['Code'].str.startswith('NE.CON'), "SubTopic2"] = "Consumption"
economic.loc[economic['Code'].str.startswith('NV.IND'), "SubTopic2"] = "Industry"
economic.loc[economic['Code'].str.startswith('NY.TAX'), "SubTopic2"] = "Tax revenue (or tax related)"



In [37]:
with pd.option_context('display.max_colwidth', 200):
    display(economic[economic['SubTopic3'].str.contains(r'\(or', na=False) ])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# Education

In [38]:
education = dfs['education']

In [39]:
with pd.option_context('display.max_colwidth', 200):

    display(education[education['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
68,SE.LPV.PRIM.MA,Learning poverty: Share of Male Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education
69,SE.LPV.PRIM.FE,Learning poverty: Share of Female Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education
71,SE.LPV.PRIM,Learning poverty: Share of Children at the End-of-Primary age below minimum reading proficiency adjusted by Out-of-School Children (%),Education: Outcomes,Education,Outcomes,,Primary education
73,SE.LPV.PRIM.LD.FE,Female pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education
74,SE.LPV.PRIM.SD.FE,Female primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education
76,SE.LPV.PRIM.LD,Pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education
97,SE.LPV.PRIM.SD,Primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education
106,SE.LPV.PRIM.LD.MA,Male pupils below minimum reading proficiency at end of primary (%). Low GAML threshold,Education: Outcomes,Education,Outcomes,,Primary education
107,SE.LPV.PRIM.SD.MA,Male primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,,Primary education


In [40]:
education['SubTopic2'] = education['SubTopic2'].fillna('Learning poverty')

In [41]:
with pd.option_context('display.max_colwidth', 200):

    display(education[education['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [42]:
has_list = education.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


education_with_lists = education[has_list]
print(education_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


## handling the (text (or text))

In [43]:

with pd.option_context('display.max_colwidth', 200):
    display(education[education['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
75,SE.COM.DURS,"Compulsory education, duration (years)",Education: Outcomes,Education,Outcomes,Commitments (or compulsory),Duration


In [44]:
education.loc[education['Code'].str.startswith('SE.COM') , "SubTopic2"] = "Compulsory"

In [45]:
with pd.option_context('display.max_colwidth', 200):
    display(education[education['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# More adjustments (personal)

In [None]:
education.loc[education['Code'].str.startswith('SE.ADT') , "SubTopic2"] = "Literacy rate"
education.loc[education['Code'].str.startswith('SE.ADT.1524') , "SubTopic3"] = "Youth (ages 15-24)"
education.loc[education['Code'].str.startswith('SE.ADT.LITR') , "SubTopic3"] = "Adult (ages 15 and above)"

# Environment 

In [46]:
environment = dfs["environment"]

In [47]:
with pd.option_context('display.max_colwidth', 200):

    display(environment[(environment['SubTopic2'].isna())|(environment['SubTopic3'].isna())])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
66,ER.H2O.FWST.ZS,Level of water stress: freshwater withdrawal as a proportion of available freshwater resources,Environment: Freshwater,Environment,Freshwater,Water,
71,EN.GHG.CH4.TR.MT.CE.AR5,Methane (CH4) emissions from Transport (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
72,EN.GHG.CH4.PI.MT.CE.AR5,Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
73,EN.GHG.CH4.WA.MT.CE.AR5,Methane (CH4) emissions from Waste (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
74,EN.GHG.CH4.IP.MT.CE.AR5,Methane (CH4) emissions from Industrial Processes (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
75,EN.GHG.N2O.AG.MT.CE.AR5,Nitrous oxide (N2O) emissions from Agriculture (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
76,EN.GHG.CH4.IC.MT.CE.AR5,Methane (CH4) emissions from Industrial Combustion (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
77,EN.GHG.CH4.AG.MT.CE.AR5,Methane (CH4) emissions from Agriculture (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
78,EN.GHG.CH4.MT.CE.AR5,Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,
79,EN.GHG.N2O.WA.MT.CE.AR5,Nitrous oxide (N2O) emissions from Waste (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,,


In [48]:
# environment_mapping_subtopic3 = {
#     "FWST": "freshwater withdrawal"
# }

In [49]:
environment.loc[environment["Code"] == "ER.H2O.FWST.ZS", "SubTopic3"] = "Water stress: freshwater withdrawal"


In [50]:
environment_mapping = {
    "ALL": "All Greenhouse Gases",
    "FGAS": "Fluorinated Gases emissions",
    "CH4": "Methane emissions",
    "N2O": "Nitrous oxide emissions",
    "CO2": "Carbon dioxide emissions",
    "TOT": "Total"
}

environment.loc[environment["SubTopic2"].isna(), "SubTopic2"] = (
    environment["Code"].str.split(".").str[2].map(environment_mapping)
)


In [51]:
with pd.option_context('display.max_colwidth', 200):

    display(environment[environment['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [52]:
with pd.option_context('display.max_colwidth', 200):

    display(environment[environment['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
71,EN.GHG.CH4.TR.MT.CE.AR5,Methane (CH4) emissions from Transport (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
72,EN.GHG.CH4.PI.MT.CE.AR5,Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
73,EN.GHG.CH4.WA.MT.CE.AR5,Methane (CH4) emissions from Waste (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
74,EN.GHG.CH4.IP.MT.CE.AR5,Methane (CH4) emissions from Industrial Processes (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
75,EN.GHG.N2O.AG.MT.CE.AR5,Nitrous oxide (N2O) emissions from Agriculture (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,
76,EN.GHG.CH4.IC.MT.CE.AR5,Methane (CH4) emissions from Industrial Combustion (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
77,EN.GHG.CH4.AG.MT.CE.AR5,Methane (CH4) emissions from Agriculture (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
78,EN.GHG.CH4.MT.CE.AR5,Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
79,EN.GHG.N2O.WA.MT.CE.AR5,Nitrous oxide (N2O) emissions from Waste (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,
80,EN.GHG.N2O.TR.MT.CE.AR5,Nitrous oxide (N2O) emissions from Transport (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,


In [53]:
environment_mapping_subtopic3 = {
    "LU": "Including Land Use and Land-Use Change (LULUC)",
    "RT": "Carbon intensity of GDP",
    "OS": "Other Sources",
    "OL": "Other Land Use",
    "WA": "Waste",
    "IC": "Industrial Combustion",
    "IP": "Industrial Processes",
    "BU": "Buildings",
    "AG": "Agriculture",
    "PI": "Power Industry",
    "TR": "Transport",
    "FE": "Fugitive Emissions",

}

environment.loc[environment["SubTopic3"].isna(), "SubTopic3"] = (
    environment["Code"].str.split(".").str[3].map(environment_mapping_subtopic3)
)


In [54]:
with pd.option_context('display.max_colwidth', 200):
    display(environment[environment['Code'].str.startswith('EN.GHG')])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
71,EN.GHG.CH4.TR.MT.CE.AR5,Methane (CH4) emissions from Transport (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Transport
72,EN.GHG.CH4.PI.MT.CE.AR5,Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Power Industry
73,EN.GHG.CH4.WA.MT.CE.AR5,Methane (CH4) emissions from Waste (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Waste
74,EN.GHG.CH4.IP.MT.CE.AR5,Methane (CH4) emissions from Industrial Processes (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Industrial Processes
75,EN.GHG.N2O.AG.MT.CE.AR5,Nitrous oxide (N2O) emissions from Agriculture (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,Agriculture
76,EN.GHG.CH4.IC.MT.CE.AR5,Methane (CH4) emissions from Industrial Combustion (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Industrial Combustion
77,EN.GHG.CH4.AG.MT.CE.AR5,Methane (CH4) emissions from Agriculture (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Agriculture
78,EN.GHG.CH4.MT.CE.AR5,Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,
79,EN.GHG.N2O.WA.MT.CE.AR5,Nitrous oxide (N2O) emissions from Waste (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,Waste
80,EN.GHG.N2O.TR.MT.CE.AR5,Nitrous oxide (N2O) emissions from Transport (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,Transport


In [55]:
environment.loc[environment["SubTopic3"].isna(), "SubTopic3"] = "Excluding  Land Use and Land-Use Change (LULUC)"


In [56]:
with pd.option_context('display.max_colwidth', 200):
    display(environment[environment['Code'].str.startswith('EN.GHG')])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
71,EN.GHG.CH4.TR.MT.CE.AR5,Methane (CH4) emissions from Transport (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Transport
72,EN.GHG.CH4.PI.MT.CE.AR5,Methane (CH4) emissions from Power Industry (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Power Industry
73,EN.GHG.CH4.WA.MT.CE.AR5,Methane (CH4) emissions from Waste (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Waste
74,EN.GHG.CH4.IP.MT.CE.AR5,Methane (CH4) emissions from Industrial Processes (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Industrial Processes
75,EN.GHG.N2O.AG.MT.CE.AR5,Nitrous oxide (N2O) emissions from Agriculture (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,Agriculture
76,EN.GHG.CH4.IC.MT.CE.AR5,Methane (CH4) emissions from Industrial Combustion (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Industrial Combustion
77,EN.GHG.CH4.AG.MT.CE.AR5,Methane (CH4) emissions from Agriculture (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Agriculture
78,EN.GHG.CH4.MT.CE.AR5,Methane (CH4) emissions (total) excluding LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Methane emissions,Excluding Land Use and Land-Use Change (LULUC)
79,EN.GHG.N2O.WA.MT.CE.AR5,Nitrous oxide (N2O) emissions from Waste (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,Waste
80,EN.GHG.N2O.TR.MT.CE.AR5,Nitrous oxide (N2O) emissions from Transport (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Nitrous oxide emissions,Transport


In [57]:
has_list = environment.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


environment_with_lists = environment[has_list]
print(environment_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


## handling the (text (or text))

In [58]:

with pd.option_context('display.max_colwidth', 200):
    display(environment[environment['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
4,AG.CON.FERT.ZS,Fertilizer consumption (kilograms per hectare of arable land),Environment: Agricultural production,Environment,Agricultural production,Consumption (or condom),Fertilizer
7,AG.CON.FERT.PT.ZS,Fertilizer consumption (% of fertilizer production),Environment: Agricultural production,Environment,Agricultural production,Consumption (or condom),Fertilizer
30,EN.ATM.PM25.MC.T3.ZS,"PM2.5 pollution, population exposed to levels exceeding WHO Interim Target-3 value (% of total)",Environment: Emissions,Environment,Emissions,Atmosphere (or automated teller machine),PM 2.5 air pollution
31,EN.ATM.PM25.MC.T2.ZS,"PM2.5 pollution, population exposed to levels exceeding WHO Interim Target-2 value (% of total)",Environment: Emissions,Environment,Emissions,Atmosphere (or automated teller machine),PM 2.5 air pollution
32,EN.ATM.PM25.MC.T1.ZS,"PM2.5 pollution, population exposed to levels exceeding WHO Interim Target-1 value (% of total)",Environment: Emissions,Environment,Emissions,Atmosphere (or automated teller machine),PM 2.5 air pollution
33,EN.ATM.PM25.MC.ZS,"PM2.5 air pollution, population exposed to levels exceeding WHO guideline value (% of total)",Environment: Emissions,Environment,Emissions,Atmosphere (or automated teller machine),PM 2.5 air pollution
34,EN.ATM.PM25.MC.M3,"PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)",Environment: Emissions,Environment,Emissions,Atmosphere (or automated teller machine),PM 2.5 air pollution


In [59]:
environment.loc[environment['Code'].str.startswith('AG.CON') , "SubTopic2"] = "Consumption"
environment.loc[environment['Code'].str.startswith('EN.ATM') , "SubTopic2"] = "Atmosphere"

In [60]:
with pd.option_context('display.max_colwidth', 200):
    display(environment[environment['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# Financial 

In [61]:
financial  = dfs["financial"]

In [62]:
with pd.option_context('display.max_colwidth', 200):

    display(financial[financial['SubTopic2'].isna() | financial['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [63]:
financial

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,FX.OWN.TOTL.SO.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
1,FB.CBK.DPTR.P3,"Depositors with commercial banks (per 1,000 ad...",Financial Sector: Access,Financial Sector,Access,Commercial bank,Depositors
2,FX.OWN.TOTL.PL.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
3,FX.OWN.TOTL.40.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
4,FX.OWN.TOTL.OL.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
5,FX.OWN.TOTL.MA.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
6,FX.OWN.TOTL.60.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
7,FX.OWN.TOTL.YG.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total
8,FB.CBK.BRCH.P5,"Commercial bank branches (per 100,000 adults)",Financial Sector: Access,Financial Sector,Access,Commercial bank,Bank branches
9,FX.OWN.TOTL.ZS,Account ownership at a financial institution o...,Financial Sector: Access,Financial Sector,Access,Ownership,Total


In [64]:
has_list = financial.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


financial_with_lists = financial[has_list]
print(financial_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


## handling the (text (or text))

In [65]:

with pd.option_context('display.max_colwidth', 200):
    display(financial[financial['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
14,FB.ATM.TOTL.P5,"Automated teller machines (ATMs) (per 100,000 adults)",Financial Sector: Access,Financial Sector,Access,Atmosphere (or automated teller machine),Total
27,FD.RES.LIQU.AS.ZS,Bank liquid reserves to bank assets ratio (%),Financial Sector: Assets,Financial Sector,Assets,Reserves (or residential),Liquid


In [66]:
financial.loc[financial['Code'].str.startswith('FB.ATM') , "SubTopic2"] = "Automated teller machines"
financial.loc[financial['Code'].str.startswith('FD.RES') , "SubTopic2"] = "Reserves"

In [67]:
with pd.option_context('display.max_colwidth', 200):
    display(financial[financial['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
31,CM.MKT.TRNR,"Stocks traded, turnover ratio of domestic shares (%)",Financial Sector: Capital markets,Financial Sector,Capital markets,Markets (stock),Transport receipts (or Turnover ratio)


In [68]:
financial.loc[financial['Code'].str.startswith('CM.MKT.TRNR') , "SubTopic3"] = "Turnover ratio"

# More adjustments (personal)

In [None]:
financial.loc[financial['Code'].str.contains('FM.AST') , "SubTopic2"] = "Monetary Survey"
financial.loc[financial['Code'].str.contains('FS.FS') , "SubTopic2"] = "Bank Survey"
financial.loc[financial['Code'].str.contains('FB.AST') , "SubTopic2"] = "Bank (miscellaneous)"
financial.loc[financial['Code'].str.contains('FD.AST') , "SubTopic2"] = "Deposit money banks"



# Gender 

In [69]:
gender  = dfs["gender"]

In [70]:
with pd.option_context('display.max_colwidth', 200):

    display(gender)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SP.M18.2024.FE.ZS,Women who were first married by age 18 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 18,Age 20-24
1,SP.M15.2024.FE.ZS,Women who were first married by age 15 (% of women ages 20-24),Gender: Agency,Gender,Agency,,Age 20-24
2,SG.VAW.REAS.ZS,Women who believe a husband is justified in beating his wife (any of five reasons) (%),Gender: Health,Gender,Health,Violence against women,Reason
3,SG.VAW.ARGU.ZS,Women who believe a husband is justified in beating his wife when she argues with him (%),Gender: Health,Gender,Health,Violence against women,Argue
4,SG.VAW.1549.ZS,Proportion of women subjected to physical and/or sexual violence in the last 12 months (% of ever-partnered women ages 15-49),Gender: Health,Gender,Health,Violence against women,
5,SG.VAW.BURN.ZS,Women who believe a husband is justified in beating his wife when she burns the food (%),Gender: Health,Gender,Health,Violence against women,Burns food
6,SG.VAW.GOES.ZS,Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Gender: Health,Gender,Health,Violence against women,Goes out
7,SG.VAW.NEGL.ZS,Women who believe a husband is justified in beating his wife when she neglects the children (%),Gender: Health,Gender,Health,Violence against women,Neglects children
8,SG.VAW.REFU.ZS,Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Gender: Health,Gender,Health,Violence against women,Refuse
9,SG.TIM.UWRK.MA,"Proportion of time spent on unpaid domestic and care work, male (% of 24 hour day)",Gender: Participation & access,Gender,Participation & access,Time,Unpaid work


In [71]:
with pd.option_context('display.max_colwidth', 200):

    display(gender[gender['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
1,SP.M15.2024.FE.ZS,Women who were first married by age 15 (% of women ages 20-24),Gender: Agency,Gender,Agency,,Age 20-24


In [72]:
gender.loc[gender["Code"] == "SP.M15.2024.FE.ZS", "SubTopic2"] = "Married by age 15"

In [73]:
with pd.option_context('display.max_colwidth', 200):

    display(gender[gender['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
4,SG.VAW.1549.ZS,Proportion of women subjected to physical and/or sexual violence in the last 12 months (% of ever-partnered women ages 15-49),Gender: Health,Gender,Health,Violence against women,
11,SG.DMK.SRCR.FN.ZS,"Women making their own informed decisions regarding sexual relations, contraceptive use and reproductive health care (% of women age 15-49)",Gender: Public life & decision making,Gender,Public life & decision making,Decision making,


In [74]:
gender.loc[gender["Code"] == "SG.VAW.1549.ZS", "SubTopic3"] = "Age 15-49"
gender.loc[gender["Code"] == "SG.DMK.SRCR.FN.ZS", "SubTopic3"] = "Age 15-49"

In [75]:
with pd.option_context('display.max_colwidth', 200):

    display(gender)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SP.M18.2024.FE.ZS,Women who were first married by age 18 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 18,Age 20-24
1,SP.M15.2024.FE.ZS,Women who were first married by age 15 (% of women ages 20-24),Gender: Agency,Gender,Agency,Married by age 15,Age 20-24
2,SG.VAW.REAS.ZS,Women who believe a husband is justified in beating his wife (any of five reasons) (%),Gender: Health,Gender,Health,Violence against women,Reason
3,SG.VAW.ARGU.ZS,Women who believe a husband is justified in beating his wife when she argues with him (%),Gender: Health,Gender,Health,Violence against women,Argue
4,SG.VAW.1549.ZS,Proportion of women subjected to physical and/or sexual violence in the last 12 months (% of ever-partnered women ages 15-49),Gender: Health,Gender,Health,Violence against women,Age 15-49
5,SG.VAW.BURN.ZS,Women who believe a husband is justified in beating his wife when she burns the food (%),Gender: Health,Gender,Health,Violence against women,Burns food
6,SG.VAW.GOES.ZS,Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Gender: Health,Gender,Health,Violence against women,Goes out
7,SG.VAW.NEGL.ZS,Women who believe a husband is justified in beating his wife when she neglects the children (%),Gender: Health,Gender,Health,Violence against women,Neglects children
8,SG.VAW.REFU.ZS,Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Gender: Health,Gender,Health,Violence against women,Refuse
9,SG.TIM.UWRK.MA,"Proportion of time spent on unpaid domestic and care work, male (% of 24 hour day)",Gender: Participation & access,Gender,Participation & access,Time,Unpaid work


In [76]:
has_list = gender.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


gender_with_lists = gender[has_list]
print(gender_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


## handling the (text (or text))

In [77]:

with pd.option_context('display.max_colwidth', 200):
    display(gender[gender['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [78]:
with pd.option_context('display.max_colwidth', 200):
    display(gender[gender['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# Health  

In [79]:
health  = dfs["health"]

In [80]:
with pd.option_context('display.max_colwidth', 200):

    display(health)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SH.MLR.NETS.ZS,Use of insecticide-treated bed nets (% of under-5 population),Health: Disease prevention,Health,Disease prevention,Malaria,Insecticide-treated bed nets
1,SH.TBS.CURE.ZS,Tuberculosis treatment success rate (% of new cases),Health: Disease prevention,Health,Disease prevention,Tuberculosis,Cure (treatment)
2,SH.STA.BASS.ZS,People using at least basic sanitation services (% of population),Health: Disease prevention,Health,Disease prevention,Stationary,
3,SH.STA.BASS.RU.ZS,"People using at least basic sanitation services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Stationary,
4,SH.STA.BASS.UR.ZS,"People using at least basic sanitation services, urban (% of urban population)",Health: Disease prevention,Health,Disease prevention,Stationary,
...,...,...,...,...,...,...,...
243,SH.UHC.FBP1.ZS,Proportion of population pushed further below the $2.15 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
244,SH.UHC.NOPR.ZS,Proportion of population pushed below the 60% median consumption poverty line by out-of-pocket health expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
245,SH.UHC.NOP2.ZS,Proportion of population pushed below the $3.65 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,Number of poor at $3.10 a day
246,SH.UHC.NOP1.ZS,Proportion of population pushed below the $2.15 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,Number of poor at $1.90 a day


In [81]:
with pd.option_context('display.max_colwidth', 200):

    display(health[health['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
236,SH.UHC.FBPR.ZS,Proportion of population pushed further below the 60% median consumption poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
237,SH.UHC.OOPC.25.ZS,Proportion of population spending more than 25% of household consumption or income on out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,Out-of-pocket
238,SH.UHC.OOPC.10.ZS,Proportion of population spending more than 10% of household consumption or income on out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,Out-of-pocket
239,SH.UHC.TOTR.ZS,Proportion of population pushed or further pushed below the 60% median consumption poverty line by out-of-pocket health expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
240,SH.UHC.TOT2.ZS,Proportion of population pushed or further pushed below the $3.65 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
241,SH.UHC.TOT1.ZS,Proportion of population pushed or further pushed below the $2.15 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
242,SH.UHC.FBP2.ZS,Proportion of population pushed further below the $3.65 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
243,SH.UHC.FBP1.ZS,Proportion of population pushed further below the $2.15 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
244,SH.UHC.NOPR.ZS,Proportion of population pushed below the 60% median consumption poverty line by out-of-pocket health expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,
245,SH.UHC.NOP2.ZS,Proportion of population pushed below the $3.65 ($ 2017 PPP) poverty line by out-of-pocket health care expenditure (%),Health: Universal Health Coverage,Health,Universal Health Coverage,,Number of poor at $3.10 a day


In [82]:
health.loc[health["Code"] == "SH.UHC.SRVS.CV.XD", ["SubTopic2", "SubTopic3"]] = ["Service Coverage", "Index"]

In [83]:
health.loc[health["Code"] == "SH.UHC.NOP2.ZS",  "SubTopic3"] = ["3.65(2017 PPP) poverty line"]
health.loc[health["Code"] == "SH.UHC.NOP1.ZS",  "SubTopic3"] = ["2.15(2017 PPP) poverty line"]
health.loc[health["Code"] == "SH.UHC.OOPC.10.ZS",  "SubTopic3"] = ["10% of household consumption or income"]
health.loc[health["Code"] == "SH.UHC.OOPC.25.ZS",  "SubTopic3"] = ["25% of household consumption or income"]

In [84]:
health.loc[health['Code'].str.startswith('SH.UHC') & 
           health['Code'].str.contains('NOP1|NOP2|NOPR', regex=True), "SubTopic2"] = "Poverty Line (pushed below)"

health.loc[health['Code'].str.startswith('SH.UHC') & 
           health['Code'].str.contains('FBP1|FBP2|FBPR', regex=True), "SubTopic2"] = "Poverty Line (pushed further below)"

health.loc[health['Code'].str.startswith('SH.UHC') & 
           health['Code'].str.contains('TOT1|TOT2|TOTR', regex=True), "SubTopic2"] = "Poverty Line (pushed or further pushed below)"

health.loc[health['Code'].str.startswith('SH.UHC') & 
           health['Code'].str.contains('OOPC', regex=True), "SubTopic2"] = "Poverty Line (spending more)"

In [85]:
health_subtopic3_mapping = {
    "NOPR": "60% median consumption poverty line",
    "FBP1": "2.15(2017 PPP) poverty line",
    "FBP2": "3.65(2017 PPP) poverty line",
    "TOT1": "2.15(2017 PPP) poverty line",
    "TOT2": "3.65(2017 PPP) poverty line",
    "TOTR": "60% median consumption poverty line",
    "FBPR": "60% median consumption poverty line"
}
health.loc[health["SubTopic3"].isna(), "SubTopic3"] = (
    health["Code"].str.split(".").str[2].map(health_subtopic3_mapping)
)

In [86]:
health[health['Code'].str.startswith('SH.UHC')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
236,SH.UHC.FBPR.ZS,Proportion of population pushed further below ...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed further below),60% median consumption poverty line
237,SH.UHC.OOPC.25.ZS,Proportion of population spending more than 25...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (spending more),25% of household consumption or income
238,SH.UHC.OOPC.10.ZS,Proportion of population spending more than 10...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (spending more),10% of household consumption or income
239,SH.UHC.TOTR.ZS,Proportion of population pushed or further pus...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed or further pushed below),60% median consumption poverty line
240,SH.UHC.TOT2.ZS,Proportion of population pushed or further pus...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed or further pushed below),3.65(2017 PPP) poverty line
241,SH.UHC.TOT1.ZS,Proportion of population pushed or further pus...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed or further pushed below),2.15(2017 PPP) poverty line
242,SH.UHC.FBP2.ZS,Proportion of population pushed further below ...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed further below),3.65(2017 PPP) poverty line
243,SH.UHC.FBP1.ZS,Proportion of population pushed further below ...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed further below),2.15(2017 PPP) poverty line
244,SH.UHC.NOPR.ZS,Proportion of population pushed below the 60% ...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed below),60% median consumption poverty line
245,SH.UHC.NOP2.ZS,Proportion of population pushed below the $3.6...,Health: Universal Health Coverage,Health,Universal Health Coverage,Poverty Line (pushed below),3.65(2017 PPP) poverty line


In [87]:
with pd.option_context('display.max_colwidth', 200):

    display(health[health['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
2,SH.STA.BASS.ZS,People using at least basic sanitation services (% of population),Health: Disease prevention,Health,Disease prevention,Stationary,
3,SH.STA.BASS.RU.ZS,"People using at least basic sanitation services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Stationary,
4,SH.STA.BASS.UR.ZS,"People using at least basic sanitation services, urban (% of urban population)",Health: Disease prevention,Health,Disease prevention,Stationary,
5,SH.H2O.SMDW.ZS,People using safely managed drinking water services (% of population),Health: Disease prevention,Health,Disease prevention,Water,
6,SH.H2O.SMDW.RU.ZS,"People using safely managed drinking water services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Water,
7,SH.STA.SMSS.ZS,People using safely managed sanitation services (% of population),Health: Disease prevention,Health,Disease prevention,Stationary,
8,SH.STA.SMSS.RU.ZS,"People using safely managed sanitation services, rural (% of rural population)",Health: Disease prevention,Health,Disease prevention,Stationary,
9,SH.STA.SMSS.UR.ZS,"People using safely managed sanitation services, urban (% of urban population)",Health: Disease prevention,Health,Disease prevention,Stationary,
10,SH.H2O.SMDW.UR.ZS,"People using safely managed drinking water services, urban (% of urban population)",Health: Disease prevention,Health,Disease prevention,Water,
11,SH.H2O.BASW.UR.ZS,"People using at least basic drinking water services, urban (% of urban population)",Health: Disease prevention,Health,Disease prevention,Water,


In [88]:
health.loc[health['Code'].str.startswith('SH.STA') & 
           health['Code'].str.contains('SMSS', regex=True), "SubTopic3"] = "Safely Managed Sanitation Services"

health.loc[health['Code'].str.startswith('SH.STA') & 
           health['Code'].str.contains('SMDW', regex=True), "SubTopic3"] = "Safely Managed Drinking Water Services"

health.loc[health['Code'].str.startswith('SH.STA') & 
           health['Code'].str.contains('BASS', regex=True), "SubTopic3"] = "Basic Sanitation Services"

health.loc[health['Code'].str.startswith('SH.STA') & 
           health['Code'].str.contains('HYGN', regex=True), "SubTopic3"] = "Handwashing Facilities Including Soap and Water"

health.loc[health['Code'].str.startswith('SH.STA') & 
           health['Code'].str.contains('POIS', regex=True), "SubTopic3"] = "Unintentional Poisoning"

health.loc[health['Code'].str.startswith('SH.STA') & 
           health['Code'].str.contains('AIRP', regex=True), "SubTopic3"] = "Household and Ambient Air Pollution"


health.loc[health['Code'].str.startswith('SH.STA') & 
           health['Code'].str.contains('WASH', regex=True), "SubTopic3"] = "Unsafe Water, Unsafe Sanitation and Lack of Hygiene"

# H2O

health.loc[health['Code'].str.startswith('SH.H2O') & 
           health['Code'].str.contains('BASW', regex=True), "SubTopic3"] = "Basic Drinking Water Services"

health.loc[health['Code'].str.startswith('SH.H2O') & 
           health['Code'].str.contains('HYGN', regex=True), "SubTopic3"] = "Basic Handwashing Facilities Including Soap and Water"

health.loc[health['Code'].str.startswith('SH.H2O') & 
           health['Code'].str.contains('SMDW', regex=True), "SubTopic3"] = "Safely Managed Drinking Water Services"

# XPD

health.loc[health['Code'].str.startswith('SH.XPD') & 
           health['Code'].str.contains('PVTD', regex=True), "SubTopic3"] = "Domestic Private Health Expenditure"

health.loc[health['Code'].str.startswith('SH.XPD') & 
           health['Code'].str.contains('GHED', regex=True), "SubTopic3"] = "Domestic General Government Health Expenditure"

health.loc[health['Code'].str.startswith('SH.XPD') & 
           health['Code'].str.contains('EHEX', regex=True), "SubTopic3"] = "Current Health Expenditure"

health.loc[health['Code'].str.startswith('SH.XPD') & 
           health['Code'].str.contains('CHEX', regex=True), "SubTopic3"] = "Current Health Expenditure per capita "


# ITK

health.loc[health['Code'].str.startswith('SN.ITK') & 
           health['Code'].str.contains('SVFI', regex=True), "SubTopic3"] = "Severe Food Insecurity"

health.loc[health['Code'].str.startswith('SN.ITK') & 
           health['Code'].str.contains('MSFI', regex=True), "SubTopic3"] = "Moderate or Severe Food Insecurity"

In [89]:
with pd.option_context('display.max_colwidth', 200):

    display(health[health['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [90]:
has_list = health.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


health_with_lists = health[has_list]
health_with_lists


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
117,SP.DYN.CDRT.IN,"Death rate, crude (per 1,000 people)",Health: Population: Dynamics,Health,Population,"[Dynamics, Dynamic]",Crude death rate
118,SP.HOU.FEMA.ZS,Female headed households (% of households with...,Health: Population: Dynamics,Health,Population,"[Dynamics, Household]",Female
119,SP.REG.BRTH.ZS,Completeness of birth registration (%),Health: Population: Dynamics,Health,Population,"[Dynamics, Registration (or regulatory)]",Birth
120,SP.REG.BRTH.MA.ZS,"Completeness of birth registration, male (%)",Health: Population: Dynamics,Health,Population,"[Dynamics, Registration (or regulatory)]",Birth
121,SP.REG.BRTH.RU.ZS,"Completeness of birth registration, rural (%)",Health: Population: Dynamics,Health,Population,"[Dynamics, Registration (or regulatory)]",Birth
...,...,...,...,...,...,...,...
183,SP.POP.0004.FE.5Y,"Population ages 00-04, female (% of female pop...",Health: Population: Structure,Health,Population,"[Structure, Population]",Age 0-4
184,SP.POP.0004.MA.5Y,"Population ages 00-04, male (% of male populat...",Health: Population: Structure,Health,Population,"[Structure, Population]",Age 0-4
185,SP.POP.0014.TO.ZS,Population ages 0-14 (% of total population),Health: Population: Structure,Health,Population,"[Structure, Population]",Age 0-14
186,SP.POP.0014.FE.IN,"Population ages 0-14, female",Health: Population: Structure,Health,Population,"[Structure, Population]",Age 0-14


In [91]:
health["SubTopic2"] = health["SubTopic2"].apply(select_first_item)

In [92]:
has_list = health.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


health_with_lists = health[has_list]
health_with_lists


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


## handling the (text (or text))

In [93]:

with pd.option_context('display.max_colwidth', 200):
    display(health[health['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
25,SH.CON.1524.MA.ZS,"Condom use, population ages 15-24, male (% of males ages 15-24)",Health: Disease prevention,Health,Disease prevention,Consumption (or condom),Age 15-24
26,SH.CON.1524.FE.ZS,"Condom use, population ages 15-24, female (% of females ages 15-24)",Health: Disease prevention,Health,Disease prevention,Consumption (or condom),Age 15-24
109,SH.ANM.CHLD.ZS,Prevalence of anemia among children (% of children ages 6-59 months),Health: Nutrition,Health,Nutrition,Anemia (or animal),Child
110,SH.ANM.NPRG.ZS,Prevalence of anemia among non-pregnant women (% of women ages 15-49),Health: Nutrition,Health,Nutrition,Anemia (or animal),Non-pregnant
112,SH.ANM.ALLW.ZS,Prevalence of anemia among women of reproductive age (% of women ages 15-49),Health: Nutrition,Health,Nutrition,Anemia (or animal),All women


In [94]:
health.loc[health['Code'].str.startswith('SH.CON') , "SubTopic2"] = "Condom"
health.loc[health['Code'].str.startswith('SH.ANM') , "SubTopic2"] = "Anemia"

In [95]:
with pd.option_context('display.max_colwidth', 200):
    display(health[health['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# Infrastructure   

In [96]:
infrastructure   = dfs["infrastructure"]

In [97]:
with pd.option_context('display.max_colwidth', 200):

    display(infrastructure)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,IT.NET.USER.MA.ZS,"Individuals using the Internet, male (% of male population)",Infrastructure: Communications,Infrastructure,Communications,Internet,Users
1,IT.NET.USER.ZS,Individuals using the Internet (% of population),Infrastructure: Communications,Infrastructure,Communications,Internet,Users
2,IT.MLT.MAIN,Fixed telephone subscriptions,Infrastructure: Communications,Infrastructure,Communications,Mainline telephones,Telephone main lines
3,IT.MLT.MAIN.P2,Fixed telephone subscriptions (per 100 people),Infrastructure: Communications,Infrastructure,Communications,Mainline telephones,Telephone main lines
4,BX.GSR.CCIS.CD,"ICT service exports (BoP, current US$)",Infrastructure: Communications,Infrastructure,Communications,Goods and services,Computer and information systems
5,BX.GSR.CCIS.ZS,"ICT service exports (% of service exports, BoP)",Infrastructure: Communications,Infrastructure,Communications,Goods and services,Computer and information systems
6,TM.VAL.ICTG.ZS.UN,ICT goods imports (% total goods imports),Infrastructure: Communications,Infrastructure,Communications,Value,ICT goods
7,TX.VAL.ICTG.ZS.UN,ICT goods exports (% of total goods exports),Infrastructure: Communications,Infrastructure,Communications,Value,ICT goods
8,IT.NET.BBND,Fixed broadband subscriptions,Infrastructure: Communications,Infrastructure,Communications,Internet,Broadband
9,IT.NET.USER.FE.ZS,"Individuals using the Internet, female (% of female population)",Infrastructure: Communications,Infrastructure,Communications,Internet,Users


In [98]:
with pd.option_context('display.max_colwidth', 200):

    display(infrastructure[infrastructure['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [99]:
with pd.option_context('display.max_colwidth', 200):

    display(infrastructure[infrastructure['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [100]:
has_list = infrastructure.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


infrastructure_with_lists = infrastructure[has_list]
print(infrastructure_with_lists)


Empty DataFrame
Columns: [Code, Indicator Name, General Topic, Topic, SubTopic1, SubTopic2, SubTopic3]
Index: []


## handling the (text (or text))

In [101]:

with pd.option_context('display.max_colwidth', 200):
    display(infrastructure[infrastructure['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [102]:
with pd.option_context('display.max_colwidth', 200):
    display(infrastructure[infrastructure['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# Poverty   

In [103]:
poverty   = dfs["poverty"]

In [104]:
with pd.option_context('display.max_colwidth', 200):

    display(poverty)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SI.DST.50MD,Proportion of people living below 50 percent of median income (%),Poverty: Income distribution,Poverty,Income distribution,Distribution,
1,SI.DST.05TH.20,Income share held by highest 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Fifth (highest) quintile
2,SI.DST.02ND.20,Income share held by second 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Second
3,SI.DST.FRST.10,Income share held by lowest 10%,Poverty: Income distribution,Poverty,Income distribution,Distribution,"First, lowest (also Forest)"
4,SI.DST.10TH.10,Income share held by highest 10%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Highest decile
5,SI.DST.04TH.20,Income share held by fourth 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Fourth
6,SI.DST.FRST.20,Income share held by lowest 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,"First, lowest (also Forest)"
7,SI.DST.03RD.20,Income share held by third 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Third
8,SI.POV.GINI,Gini index,Poverty: Income distribution,Poverty,Income distribution,Poverty,GINI index
9,SI.POV.LMIC.GP,Poverty gap at $3.65 a day (2017 PPP) (%),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,


In [105]:
with pd.option_context('display.max_colwidth', 200):

    display(poverty[poverty['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [106]:
with pd.option_context('display.max_colwidth', 200):

    display(poverty[poverty['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SI.DST.50MD,Proportion of people living below 50 percent of median income (%),Poverty: Income distribution,Poverty,Income distribution,Distribution,
9,SI.POV.LMIC.GP,Poverty gap at $3.65 a day (2017 PPP) (%),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
11,SI.POV.UMIC.GP,Poverty gap at $6.85 a day (2017 PPP) (%),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
12,SI.POV.MPUN,Multidimensional poverty headcount ratio (UNDP) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
13,SI.POV.MPWB,Multidimensional poverty headcount ratio (World Bank) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
14,SI.POV.LMIC,Poverty headcount ratio at $3.65 a day (2017 PPP) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
16,SI.POV.UMIC,Poverty headcount ratio at $6.85 a day (2017 PPP) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
17,SI.POV.SOPO,Poverty headcount ratio at societal poverty line (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
19,SI.SPR.PGAP,Prosperity gap (average shortfall from a prosperity standard of $25/day),Poverty: Shared prosperity,Poverty,Shared prosperity,Shared prosperity,


In [107]:
with pd.option_context('display.max_colwidth', 200):
    display(poverty[poverty['Code'].str.startswith('SI.DST')])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SI.DST.50MD,Proportion of people living below 50 percent of median income (%),Poverty: Income distribution,Poverty,Income distribution,Distribution,
1,SI.DST.05TH.20,Income share held by highest 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Fifth (highest) quintile
2,SI.DST.02ND.20,Income share held by second 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Second
3,SI.DST.FRST.10,Income share held by lowest 10%,Poverty: Income distribution,Poverty,Income distribution,Distribution,"First, lowest (also Forest)"
4,SI.DST.10TH.10,Income share held by highest 10%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Highest decile
5,SI.DST.04TH.20,Income share held by fourth 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Fourth
6,SI.DST.FRST.20,Income share held by lowest 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,"First, lowest (also Forest)"
7,SI.DST.03RD.20,Income share held by third 20%,Poverty: Income distribution,Poverty,Income distribution,Distribution,Third


In [108]:
poverty.loc[poverty["Code"] == "SI.DST.50MD",  "SubTopic3"] = "Below 50% Median Income"


In [109]:
with pd.option_context('display.max_colwidth', 200):
    display(poverty[poverty['Code'].str.startswith('SI.POV')])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
8,SI.POV.GINI,Gini index,Poverty: Income distribution,Poverty,Income distribution,Poverty,GINI index
9,SI.POV.LMIC.GP,Poverty gap at $3.65 a day (2017 PPP) (%),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
10,SI.POV.GAPS,Poverty gap at $2.15 a day (2017 PPP) (%),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,Poverty gap at $1.25 a day
11,SI.POV.UMIC.GP,Poverty gap at $6.85 a day (2017 PPP) (%),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
12,SI.POV.MPUN,Multidimensional poverty headcount ratio (UNDP) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
13,SI.POV.MPWB,Multidimensional poverty headcount ratio (World Bank) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
14,SI.POV.LMIC,Poverty headcount ratio at $3.65 a day (2017 PPP) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
15,SI.POV.DDAY,Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,$1.25 a day
16,SI.POV.UMIC,Poverty headcount ratio at $6.85 a day (2017 PPP) (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,
17,SI.POV.SOPO,Poverty headcount ratio at societal poverty line (% of population),Poverty: Poverty rates,Poverty,Poverty rates,Poverty,


In [110]:
poverty.loc[poverty["Code"] == "SI.POV.LMIC.GP", "SubTopic3"] = ["Poverty gap at $3.65 a day"]
poverty.loc[poverty["Code"] == "SI.POV.SOPO", "SubTopic3"] = ["Societal poverty headcount"]
poverty.loc[poverty["Code"] == "SI.POV.UMIC", "SubTopic3"] = ["Poverty headcount at $6.85 a day"]
poverty.loc[poverty["Code"] == "SI.POV.DDAY", "SubTopic3"] = ["$2.15 a day"]
poverty.loc[poverty["Code"] == "SI.POV.MPUN", "SubTopic3"] = ["Multidimensional poverty (UNDP)"]
poverty.loc[poverty["Code"] == "SI.POV.MPWB", "SubTopic3"] = ["Multidimensional poverty (World Bank)"]
poverty.loc[poverty["Code"] == "SI.POV.UMIC.GP", "SubTopic3"] = ["Poverty gap at $6.85 a day"]
poverty.loc[poverty["Code"] == "SI.POV.GAPS", "SubTopic3"] = ["Poverty gap at $2.15 a day"]
poverty.loc[poverty["Code"] == "SI.POV.LMIC", "SubTopic3"] = ["Poverty headcount at $3.65 a day"]
poverty.loc[poverty["Code"] == "SI.POV.DDAY", "SubTopic3"] = ["Poverty headcount at $2.15 a day"]

In [111]:
with pd.option_context('display.max_colwidth', 200):
    display(poverty[poverty['Code'].str.startswith('SI.SPR')])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
19,SI.SPR.PGAP,Prosperity gap (average shortfall from a prosperity standard of $25/day),Poverty: Shared prosperity,Poverty,Shared prosperity,Shared prosperity,
20,SI.SPR.PC40,"Survey mean consumption or income per capita, bottom 40% of population (2017 PPP $ per day)",Poverty: Shared prosperity,Poverty,Shared prosperity,Shared prosperity,Bottom 40% of population
21,SI.SPR.PC40.ZG,"Annualized average growth rate in per capita real survey mean consumption or income, bottom 40% of population (%)",Poverty: Shared prosperity,Poverty,Shared prosperity,Shared prosperity,Bottom 40% of population
22,SI.SPR.PCAP.ZG,"Annualized average growth rate in per capita real survey mean consumption or income, total population (%)",Poverty: Shared prosperity,Poverty,Shared prosperity,Shared prosperity,Per capita
23,SI.SPR.PCAP,"Survey mean consumption or income per capita, total population (2017 PPP $ per day)",Poverty: Shared prosperity,Poverty,Shared prosperity,Shared prosperity,Per capita


In [112]:
poverty.loc[poverty["Code"] == "SI.SPR.PCAP.ZG", "SubTopic3"] = ["Total"]
poverty.loc[poverty["Code"] == "SI.SPR.PGAP", "SubTopic3"] = ["Prosperity gap at $25/day"]
poverty.loc[poverty["Code"] == "SI.SPR.PCAP", "SubTopic3"] = ["Total"]


In [113]:
has_list = poverty.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


poverty_with_lists = poverty[has_list]
poverty_with_lists


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


## handling the (text (or text))

In [114]:

with pd.option_context('display.max_colwidth', 200):
    display(poverty[poverty['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [115]:
with pd.option_context('display.max_colwidth', 200):
    display(poverty[poverty['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# Private    

In [116]:
private  = dfs["private"]

In [117]:
with pd.option_context('display.max_colwidth', 200):

    display(private)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,IC.FRM.FEMO.ZS,Firms with female participation in ownership (% of firms),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Female ownership
1,IC.FRM.METG.ZS,Firms visited or required meetings with tax officials (% of firms),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Meetings
2,IC.TAX.PRFT.CP.ZS,Profit tax (% of commercial profits),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Tax revenue (or tariff) (or tax related),Profit
3,IC.FRM.FEMM.ZS,Firms with female top manager (% of firms),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Female manager
4,IC.LGL.CRED.XQ,Strength of legal rights index (0=weak to 12=strong),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Legal,Credit
...,...,...,...,...,...,...,...
133,ST.INT.DPRT,"International tourism, number of departures",Private Sector & Trade: Travel & tourism,Private Sector & Trade,Travel & tourism,Interest payments (or international),Departures
134,ST.INT.TVLR.CD,"International tourism, receipts for travel items (current US$)",Private Sector & Trade: Travel & tourism,Private Sector & Trade,Travel & tourism,Interest payments (or international),Travel receipts
135,ST.INT.RCPT.XP.ZS,"International tourism, receipts (% of total exports)",Private Sector & Trade: Travel & tourism,Private Sector & Trade,Travel & tourism,Interest payments (or international),Receipts
136,ST.INT.TRNR.CD,"International tourism, receipts for passenger transport items (current US$)",Private Sector & Trade: Travel & tourism,Private Sector & Trade,Travel & tourism,Interest payments (or international),Transport receipts (or Turnover ratio)


In [118]:
with pd.option_context('display.max_colwidth', 200):

    display(private[private['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
100,IE.PPN.ENGY.CD,Public private partnerships investment in energy (current US$),Private Sector & Trade: Private infrastructure investment,Private Sector & Trade,Private infrastructure investment,,Energy
101,IE.PPN.ICTI.CD,Public private partnerships investment in ICT (current US$),Private Sector & Trade: Private infrastructure investment,Private Sector & Trade,Private infrastructure investment,,ICT investment
102,IE.PPN.TRAN.CD,Public private partnerships investment in transport (current US$),Private Sector & Trade: Private infrastructure investment,Private Sector & Trade,Private infrastructure investment,,Transport
103,IE.PPN.WATR.CD,Public private partnerships investment in water and sanitation (current US$),Private Sector & Trade: Private infrastructure investment,Private Sector & Trade,Private infrastructure investment,,Water and sanitation
126,TX.UVI.MRCH.XD.WD,Export unit value index (2015 = 100),Private Sector & Trade: Trade price indices,Private Sector & Trade,Trade price indices,,Goods (merchandise)
127,TM.UVI.MRCH.XD.WD,Import unit value index (2015 = 100),Private Sector & Trade: Trade price indices,Private Sector & Trade,Trade price indices,,Goods (merchandise)


In [119]:
private.loc[private['Code'].str.startswith('IE.PPN'), "SubTopic2"] = "Public-Private Partnerships (PPP) Investments"
private.loc[private['Code'].str.startswith('TX.UVI') | private['Code'].str.startswith('TM.UVI'), "SubTopic2"] = "Unit Value Index "


In [120]:
private[private['Code'].str.startswith('TX.UVI') | private['Code'].str.startswith('TM.UVI')| private['Code'].str.startswith('IE.PPN')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
100,IE.PPN.ENGY.CD,Public private partnerships investment in ener...,Private Sector & Trade: Private infrastructure...,Private Sector & Trade,Private infrastructure investment,Public-Private Partnerships (PPP) Investments,Energy
101,IE.PPN.ICTI.CD,Public private partnerships investment in ICT ...,Private Sector & Trade: Private infrastructure...,Private Sector & Trade,Private infrastructure investment,Public-Private Partnerships (PPP) Investments,ICT investment
102,IE.PPN.TRAN.CD,Public private partnerships investment in tran...,Private Sector & Trade: Private infrastructure...,Private Sector & Trade,Private infrastructure investment,Public-Private Partnerships (PPP) Investments,Transport
103,IE.PPN.WATR.CD,Public private partnerships investment in wate...,Private Sector & Trade: Private infrastructure...,Private Sector & Trade,Private infrastructure investment,Public-Private Partnerships (PPP) Investments,Water and sanitation
126,TX.UVI.MRCH.XD.WD,Export unit value index (2015 = 100),Private Sector & Trade: Trade price indices,Private Sector & Trade,Trade price indices,Unit Value Index,Goods (merchandise)
127,TM.UVI.MRCH.XD.WD,Import unit value index (2015 = 100),Private Sector & Trade: Trade price indices,Private Sector & Trade,Trade price indices,Unit Value Index,Goods (merchandise)


In [121]:
with pd.option_context('display.max_colwidth', 200):

    display(private[private['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
29,IC.FRM.THEV.ZS,Firms experiencing losses due to theft and vandalism (% of firms),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,


In [122]:
private[private['Code'].str.startswith('IC.FRM')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,IC.FRM.FEMO.ZS,Firms with female participation in ownership (...,Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Female ownership
1,IC.FRM.METG.ZS,Firms visited or required meetings with tax of...,Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Meetings
3,IC.FRM.FEMM.ZS,Firms with female top manager (% of firms),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Female manager
6,IC.FRM.BKWC.ZS,Firms using banks to finance working capital (...,Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Bank's working capital
7,IC.FRM.BNKS.ZS,Firms using banks to finance investment (% of ...,Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Banks
8,IC.FRM.RSDV.ZS,Firms that spend on R&D (% of firms),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Research and development
15,IC.FRM.TRNG.ZS,Firms offering formal training (% of firms),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Training
16,IC.FRM.INFM.ZS,Firms that do not report all sales for tax pur...,Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Informal
18,IC.FRM.CMPU.ZS,Firms competing against unregistered firms (% ...,Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Unregistered competition
23,IC.FRM.FREG.ZS,Firms formally registered when operations star...,Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Firms,Firms formally registered


In [123]:
private.loc[private["Code"] == "IC.FRM.THEV.ZS", "SubTopic3"] = ["Theft and Vandalism"]

In [124]:
has_list = private.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


private_with_lists = private[has_list]
private_with_lists


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


## handling the (text (or text))

In [125]:

with pd.option_context('display.max_colwidth', 200):
    display(private[private['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
2,IC.TAX.PRFT.CP.ZS,Profit tax (% of commercial profits),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Tax revenue (or tariff) (or tax related),Profit
5,IC.REG.COST.PC.MA.ZS,"Cost of business start-up procedures, male (% of GNI per capita)",Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Registration (or regulatory),Cost
9,IC.REG.PROC.FE,"Start-up procedures to register a business, female (number)",Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Registration (or regulatory),Procedures
10,IC.REG.PROC,Start-up procedures to register a business (number),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Registration (or regulatory),Procedures
13,IC.REG.PROC.MA,"Start-up procedures to register a business, male (number)",Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Registration (or regulatory),Procedures
14,IC.REG.COST.PC.FE.ZS,"Cost of business start-up procedures, female (% of GNI per capita)",Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Registration (or regulatory),Cost
21,IC.TAX.GIFT.ZS,Firms expected to give gifts in meetings with tax officials (% of firms),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Tax revenue (or tariff) (or tax related),Gifts
25,IC.TAX.PAYM,Tax payments (number),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Tax revenue (or tariff) (or tax related),Payments
28,IC.REG.COST.PC.ZS,Cost of business start-up procedures (% of GNI per capita),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Registration (or regulatory),Cost
30,IC.REG.DURS,Time required to start a business (days),Private Sector & Trade: Business environment,Private Sector & Trade,Business environment,Registration (or regulatory),Duration


In [126]:
private.loc[private['Code'].str.startswith('IC.TAX') , "SubTopic2"] = "Tax revenue (or tax related)"
private.loc[private['Code'].str.startswith('IC.REG') , "SubTopic2"] = "Registration "
private.loc[private['Code'].str.startswith('IC.ISV') , "SubTopic2"] = "Insolvency"
private.loc[private['Code'].str.startswith('IE.PPI') , "SubTopic2"] = "Private participation in infrastructure"
private.loc[private['Code'].str.startswith('ST.INT') , "SubTopic2"] = "International"

In [127]:
with pd.option_context('display.max_colwidth', 200):
    display(private[private['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
114,LP.LPI.INFR.XQ,Logistics performance index: Quality of trade and transport-related infrastructure (1=low to 5=high),Private Sector & Trade: Trade facilitation,Private Sector & Trade,Trade facilitation,Logistics performance index,Infrastructure (or infant reported)
136,ST.INT.TRNR.CD,"International tourism, receipts for passenger transport items (current US$)",Private Sector & Trade: Travel & tourism,Private Sector & Trade,Travel & tourism,International,Transport receipts (or Turnover ratio)


In [128]:
private.loc[private['Code'].str.startswith('LP.LPI.INFR') , "SubTopic3"] = "Infrastructure"
private.loc[private['Code'].str.startswith('ST.INT.TRNR') , "SubTopic3"] = "Transport receipts"

# Public     

In [129]:
public  = dfs["public"]

In [130]:
with pd.option_context('display.max_colwidth', 200):

    display(public)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,VC.BTL.DETH,Battle-related deaths (number of people),Public Sector: Conflict & fragility,Public Sector,Conflict & fragility,Battle,Deaths
1,VC.IHR.PSRC.P5,"Intentional homicides (per 100,000 people)",Public Sector: Conflict & fragility,Public Sector,Conflict & fragility,Intentional homicides,Primary and secondary
2,VC.IHR.PSRC.FE.P5,"Intentional homicides, female (per 100,000 female)",Public Sector: Conflict & fragility,Public Sector,Conflict & fragility,Intentional homicides,Primary and secondary
3,VC.IHR.PSRC.MA.P5,"Intentional homicides, male (per 100,000 male)",Public Sector: Conflict & fragility,Public Sector,Conflict & fragility,Intentional homicides,Primary and secondary
4,VC.IDP.NWCV,"Internally displaced persons, new displacement associated with conflict and violence (number of cases)",Public Sector: Conflict & fragility,Public Sector,Conflict & fragility,Internally displaced persons,New conflict and violence (IDPs)
...,...,...,...,...,...,...,...
128,IQ.CPA.SOCI.XQ,CPIA policies for social inclusion/equity cluster average (1=low to 6=high),Public Sector: Policy & institutions,Public Sector,Policy & institutions,Country Policy and Institutional Assessment,Social inclusion/equity
129,IQ.CPA.FISP.XQ,CPIA fiscal policy rating (1=low to 6=high),Public Sector: Policy & institutions,Public Sector,Policy & institutions,Country Policy and Institutional Assessment,Fiscal policy
130,IQ.CPA.REVN.XQ,CPIA efficiency of revenue mobilization rating (1=low to 6=high),Public Sector: Policy & institutions,Public Sector,Policy & institutions,Country Policy and Institutional Assessment,Revenue mobilization
131,RQ.STD.ERR,Regulatory Quality: Standard Error,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,


In [131]:
with pd.option_context('display.max_colwidth', 200):

    display(public[public['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
61,RL.PER.RNK.LOWER,"Rule of Law: Percentile Rank, Lower Bound of 90% Confidence Interval",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
62,RL.EST,Rule of Law: Estimate,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
63,RL.NO.SRC,Rule of Law: Number of Sources,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
64,RL.PER.RNK,Rule of Law: Percentile Rank,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
65,RL.PER.RNK.UPPER,"Rule of Law: Percentile Rank, Upper Bound of 90% Confidence Interval",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
66,CC.STD.ERR,Control of Corruption: Standard Error,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
67,RL.STD.ERR,Rule of Law: Standard Error,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
68,CC.NO.SRC,Control of Corruption: Number of Sources,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
69,CC.PER.RNK,Control of Corruption: Percentile Rank,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
70,CC.PER.RNK.UPPER,"Control of Corruption: Percentile Rank, Upper Bound of 90% Confidence Interval",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,


In [132]:
a = public[public['SubTopic2'].isna()].copy()
a["Code_Prefix"] = a["Code"].str.split(".").str[1]


unique_prefixes = a["Code_Prefix"].unique()
print(unique_prefixes)


['PER' 'EST' 'NO' 'STD' 'SPI' 'HCI']


In [133]:
public[public['Code'].str.contains('SPI')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
73,IQ.SPI.OVRL,Statistical performance indicators (SPI): Over...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,Overall
74,IQ.SPI.PIL1,Statistical performance indicators (SPI): Pill...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
75,IQ.SPI.PIL2,Statistical performance indicators (SPI): Pill...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
76,IQ.SPI.PIL3,Statistical performance indicators (SPI): Pill...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
77,IQ.SPI.PIL4,Statistical performance indicators (SPI): Pill...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
78,IQ.SPI.PIL5,Statistical performance indicators (SPI): Pill...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,


In [134]:
public[public['Code'].str.contains('EST')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
62,RL.EST,Rule of Law: Estimate,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
71,CC.EST,Control of Corruption: Estimate,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
79,VA.EST,Voice and Accountability: Estimate,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
87,PV.EST,Political Stability and Absence of Violence/Te...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
108,GE.EST,Government Effectiveness: Estimate,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
118,RQ.EST,Regulatory Quality: Estimate,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,


In [135]:
public[public['Code'].str.contains('PER')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
61,RL.PER.RNK.LOWER,"Rule of Law: Percentile Rank, Lower Bound of 9...",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
64,RL.PER.RNK,Rule of Law: Percentile Rank,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
65,RL.PER.RNK.UPPER,"Rule of Law: Percentile Rank, Upper Bound of 9...",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
69,CC.PER.RNK,Control of Corruption: Percentile Rank,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
70,CC.PER.RNK.UPPER,"Control of Corruption: Percentile Rank, Upper ...",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
72,CC.PER.RNK.LOWER,"Control of Corruption: Percentile Rank, Lower ...",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
80,VA.PER.RNK,Voice and Accountability: Percentile Rank,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
81,VA.PER.RNK.LOWER,"Voice and Accountability: Percentile Rank, Low...",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
82,VA.PER.RNK.UPPER,"Voice and Accountability: Percentile Rank, Upp...",Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
88,PV.PER.RNK,Political Stability and Absence of Violence/Te...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,


# for PER This is a good example where we need to further add a new column to add more informations like in this case (LOWER , UPPER)

In [136]:
public[public['Code'].str.contains('STD')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
66,CC.STD.ERR,Control of Corruption: Standard Error,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
67,RL.STD.ERR,Rule of Law: Standard Error,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
83,VA.STD.ERR,Voice and Accountability: Standard Error,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
93,PV.STD.ERR,Political Stability and Absence of Violence/Te...,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
97,GE.STD.ERR,Government Effectiveness: Standard Error,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,
131,RQ.STD.ERR,Regulatory Quality: Standard Error,Public Sector: Policy & institutions,Public Sector,Policy & institutions,,


In [137]:
public.loc[public['Code'].str.startswith('HD.HCI'), "SubTopic2"] = "Human capital index"


public.loc[public['Code'].str.startswith('RQ'), "SubTopic2"] = "Regulatory Quality"
public.loc[public['Code'].str.startswith('GE'), "SubTopic2"] = "Government Effectiveness"
public.loc[public['Code'].str.startswith('RL'), "SubTopic2"] = "Rule of Law"
public.loc[public['Code'].str.startswith('CC'), "SubTopic2"] = "Control of Corruption"
public.loc[public['Code'].str.startswith('PV'), "SubTopic2"] = "Political Stability and Absence of Violence/Terrorism"
public.loc[public['Code'].str.startswith('VA.'), "SubTopic2"] = "Voice and Accountability"
public.loc[public['Code'].str.startswith('IQ.'), "SubTopic2"] = "Statistical performance indicators"

public.loc[public['Code'].str.contains('PER'), "SubTopic3"] = "Percentile Rank"
public.loc[public["Code"].str.contains("EST"), "SubTopic3"] = "Estimate"
public.loc[public["Code"].str.contains("STD.ERR"), "SubTopic3"] = "Standard Error"
public.loc[public["Code"].str.contains("NO") , "SubTopic3"] = "Number of Sources"


public.loc[public["Code"].str.contains("PIL1") , "SubTopic3"] = "Data use score"
public.loc[public["Code"].str.contains("PIL2") , "SubTopic3"] = "Data services score"
public.loc[public["Code"].str.contains("PIL3") , "SubTopic3"] = "Data products score"
public.loc[public["Code"].str.contains("PIL4") , "SubTopic3"] = "Data sources score"
public.loc[public["Code"].str.contains("PIL5") , "SubTopic3"] = "Data infrastructure score"
public.loc[public["Code"].str.contains("OVRL") , "SubTopic3"] = "Overall score"

In [138]:
with pd.option_context('display.max_colwidth', 200):

    display(public[public['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
60,GF.XPD.BUDG.ZS,Primary government expenditures as a proportion of original approved budget (%),Public Sector: Government finance,Public Sector,Government finance,Expenditure,


In [139]:
public.loc[public["Code"] == "GF.XPD.BUDG.ZS", "SubTopic3"] = ["Budget"]

In [140]:
has_list = public.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


public_with_lists = public[has_list]
with pd.option_context('display.max_colwidth', 200):
    display(public_with_lists)


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
15,GC.DOD.TOTL.CN,"Central government debt, total (current LCU)",Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Debt outstanding and disbursed]",Total
16,GC.DOD.TOTL.GD.ZS,"Central government debt, total (% of GDP)",Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Debt outstanding and disbursed]",Total
17,GC.AST.TOTL.GD.ZS,Net acquisition of financial assets (% of GDP),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Asset]",Total
18,GC.NLD.TOTL.CN,Net lending (+) / net borrowing (-) (current LCU),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Net lending/net borrowing]",Total
19,GC.LBL.TOTL.GD.ZS,"Net incurrence of liabilities, total (% of GDP)",Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Liability]",Total
20,GC.NLD.TOTL.GD.ZS,Net lending (+) / net borrowing (-) (% of GDP),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Net lending/net borrowing]",Total
21,GC.NFN.TOTL.GD.ZS,Net investment in nonfinancial assets (% of GDP),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Nonfinancial assets]",Total
22,GC.NFN.TOTL.CN,Net investment in nonfinancial assets (current LCU),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Nonfinancial assets]",Total
23,GC.LBL.TOTL.CN,"Net incurrence of liabilities, total (current LCU)",Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Liability]",Total
24,GC.AST.TOTL.CN,Net acquisition of financial assets (current LCU),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Asset]",Total


In [141]:
with pd.option_context('display.max_colwidth', 200):

    display(public[public['Code'].str.startswith('GC')])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
15,GC.DOD.TOTL.CN,"Central government debt, total (current LCU)",Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Debt outstanding and disbursed]",Total
16,GC.DOD.TOTL.GD.ZS,"Central government debt, total (% of GDP)",Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Debt outstanding and disbursed]",Total
17,GC.AST.TOTL.GD.ZS,Net acquisition of financial assets (% of GDP),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Asset]",Total
18,GC.NLD.TOTL.CN,Net lending (+) / net borrowing (-) (current LCU),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Net lending/net borrowing]",Total
19,GC.LBL.TOTL.GD.ZS,"Net incurrence of liabilities, total (% of GDP)",Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Liability]",Total
20,GC.NLD.TOTL.GD.ZS,Net lending (+) / net borrowing (-) (% of GDP),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Net lending/net borrowing]",Total
21,GC.NFN.TOTL.GD.ZS,Net investment in nonfinancial assets (% of GDP),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Nonfinancial assets]",Total
22,GC.NFN.TOTL.CN,Net investment in nonfinancial assets (current LCU),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Nonfinancial assets]",Total
23,GC.LBL.TOTL.CN,"Net incurrence of liabilities, total (current LCU)",Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Liability]",Total
24,GC.AST.TOTL.CN,Net acquisition of financial assets (current LCU),Public Sector: Government finance: Deficit & financing,Public Sector,Government finance,"[Deficit & financing, Asset]",Total


In [142]:
public["SubTopic2"] = public["SubTopic2"].apply(select_second_item)

## handling the (text (or text))

In [143]:

with pd.option_context('display.max_colwidth', 200):
    display(public[public['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
38,GC.TAX.YPKG.CN,"Taxes on income, profits and capital gains (current LCU)",Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),"Income, profits and capital gains"
40,GC.TAX.GSRV.CN,Taxes on goods and services (current LCU),Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),Goods and services
41,GC.TAX.INTT.RV.ZS,Taxes on international trade (% of revenue),Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),International trade
42,GC.TAX.INTT.CN,Taxes on international trade (current LCU),Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),International trade
45,GC.TAX.YPKG.ZS,"Taxes on income, profits and capital gains (% of total taxes)",Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),"Income, profits and capital gains"
46,GC.TAX.TOTL.GD.ZS,Tax revenue (% of GDP),Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),Total
48,GC.TAX.TOTL.CN,Tax revenue (current LCU),Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),Total
49,GC.TAX.EXPT.ZS,Taxes on exports (% of tax revenue),Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),Exports
50,GC.TAX.EXPT.CN,Taxes on exports (current LCU),Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),Exports
51,GC.TAX.GSRV.RV.ZS,Taxes on goods and services (% of revenue),Public Sector: Government finance: Revenue,Public Sector,Government finance,Tax revenue (or tariff) (or tax related),Goods and services


In [144]:
public.loc[public['Code'].str.startswith('GC.TAX') , "SubTopic2"] = "Tax revenue (or tax related)"


In [145]:
with pd.option_context('display.max_colwidth', 200):
    display(public[public['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# Social 

In [146]:
social  = dfs["social"]

In [147]:
with pd.option_context('display.max_colwidth', 200):

    display(social)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,SL.TLF.0714.WK.TM,"Average working hours of children, working only, ages 7-14 (hours per week)",Social Protection & Labor: Economic activity,Social Protection & Labor,Economic activity,Total labor force,Age 7-14
1,SL.TLF.0714.SW.TM,"Average working hours of children, study and work, ages 7-14 (hours per week)",Social Protection & Labor: Economic activity,Social Protection & Labor,Economic activity,Total labor force,Age 7-14
2,SL.TLF.0714.SW.FE.TM,"Average working hours of children, study and work, female, ages 7-14 (hours per week)",Social Protection & Labor: Economic activity,Social Protection & Labor,Economic activity,Total labor force,Age 7-14
3,SL.TLF.0714.SW.MA.TM,"Average working hours of children, study and work, male, ages 7-14 (hours per week)",Social Protection & Labor: Economic activity,Social Protection & Labor,Economic activity,Total labor force,Age 7-14
4,SL.TLF.0714.WK.FE.TM,"Average working hours of children, working only, female, ages 7-14 (hours per week)",Social Protection & Labor: Economic activity,Social Protection & Labor,Economic activity,Total labor force,Age 7-14
...,...,...,...,...,...,...,...
156,SL.UEM.NEET.FE.ZS,"Share of youth not in education, employment or training, female (% of female youth population)",Social Protection & Labor: Unemployment,Social Protection & Labor,Unemployment,Unemployment,Not in education or training
157,SL.UEM.NEET.FE.ME.ZS,"Share of youth not in education, employment or training, female (% of female youth population) (modeled ILO estimate)",Social Protection & Labor: Unemployment,Social Protection & Labor,Unemployment,Unemployment,Not in education or training
158,SL.UEM.NEET.MA.ZS,"Share of youth not in education, employment or training, male (% of male youth population)",Social Protection & Labor: Unemployment,Social Protection & Labor,Unemployment,Unemployment,Not in education or training
159,SL.UEM.NEET.ZS,"Share of youth not in education, employment or training, total (% of youth population)",Social Protection & Labor: Unemployment,Social Protection & Labor,Unemployment,Unemployment,Not in education or training


In [148]:
with pd.option_context('display.max_colwidth', 200):

    display(social[social['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
107,per_si_allsi.ben_q1_tot,Benefit incidence of social insurance programs to poorest quintile (% of total social insurance benefits),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
108,per_sa_allsa.ben_q1_tot,Benefit incidence of social safety net programs to poorest quintile (% of total safety net benefits),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
109,per_lm_alllm.ben_q1_tot,Benefit incidence of unemployment benefits and ALMP to poorest quintile (% of total U/ALMP benefits),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
110,per_allsp.ben_q1_tot,Benefit incidence of social protection and labor programs to poorest quintile (% of total SPL benefits),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
111,per_allsp.adq_pop_tot,Adequacy of social protection and labor programs (% of total welfare of beneficiary households),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
112,per_si_allsi.adq_pop_tot,Adequacy of social insurance programs (% of total welfare of beneficiary households),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
113,per_lm_alllm.adq_pop_tot,Adequacy of unemployment benefits and ALMP (% of total welfare of beneficiary households),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
114,per_sa_allsa.adq_pop_tot,Adequacy of social safety net programs (% of total welfare of beneficiary households),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
115,per_sa_allsa.cov_q2_tot,Coverage of social safety net programs in 2nd quintile (% of population),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
116,per_lm_alllm.cov_q5_tot,Coverage of unemployment benefits and ALMP in richest quintile (% of population),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,


# These are unexpected type of codes that are found  in the wdi metadata so for better work we will convert them to more suitable structure and we will store them into list 

In [149]:
# SP = Social Protection and Labor Programs

def transform_code(code):
    if code.startswith("per_"):  
        parts = code.replace('.', '_').split('_') 
        if parts[1] == "allsp":  # Special case for "SP"
            return "SP" + '.' + parts[2].upper() + '.' + parts[3].upper() + '.' + parts[4].upper()
        else:
            return parts[1].upper() + '.' + parts[3].upper() + '.' + parts[4].upper() + '.' + parts[5].upper()
    return code



## For Storing the new codes

In [150]:
changes = social[social['SubTopic2'].isna()]["Code"]
changes = changes.to_frame(name="Old Code")
changes.reset_index(drop=True, inplace=True)



In [151]:
changes["Code"] = changes["Old Code"].apply(transform_code)
changes.to_excel("../data/hierarchy/changed_codes_social.xlsx", index=False)

In [152]:
social["Code"] = social["Code"].apply(transform_code)

In [153]:
with pd.option_context('display.max_colwidth', 200):

    display(social[social['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
107,SI.BEN.Q1.TOT,Benefit incidence of social insurance programs to poorest quintile (% of total social insurance benefits),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
108,SA.BEN.Q1.TOT,Benefit incidence of social safety net programs to poorest quintile (% of total safety net benefits),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
109,LM.BEN.Q1.TOT,Benefit incidence of unemployment benefits and ALMP to poorest quintile (% of total U/ALMP benefits),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
110,SP.BEN.Q1.TOT,Benefit incidence of social protection and labor programs to poorest quintile (% of total SPL benefits),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
111,SP.ADQ.POP.TOT,Adequacy of social protection and labor programs (% of total welfare of beneficiary households),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
112,SI.ADQ.POP.TOT,Adequacy of social insurance programs (% of total welfare of beneficiary households),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
113,LM.ADQ.POP.TOT,Adequacy of unemployment benefits and ALMP (% of total welfare of beneficiary households),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
114,SA.ADQ.POP.TOT,Adequacy of social safety net programs (% of total welfare of beneficiary households),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
115,SA.COV.Q2.TOT,Coverage of social safety net programs in 2nd quintile (% of population),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,
116,LM.COV.Q5.TOT,Coverage of unemployment benefits and ALMP in richest quintile (% of population),Social Protection & Labor: Performance,Social Protection & Labor,Performance,,


In [154]:
social.loc[social['Code'].str.startswith('SI'), "SubTopic2"] = "Social insurance programs"
social.loc[social['Code'].str.startswith('SA'), "SubTopic2"] = "Social safety net programs"
social.loc[social['Code'].str.startswith('LM'), "SubTopic2"] = "Unemployment benefits and ALMP"
social.loc[social['Code'].str.startswith('SP'), "SubTopic2"] = "Social protection and labor programs"


social.loc[social['Code'].str.contains('COV'), "SubTopic3"] = "Coverage"
social.loc[social["Code"].str.contains("ADQ"), "SubTopic3"] = "Adequacy"
social.loc[social["Code"].str.contains("BEN") , "SubTopic3"] = "Benefit incidence"



In [155]:
social.loc[social['Code'].str.startswith('SI') |social['Code'].str.startswith('SA')|social['Code'].str.startswith('LM')|social['Code'].str.startswith('SP')]

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
107,SI.BEN.Q1.TOT,Benefit incidence of social insurance programs...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Social insurance programs,Benefit incidence
108,SA.BEN.Q1.TOT,Benefit incidence of social safety net program...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Social safety net programs,Benefit incidence
109,LM.BEN.Q1.TOT,Benefit incidence of unemployment benefits and...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Unemployment benefits and ALMP,Benefit incidence
110,SP.BEN.Q1.TOT,Benefit incidence of social protection and lab...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Social protection and labor programs,Benefit incidence
111,SP.ADQ.POP.TOT,Adequacy of social protection and labor progra...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Social protection and labor programs,Adequacy
112,SI.ADQ.POP.TOT,Adequacy of social insurance programs (% of to...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Social insurance programs,Adequacy
113,LM.ADQ.POP.TOT,Adequacy of unemployment benefits and ALMP (% ...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Unemployment benefits and ALMP,Adequacy
114,SA.ADQ.POP.TOT,Adequacy of social safety net programs (% of t...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Social safety net programs,Adequacy
115,SA.COV.Q2.TOT,Coverage of social safety net programs in 2nd ...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Social safety net programs,Coverage
116,LM.COV.Q5.TOT,Coverage of unemployment benefits and ALMP in ...,Social Protection & Labor: Performance,Social Protection & Labor,Performance,Unemployment benefits and ALMP,Coverage


In [156]:
with pd.option_context('display.max_colwidth', 200):

    display(social[social['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [157]:
has_list = social.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


social_with_lists = social[has_list]
with pd.option_context('display.max_colwidth', 200):
    display(social_with_lists)


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


## handling the (text (or text))

In [158]:

with pd.option_context('display.max_colwidth', 200):
    display(social[social['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
22,SL.IND.EMPL.ZS,Employment in industry (% of total employment) (modeled ILO estimate),Social Protection & Labor: Economic activity,Social Protection & Labor,Economic activity,Industry (or interest due) (or index),Economically active population (employees)
23,SL.IND.EMPL.FE.ZS,"Employment in industry, female (% of female employment) (modeled ILO estimate)",Social Protection & Labor: Economic activity,Social Protection & Labor,Economic activity,Industry (or interest due) (or index),Economically active population (employees)
24,SL.IND.EMPL.MA.ZS,"Employment in industry, male (% of male employment) (modeled ILO estimate)",Social Protection & Labor: Economic activity,Social Protection & Labor,Economic activity,Industry (or interest due) (or index),Economically active population (employees)


In [159]:
social.loc[social['Code'].str.startswith('SL.IND') , "SubTopic2"] = "Industry"


In [160]:
with pd.option_context('display.max_colwidth', 200):
    display(social[social['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


# Trade  

In [161]:
trade  = dfs["trade"]

In [162]:
with pd.option_context('display.max_colwidth', 200):

    display(trade)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,TM.TAX.TCOM.BR.ZS,"Bound rate, simple mean, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
1,TM.TAX.MANF.IP.ZS,"Share of tariff lines with international peaks, manufactured products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
2,TM.TAX.MRCH.BR.ZS,"Bound rate, simple mean, all products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
3,TM.TAX.TCOM.WM.FN.ZS,"Tariff rate, most favored nation, weighted mean, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
4,TM.TAX.TCOM.BC.ZS,"Binding coverage, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
5,TM.TAX.MANF.BC.ZS,"Binding coverage, manufactured products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
6,TM.TAX.MRCH.BC.ZS,"Binding coverage, all products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
7,TM.TAX.MANF.WM.FN.ZS,"Tariff rate, most favored nation, weighted mean, manufactured products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
8,TM.TAX.MRCH.WM.FN.ZS,"Tariff rate, most favored nation, weighted mean, all products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
9,TM.TAX.TCOM.SM.FN.ZS,"Tariff rate, most favored nation, simple mean, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)


In [163]:
with pd.option_context('display.max_colwidth', 200):

    display(trade[trade['SubTopic2'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [164]:
with pd.option_context('display.max_colwidth', 200):

    display(trade[trade['SubTopic3'].isna()])

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


In [165]:
has_list = trade.apply(lambda row: isinstance(row['SubTopic2'], list) or isinstance(row['SubTopic3'], list), axis=1)


trade_with_lists = trade[has_list]
with pd.option_context('display.max_colwidth', 200):
    display(trade_with_lists)


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


## handling the (text (or text))

In [166]:

with pd.option_context('display.max_colwidth', 200):
    display(trade[trade['SubTopic2'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,TM.TAX.TCOM.BR.ZS,"Bound rate, simple mean, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
1,TM.TAX.MANF.IP.ZS,"Share of tariff lines with international peaks, manufactured products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
2,TM.TAX.MRCH.BR.ZS,"Bound rate, simple mean, all products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
3,TM.TAX.TCOM.WM.FN.ZS,"Tariff rate, most favored nation, weighted mean, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
4,TM.TAX.TCOM.BC.ZS,"Binding coverage, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
5,TM.TAX.MANF.BC.ZS,"Binding coverage, manufactured products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
6,TM.TAX.MRCH.BC.ZS,"Binding coverage, all products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
7,TM.TAX.MANF.WM.FN.ZS,"Tariff rate, most favored nation, weighted mean, manufactured products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
8,TM.TAX.MRCH.WM.FN.ZS,"Tariff rate, most favored nation, weighted mean, all products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
9,TM.TAX.TCOM.SM.FN.ZS,"Tariff rate, most favored nation, simple mean, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)


# As we can see here both subtopic1 and 2 will have tarrif as value after further anlysing the best way is to change the values of subtopic1 as the wdi indicates  TM => imports and TX => Exports

In [167]:
trade

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,TM.TAX.TCOM.BR.ZS,"Bound rate, simple mean, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
1,TM.TAX.MANF.IP.ZS,Share of tariff lines with international peaks...,Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
2,TM.TAX.MRCH.BR.ZS,"Bound rate, simple mean, all products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
3,TM.TAX.TCOM.WM.FN.ZS,"Tariff rate, most favored nation, weighted mea...",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
4,TM.TAX.TCOM.BC.ZS,"Binding coverage, primary products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)
5,TM.TAX.MANF.BC.ZS,"Binding coverage, manufactured products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
6,TM.TAX.MRCH.BC.ZS,"Binding coverage, all products (%)",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
7,TM.TAX.MANF.WM.FN.ZS,"Tariff rate, most favored nation, weighted mea...",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Manufacturing
8,TM.TAX.MRCH.WM.FN.ZS,"Tariff rate, most favored nation, weighted mea...",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Goods (merchandise)
9,TM.TAX.TCOM.SM.FN.ZS,"Tariff rate, most favored nation, simple mean,...",Trade,Trade,Tariffs,Tax revenue (or tariff) (or tax related),Primary products (commodities)


In [168]:
trade.loc[trade['Code'].str.startswith('TM') , "SubTopic1"] = "Imports"
trade.loc[trade['Code'].str.startswith('TX') , "SubTopic1"] = "Exports"
trade.loc[trade['Code'].str.startswith('TT') , "SubTopic1"] = "Terms of trade"


trade.loc[trade['Code'].str.startswith('TM.TAX') , "SubTopic2"] = "Tariff"


In [169]:
with pd.option_context('display.max_colwidth', 200):
    display(trade[trade['SubTopic3'].str.contains(r'\(or', na=False) ])


Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3


## Save to Excel

In [171]:

with pd.ExcelWriter('../data/hierarchy/hierarchy_code_by_sheet.xlsx') as writer:
    for sheet_name, df in dfs.items():
        topic_value = df['Topic'].iloc[0] 
        df = df.sort_values(['SubTopic1', 'SubTopic2', 'SubTopic3'])
        df.to_excel(writer, sheet_name=str(topic_value), index=False)



### Combine into one

In [172]:

combined_df = pd.concat(dfs.values(), ignore_index=True)
combined_df = combined_df.sort_values(['Topic','SubTopic1', 'SubTopic2', 'SubTopic3'])
combined_df.to_excel('../data/hierarchy/hierarchy_code.xlsx', index=False)

In [173]:
combined_df

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3
0,BN.KAC.EOMS.CD,"Net errors and omissions (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account,Errors and omissions
1,BM.KLT.DINV.CD.WD,"Foreign direct investment, net outflows (BoP, ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment
2,BM.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net outflows (% of ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment
3,BX.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net inflows (% of GDP)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment
4,BN.KLT.DINV.CD,"Foreign direct investment, net (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment
...,...,...,...,...,...,...,...
1482,TM.TAX.TCOM.SM.AR.ZS,"Tariff rate, applied, simple mean, primary pro...",Trade,Trade,Imports,Tariff,Primary products (commodities)
1486,TM.TAX.TCOM.SR.ZS,"Share of tariff lines with specific rates, pri...",Trade,Trade,Imports,Tariff,Primary products (commodities)
1489,TM.TAX.TCOM.IP.ZS,Share of tariff lines with international peaks...,Trade,Trade,Imports,Tariff,Primary products (commodities)
1493,TM.VAL.MRCH.XD.WD,Import value index (2015 = 100),Trade,Trade,Imports,Value,Goods (merchandise)


# Add the new values to the metada

In [174]:
metadata_code = pd.read_excel('../data/metadata/metadata_topic.xlsx', sheet_name=0)
metadata_code

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,Short definition,Long definition,License Type,...,Statistical concept and methodology,Development relevance,Limitations and exceptions,General comments,Other notes,Notes from original source,Related source links,Other web links,Related indicators,License URL
0,BX.KLT.DINV.CD.WD,"Foreign direct investment, net inflows (BoP, c...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital & financial account,,,Foreign direct investment refers to direct inv...,CC BY-4.0,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
1,BN.KLT.DINV.CD,"Foreign direct investment, net (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital & financial account,,,Foreign direct investment are the net inflows ...,CC BY-4.0,...,,,,Note: Data are based on the sixth edition of t...,,,,,,https://datacatalog.worldbank.org/public-licen...
2,BX.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net inflows (% of GDP)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital & financial account,,,Foreign direct investment are the net inflows ...,CC BY-4.0,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
3,BM.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net outflows (% of ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital & financial account,,,Foreign direct investment refers to direct inv...,CC BY-4.0,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
4,BM.KLT.DINV.CD.WD,"Foreign direct investment, net outflows (BoP, ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital & financial account,,,Foreign direct investment refers to direct inv...,CC BY-4.0,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,TT.PRI.MRCH.XD.WD,Net barter terms of trade index (2015 = 100),Trade,Trade,Trade indexes,,,Net barter terms of trade index is calculated ...,Net barter terms of trade index is calculated ...,CC BY-4.0,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...
1492,TX.VAL.MRCH.XD.WD,Export value index (2015 = 100),Trade,Trade,Trade indexes,,,Export values are the current value of exports...,Export values are the current value of exports...,CC BY-4.0,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...
1493,TX.QTY.MRCH.XD.WD,Export volume index (2015 = 100),Trade,Trade,Trade indexes,,,Export volume indexes are derived from UNCTAD'...,Export volume indexes are derived from UNCTAD'...,CC BY-4.0,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...
1494,TM.VAL.MRCH.XD.WD,Import value index (2015 = 100),Trade,Trade,Trade indexes,,,Import value indexes are the current value of ...,Import value indexes are the current value of ...,CC BY-4.0,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...


In [175]:
# We can use the function update to replace the valeus of the topics data as they have the same column names
metadata_code.update(combined_df)

metadata_code

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,Short definition,Long definition,License Type,...,Statistical concept and methodology,Development relevance,Limitations and exceptions,General comments,Other notes,Notes from original source,Related source links,Other web links,Related indicators,License URL
0,BN.KAC.EOMS.CD,"Net errors and omissions (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account,Errors and omissions,,Foreign direct investment refers to direct inv...,CC BY-4.0,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
1,BM.KLT.DINV.CD.WD,"Foreign direct investment, net outflows (BoP, ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,,Foreign direct investment are the net inflows ...,CC BY-4.0,...,,,,Note: Data are based on the sixth edition of t...,,,,,,https://datacatalog.worldbank.org/public-licen...
2,BM.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net outflows (% of ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,,Foreign direct investment are the net inflows ...,CC BY-4.0,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
3,BX.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net inflows (% of GDP)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,,Foreign direct investment refers to direct inv...,CC BY-4.0,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
4,BN.KLT.DINV.CD,"Foreign direct investment, net (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,,Foreign direct investment refers to direct inv...,CC BY-4.0,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,TT.PRI.MRCH.XD.WD,Net barter terms of trade index (2015 = 100),Trade,Trade,Terms of trade,Price,Goods (merchandise),Net barter terms of trade index is calculated ...,Net barter terms of trade index is calculated ...,CC BY-4.0,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...
1492,TX.QTY.MRCH.XD.WD,Export volume index (2015 = 100),Trade,Trade,Exports,Quantity (volume),Goods (merchandise),Export values are the current value of exports...,Export values are the current value of exports...,CC BY-4.0,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...
1493,TM.VAL.MRCH.XD.WD,Import value index (2015 = 100),Trade,Trade,Imports,Value,Goods (merchandise),Export volume indexes are derived from UNCTAD'...,Export volume indexes are derived from UNCTAD'...,CC BY-4.0,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...
1494,TM.QTY.MRCH.XD.WD,Import volume index (2015 = 100),Trade,Trade,Imports,Quantity (volume),Goods (merchandise),Import value indexes are the current value of ...,Import value indexes are the current value of ...,CC BY-4.0,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...


In [176]:
summary(metadata_code).style.background_gradient(cmap='Blues')

data shape: (1496, 27)


Unnamed: 0,data type,#missing,%missing,#unique
Code,object,0,0.0,1496
Indicator Name,object,0,0.0,1496
General Topic,object,0,0.0,87
Topic,object,0,0.0,12
SubTopic1,object,0,0.0,60
SubTopic2,object,0,0.0,178
SubTopic3,object,0,0.0,515
Short definition,object,1289,0.861631,176
Long definition,object,0,0.0,1130
License Type,object,2,0.001337,5


In [177]:
metadata_code = metadata_code.sort_values(['Topic','SubTopic1', 'SubTopic2', 'SubTopic3'])
with pd.ExcelWriter('../data/metadata/metadata_code.xlsx') as writer:
    metadata_code.to_excel(writer, index=False)

## Orgonize into different sheets for metadata

In [178]:
metadata_code = metadata_code.groupby('Topic')

In [179]:
with pd.ExcelWriter('../data/metadata/metadata_code_by_sheet.xlsx') as writer:
    for topic, group in metadata_code:
        sheet_name = str(topic)[:50] 
        group = group.sort_values(['SubTopic1', 'SubTopic2', 'SubTopic3'])
        group.to_excel(writer, sheet_name=sheet_name, index=False)