In [1]:
import numpy as np 
import pandas as pd
import re

# Extension Processing and Categorization in WDI Indicator Data

## Objective
The goal of this notebook is to process and categorize the **extensions** found in the WDI (World Development Indicators) dataset. Extensions provide additional details such as **gender, demographic groups, education level, and geographic location**. By extracting and structuring these extensions into separate columns ( or jsut one column), we aim to enhance the hierarchical organization of the data for better analysis and visualization.

## Challenges Encountered when analysing the data 
1. **Missing Extensions**: Some indicators in the WDI dataset lack extensions, making it difficult to determine additional attributes for certain data points.  
2. **Ambiguous Extensions**: Certain extensions, such as **"FE"**, do not always correspond to a single category (e.g., "FE" may indicate **Female** in some cases but represent something else in other contexts). This requires additional context or mapping strategies to ensure accurate classification.  




In [2]:
# codes_dict = {
#     "Gender": ["FE", "MA", "FM", "MF", "WF", "WH"],
#     "Age": ["14", "YG", "OL", "C3"],
#     "Degree": ["BA", "DO", "MS"],
#     "Geography": ["R1", "R2", "R3", "R4", "R5", "R6", "RU", "UR"],
# }
# ignore_codes = {code for sublist in codes_dict.values() for code in sublist}

In [3]:
def clean_text(text):
    if isinstance(text, str): 
        return re.sub(r'\s+', ' ', text).strip()
    return text

In [4]:
def summary(df):
    print(f'data shape: {df.shape}')  
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values 
    summ['%missing'] = df.isnull().sum().values / len(df)
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
#     summ['min'] = desc['min'].values
#     summ['max'] = desc['max'].values
    return summ

In [5]:
def extract_relevant_parts(code):
    parts = code.split(".")[2:]
    
#     relevant_parts = [p for p in parts if (len(p) == 2) and (p not in ignore_codes) and (p != 'ZS')]
    relevant_parts = [p for p in parts if (len(p) == 2) and  (p != 'ZS')]

    return relevant_parts

In [6]:
def get_extensions_and_unmatched(code):
    extracted_parts = extract_relevant_parts(code)
    matched = []
    unmatched = []

    for part in extracted_parts:
        if part in extensions_dict:
            matched.append(extensions_dict[part]) 
        else:
            unmatched.append(part) 

    return matched, unmatched

In [7]:
hierarchy_ext = pd.read_excel('../data/hierarchy/hierarchy_code.xlsx', sheet_name=0)
wdi_codes = pd.read_excel('../data/WDI_Indicators.xlsx', sheet_name="Coding")

In [8]:
extensions_dict = dict(zip(wdi_codes["Extensions"], wdi_codes["Extensions description"]))

In [9]:
hierarchy_ext["extensions"], hierarchy_ext["unmatchedCodes"] = zip(*hierarchy_ext["Code"].apply(get_extensions_and_unmatched))

In [10]:
hierarchy_ext

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,extensions,unmatchedCodes
0,BN.KAC.EOMS.CD,"Net errors and omissions (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account,Errors and omissions,[Current US$],[]
1,BM.KLT.DINV.CD.WD,"Foreign direct investment, net outflows (BoP, ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,"[Current US$, World Development Indicators]",[]
2,BM.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net outflows (% of ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,"[World Development Indicators, GDP]",[]
3,BX.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net inflows (% of GDP)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,"[World Development Indicators, GDP]",[]
4,BN.KLT.DINV.CD,"Foreign direct investment, net (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,[Current US$],[]
...,...,...,...,...,...,...,...,...,...
1491,TM.TAX.TCOM.SM.AR.ZS,"Tariff rate, applied, simple mean, primary pro...",Trade,Trade,Imports,Tariff,Primary products (commodities),"[Simple mean, Applied rate]",[]
1492,TM.TAX.TCOM.SR.ZS,"Share of tariff lines with specific rates, pri...",Trade,Trade,Imports,Tariff,Primary products (commodities),[Specific rates],[]
1493,TM.TAX.TCOM.IP.ZS,Share of tariff lines with international peaks...,Trade,Trade,Imports,Tariff,Primary products (commodities),[International peaks (or intimate partner)],[]
1494,TM.VAL.MRCH.XD.WD,Import value index (2015 = 100),Trade,Trade,Imports,Value,Goods (merchandise),"[Index, World Development Indicators]",[]


In [11]:
filtered_rows =  hierarchy_ext[hierarchy_ext["unmatchedCodes"].apply(lambda x: len(x) > 0)]
with pd.option_context('display.max_colwidth', 200):
    display(filtered_rows)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,extensions,unmatchedCodes
425,SE.LPV.PRIM.SD.FE,Female primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,Learning poverty,Primary education,[Female],[SD]
427,SE.LPV.PRIM.SD,Primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,Learning poverty,Primary education,[],[SD]
429,SE.LPV.PRIM.SD.MA,Male primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,Learning poverty,Primary education,[Male],[SD]
586,EN.GHG.ALL.LU.MT.CE.AR5,Total greenhouse gas emissions including LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent]",[LU]
588,EN.GHG.CO2.BU.MT.CE.AR5,Carbon dioxide (CO2) emissions from Building (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Buildings,"[Metric tons, CO2 equivalent]",[BU]
595,EN.GHG.CO2.LU.DF.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Deforestation (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent]","[LU, DF]"
596,EN.GHG.CO2.LU.FL.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Forest Land (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent]","[LU, FL]"
597,EN.GHG.CO2.LU.OS.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Organic Soil (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent]","[LU, OS]"
598,EN.GHG.CO2.LU.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Total excluding non-tropical fires (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent]",[LU]
599,EN.GHG.CO2.LU.OL.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Other Land (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Old (population), Metric tons, CO2 equivalent]",[LU]


In [12]:
unmatched_dict = {
    "FL": "Forest Land",  
    "DF": "Deforestation",  
    "OL": "Organic Soil",
    "40": "Poorest 40%",
    "60": "Richest 60%",
    "CH" : "Current Health Expenditure",
    "SO" : "Secondary education or more",
    "ME" : "Modeled Estimate",
    "UB" : "Upper bound",
    "LB" : "Lower bound"
    
}

In [13]:
def append_unmatched_descriptions(row):
    # Get unmatched codes from the row
    unmatched = row["unmatchedCodes"]
    matched_extensions = row["extensions"]

    # Retrieve the descriptions for unmatched codes from the unmatched dictionary
    additional_descriptions = [unmatched_dict[code] for code in unmatched if code in unmatched_dict]

    # Append those descriptions to the matched extensions list
    matched_extensions.extend(additional_descriptions)

    # Return the updated list of descriptions
    return matched_extensions

In [14]:
hierarchy_ext["extensions"] = hierarchy_ext.apply(append_unmatched_descriptions, axis=1)


In [15]:
filtered_rows =  hierarchy_ext[hierarchy_ext["unmatchedCodes"].apply(lambda x: len(x) > 0)]
with pd.option_context('display.max_colwidth', 200):
    display(filtered_rows)

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,extensions,unmatchedCodes
425,SE.LPV.PRIM.SD.FE,Female primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,Learning poverty,Primary education,[Female],[SD]
427,SE.LPV.PRIM.SD,Primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,Learning poverty,Primary education,[],[SD]
429,SE.LPV.PRIM.SD.MA,Male primary school age children out-of-school (%),Education: Outcomes,Education,Outcomes,Learning poverty,Primary education,[Male],[SD]
586,EN.GHG.ALL.LU.MT.CE.AR5,Total greenhouse gas emissions including LULUCF (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,All Greenhouse Gases,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent]",[LU]
588,EN.GHG.CO2.BU.MT.CE.AR5,Carbon dioxide (CO2) emissions from Building (Energy) (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Buildings,"[Metric tons, CO2 equivalent]",[BU]
595,EN.GHG.CO2.LU.DF.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Deforestation (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent, Deforestation]","[LU, DF]"
596,EN.GHG.CO2.LU.FL.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Forest Land (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent, Forest Land]","[LU, FL]"
597,EN.GHG.CO2.LU.OS.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Organic Soil (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent]","[LU, OS]"
598,EN.GHG.CO2.LU.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Total excluding non-tropical fires (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Metric tons, CO2 equivalent]",[LU]
599,EN.GHG.CO2.LU.OL.MT.CE.AR5,Carbon dioxide (CO2) net fluxes from LULUCF - Other Land (Mt CO2e),Environment: Greenhouse Gas Emissions,Environment,Greenhouse Gas Emissions,Carbon dioxide emissions,Including Land Use and Land-Use Change (LULUC),"[Old (population), Metric tons, CO2 equivalent]",[LU]


In [16]:
hierarchy_ext.drop(columns="unmatchedCodes", inplace=True)

## Handling Gender-Specific Codes and Adding "Total" to Extensions  

In the dataset, some codes contain gender identifiers:  

- `.FE.` or `.FE` → Female  
- `.MA.` or `.MA` → Male  

If both Male and Female versions exist, `"Total"` should be added to the **base code** (the version without `.MA` or `.FE`).  

### **Logic of the Python Code**  
1. Identify codes with **".MA."**, **".FE."**, or ending with **".MA"** / **".FE"**.  
2. Remove these parts to get the **base code**.  
3. If the base code exists in the dataset and **"Total"** is not already in its extensions, add it.  

### **Example**  

| Code                      | Extensions |
|---------------------------|------------|
| SH.STA.BASS.UR.FE.ZS      | Female     |
| SH.STA.BASS.UR.MA.ZS      | Male       |
| SH.STA.BASS.UR.ZS         | Total      |

This ensures the dataset properly reflects **Male, Female, and Total** categories.  


In [17]:
def add_total_if_both_exist(df):
    codes_set = set(df["Code"])  

    for code in df["Code"]:
        if ".MA." in code or ".FE." in code or code.endswith(".MA") or code.endswith(".FE"):
            if ".ZS" in code:
                base_code = code.replace(".ZS", ".")
                    
            base_code = code.replace(".MA.", ".").replace(".FE.", ".")  
            base_code = base_code.rstrip(".MA").rstrip(".FE") 

            if base_code in codes_set:
                base_idx = df[df["Code"] == base_code].index[0]

                if "Total" not in df.at[base_idx, "extensions"]:
                    df.at[base_idx, "extensions"].append("Total")

In [18]:
add_total_if_both_exist(hierarchy_ext)

In [19]:
with pd.ExcelWriter('../data/hierarchy/hierarchy_extension.xlsx') as writer:
    hierarchy_ext.to_excel(writer, index=False)
    

## Orgonize into different sheets for hierarchy

In [20]:
hierarchy_ext_grouped = hierarchy_ext.groupby('Topic')

In [21]:
with pd.ExcelWriter('../data/hierarchy/hierarchy_extension_by_sheet.xlsx') as writer:
    for topic, group in hierarchy_ext_grouped:
        group.to_excel(writer, sheet_name=str(topic), index=False)

# Add new values to metadata

In [22]:
metadata_ext = pd.read_excel('../data/metadata/metadata_code.xlsx', sheet_name=0)

In [23]:
col_index = metadata_ext.columns.get_loc("SubTopic3") + 1

metadata_ext.insert(col_index, "extensions", hierarchy_ext["extensions"])

In [24]:
metadata_ext

Unnamed: 0,Code,Indicator Name,General Topic,Topic,SubTopic1,SubTopic2,SubTopic3,extensions,Short definition,Long definition,...,Statistical concept and methodology,Development relevance,Limitations and exceptions,General comments,Other notes,Notes from original source,Related source links,Other web links,Related indicators,License URL
0,BN.KAC.EOMS.CD,"Net errors and omissions (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account,Errors and omissions,[Current US$],,Foreign direct investment refers to direct inv...,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
1,BM.KLT.DINV.CD.WD,"Foreign direct investment, net outflows (BoP, ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,"[Current US$, World Development Indicators]",,Foreign direct investment are the net inflows ...,...,,,,Note: Data are based on the sixth edition of t...,,,,,,https://datacatalog.worldbank.org/public-licen...
2,BM.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net outflows (% of ...",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,"[World Development Indicators, GDP]",,Foreign direct investment are the net inflows ...,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
3,BX.KLT.DINV.WD.GD.ZS,"Foreign direct investment, net inflows (% of GDP)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,"[World Development Indicators, GDP]",,Foreign direct investment refers to direct inv...,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
4,BN.KLT.DINV.CD,"Foreign direct investment, net (BoP, current US$)",Economic Policy & Debt: Balance of payments: C...,Economic Policy & Debt,Balance of payments,Capital account: long term capital,Direct investment,[Current US$],,Foreign direct investment refers to direct inv...,...,Data on equity flows are based on balance of p...,Private financial flows - equity and debt - ac...,FDI data do not give a complete picture of int...,Note: Data starting from 2005 are based on the...,,,,,,https://datacatalog.worldbank.org/public-licen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,TM.TAX.TCOM.SM.AR.ZS,"Tariff rate, applied, simple mean, primary pro...",Trade,Trade,Imports,Tariff,Primary products (commodities),"[Simple mean, Applied rate]",Weighted mean applied tariff is the average of...,Weighted mean applied tariff is the average of...,...,,,,The tariff data for the European Union (EU) ap...,,,,,,https://datacatalog.worldbank.org/public-licen...
1492,TM.TAX.TCOM.SR.ZS,"Share of tariff lines with specific rates, pri...",Trade,Trade,Imports,Tariff,Primary products (commodities),[Specific rates],Simple mean most favored nation tariff rate is...,Simple mean most favored nation tariff rate is...,...,,,,The tariff data for the European Union (EU) ap...,,,,,,https://datacatalog.worldbank.org/public-licen...
1493,TM.TAX.TCOM.IP.ZS,Share of tariff lines with international peaks...,Trade,Trade,Imports,Tariff,Primary products (commodities),[International peaks (or intimate partner)],Weighted mean most favored nations tariff is t...,Weighted mean most favored nations tariff is t...,...,,,,The tariff data for the European Union (EU) ap...,,,,,,https://datacatalog.worldbank.org/public-licen...
1494,TM.VAL.MRCH.XD.WD,Import value index (2015 = 100),Trade,Trade,Imports,Value,Goods (merchandise),"[Index, World Development Indicators]",Export volume indexes are derived from UNCTAD'...,Export volume indexes are derived from UNCTAD'...,...,,,,,,,,,,https://datacatalog.worldbank.org/public-licen...


In [25]:
with pd.ExcelWriter('../data/metadata/metadata_extension.xlsx') as writer:
    metadata_ext.to_excel(writer, index=False)

In [26]:
metadata_ext_grouped = metadata_ext.groupby('Topic')

In [27]:
with pd.ExcelWriter('../data/metadata/metadata_extension_by_sheet.xlsx') as writer:
    for topic, group in metadata_ext_grouped:
        sheet_name = str(topic)[:50] 
        group = group.sort_values(['SubTopic1', 'SubTopic2', 'SubTopic3'])
        group.to_excel(writer, sheet_name=sheet_name, index=False)