# Merged Dataset

In [5]:
import os
import pandas as pd
import csv
import re

In [6]:
INPUT_DIR = '../input_data/IFs'
OUTPUT_DIR = '../input_data/IFs_cleaned'
final_columns = ['indicator','year','country','unit','value_name','jmp_category','commitment','value']

In [7]:
files_to_keep = [
    "01. Deaths by Category of Cause - Millions (2nd Dimensions = Diarrhea).csv",
    "11. Governance Effectiveness - WB index.csv",
    "12. Value Added by Sector, Currency - Billion dollars.csv",
    "13. Sanitation Services, Access, percent of population (2nd Dimensions = Basic + Safely Managed).csv",
    "14. Sanitation Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv",
    "15. Sanitation Services, Expenditure, Capital, Billion $ (2nd Dimensions = Basic + Safely Managed).csv",
    "16. Sanitation Services, Expenditure, Maintenance, Billion $ (2nd Dimensions = Basic + Safely Managed).csv",
    "17. Water Services, Access, percent of population (2nd Dimension = Basic + Safely Managed).csv",
    "18. Water Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv",
    "19. Water Services, Expenditure, Capital, Billion $ (2nd Dimensions = Basic + Safely Managed).csv",
    "20. Water Services, Expenditure, Maintenance, Billion $ (2nd Dimensions = Basic + Safely Managed).csv",
    "21. Population - Millions.csv",
    "23. GDP (PPP) - Billion dollars.csv",
    "24. Stunted children, History and Forecast - Million.csv",
    "25. Population under 5 Years, Headcount - Millions.csv",
    "26. Malnourished Children, Headcount - Millions.csv"
]

In [8]:
files = [
    f"{INPUT_DIR}/{f}" for f in os.listdir(INPUT_DIR)
    if os.path.isfile(os.path.join(INPUT_DIR, f))
]

In [9]:
files = [f"{INPUT_DIR}/{file}" for file in files_to_keep]

In [10]:
def get_ifs_name(source):
    return re.sub(r'^\d+\. ', '', source.replace(f"{INPUT_DIR}/", "")).replace(".csv", "")

In [11]:
def cleanup_semicolon(source):
    with open(source, 'r') as file:
        content = file.read()
    updated_content = content.replace(';', '')
    with open(source, 'w') as file:
        file.write(updated_content)

In [12]:
def cleanup_data(dataframe):
    dataframe['commitment'] = dataframe['commitment'].apply(lambda x: str(x).split(".")[0] if x else None)
    dataframe['unit'] = dataframe['unit'].apply(lambda x: x.replace("2017","") if x else None)
    dataframe['value_name'] = dataframe['value_name'].apply(lambda x: str(x).split(".")[0] if x else None)

In [13]:
combined_df = pd.DataFrame(columns=final_columns)
for file in files:
    print(f"Process {file}")
    cleanup_semicolon(file)
    data = pd.read_csv(file, header=[1,4,5], sep=',')
    new_columns = list(data.columns)
    for i, col in enumerate(new_columns):
        if col == ('Unnamed: 0_level_0', 'Unnamed: 0_level_1', 'Unnamed: 0_level_2'):
            new_columns[i] = 'Year'
    data.columns = new_columns
    df = pd.DataFrame(data.to_dict('records'))
    df_melted = df.melt(id_vars=['Year'], var_name='variable', value_name='value')
    new_data = []
    for value_list in df_melted.to_dict('records'):
        new_data.append({
            "year": value_list["Year"],
            "country": value_list["variable"][0],
            "unit": value_list["variable"][1],
            "value_type": list(filter(lambda v:v,value_list["variable"][2].split("_"))),
            "value": value_list["value"]
        })
    df = pd.DataFrame(new_data)
    df_split = pd.DataFrame(df['value_type'].tolist(), index=df.index)
    df_split.columns = ['value_name', 'jmp_category', 'commitment','other']
    df_final = pd.concat([df, df_split], axis=1)
    df_final['indicator'] = get_ifs_name(file)
    df_final = df_final[final_columns]
    combined_df = pd.concat([combined_df, df_final], ignore_index=True)

Process ../input_data/IFs/01. Deaths by Category of Cause - Millions (2nd Dimensions = Diarrhea).csv


  combined_df = pd.concat([combined_df, df_final], ignore_index=True)


Process ../input_data/IFs/11. Governance Effectiveness - WB index.csv
Process ../input_data/IFs/12. Value Added by Sector, Currency - Billion dollars.csv
Process ../input_data/IFs/13. Sanitation Services, Access, percent of population (2nd Dimensions = Basic + Safely Managed).csv
Process ../input_data/IFs/14. Sanitation Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv
Process ../input_data/IFs/15. Sanitation Services, Expenditure, Capital, Billion $ (2nd Dimensions = Basic + Safely Managed).csv
Process ../input_data/IFs/16. Sanitation Services, Expenditure, Maintenance, Billion $ (2nd Dimensions = Basic + Safely Managed).csv
Process ../input_data/IFs/17. Water Services, Access, percent of population (2nd Dimension = Basic + Safely Managed).csv
Process ../input_data/IFs/18. Water Services, Access, Number of people, million (2nd Dimensions = Basic + Safely Managed).csv
Process ../input_data/IFs/19. Water Services, Expenditure, Capital, Billion $ (

In [14]:
cleanup_data(combined_df)

In [15]:
combined_df.to_csv("./testing.csv",index=False)

# Table of Keys

In [16]:
def create_table_key(dataframe, column):
    table = pd.DataFrame(dataframe[column].unique(), columns=[column]).dropna().sort_values(column)
    table = table.reset_index()
    table['id'] = table.index + 1
    return table[['id',column]]

## 1. Indicators

In [17]:
indicator_table = create_table_key(combined_df, 'indicator')
indicator_table

Unnamed: 0,id,indicator
0,1,Deaths by Category of Cause - Millions (2nd Di...
1,2,GDP (PPP) - Billion dollars
2,3,Governance Effectiveness - WB index
3,4,"Malnourished Children, Headcount - Millions"
4,5,Population - Millions
5,6,"Population under 5 Years, Headcount - Millions"
6,7,"Sanitation Services, Access, Number of people,..."
7,8,"Sanitation Services, Access, percent of popula..."
8,9,"Sanitation Services, Expenditure, Capital, Bil..."
9,10,"Sanitation Services, Expenditure, Maintenance,..."


## 2. Units

In [18]:
units_table = create_table_key(combined_df, 'unit')
units_table

Unnamed: 0,id,unit
0,1,Billion $
1,2,Index 0-5
2,3,Mil People
3,4,Million
4,5,Percent
5,6,Trillion $


## 3. Value Names

In [19]:
value_names_table = create_table_key(combined_df, 'value_name')
value_names_table

Unnamed: 0,id,value_name
0,1,Base
1,2,FS
2,3,FW
3,4,FWS
4,5,SI
5,6,WI
6,7,WSI


## 4. JMP Categories

In [20]:
jmp_categories_table = create_table_key(combined_df, 'jmp_category')
jmp_categories_table

Unnamed: 0,id,jmp_category
0,1,ALB
1,2,BS
2,3,SM


## 5. Commitments

In [21]:
commitments_table = create_table_key(combined_df, 'commitment')
commitments_table

Unnamed: 0,id,commitment
0,1,0
1,2,2030
2,3,2050
3,4,2x
4,5,4x
5,6,6x


## 6. Countries

In [22]:
countries_table = create_table_key(combined_df, 'country')
countries_table

Unnamed: 0,id,country
0,1,All countries WHHS Tool1
1,2,Congo Dem. Republic of the
2,3,Ethiopia
3,4,Ghana
4,5,Guatemala
5,6,Haiti
6,7,India
7,8,Indonesia
8,9,Kenya
9,10,Liberia


# Table Results

In [37]:
def merge_id(prev_table, keys_table, name):
    merged_df = prev_table.merge(keys_table, left_on=name, right_on=name, how='left')
    merged_df = merged_df.rename(columns={'id': f'{name}_id'})
    merged_df = merged_df.drop(columns=[name])
    return merged_df

In [38]:
table_with_id = merge_id(combined_df, indicator_table, 'indicator')
table_with_id = merge_id(table_with_id, units_table, 'unit')
table_with_id = merge_id(table_with_id, value_names_table, 'value_name')
table_with_id = merge_id(table_with_id, jmp_categories_table, 'jmp_category')
table_with_id = merge_id(table_with_id, commitments_table, 'commitment')
table_with_id = merge_id(table_with_id, countries_table, 'country')

In [40]:
table_with_id.sort_values('year')

Unnamed: 0,year,value,indicator_id,unit_id,value_name_id,jmp_category_id,commitment_id,country_id
1544523,1960,,11,4,3,1.0,3.0,16
1528167,1960,,11,4,2,1.0,2.0,13
1564968,1960,,11,4,2,3.0,2.0,20
1495173,1960,,11,4,6,3.0,4.0,6
1493340,1960,,11,4,5,2.0,1.0,6
...,...,...,...,...,...,...,...,...
426317,2100,67.32,7,4,5,3.0,4.0,13
939473,2100,0.013,13,4,5,2.0,4.0,6
1608290,2100,11.92,6,3,5,3.0,5.0,8
939965,2100,0.0,13,4,3,3.0,3.0,6
