# Import Necessary Packages

In [2]:
import pandas as pd
import numpy as np
import warnings
import copy as cp

from datetime import datetime, timedelta

warnings.filterwarnings("ignore")

# Dimensions

## Country

In [23]:
country_list = ["Bangladesh", "Canada", "Chad", "Guinea", "Indonesia", "Mexico", "Philippines", "Togo", "United States"]

country_dimension_df = pd.DataFrame(
    columns=["Country_key", "Surrogate_year", "Name", "Region", "Continent", "Currency", "Capital", "Total_population","Population_growth_percent", "Total_labour_force", "Birth_rate", "Death_rate",  "GNI_per_capita", "Income_group"]
)

dict_capital = {
    'Bangladesh': 'Dhaka',
    'Canada': 'Ottawa',
    'Chad': 'N\'Djamena',
    'Guinea': 'Conakry',
    'Indonesia': 'Jakarta',
    'Mexico': 'Mexico City',
    'Philippines': 'Manila',
    'Togo': 'Lomé',
    'United States': 'Washington, D.C.'
}

dict_continent = {
    'Bangladesh': 'Asia',
    'Canada': 'North America',
    'Chad': 'Africa',
    'Guinea': 'Africa',
    'Indonesia': 'Asia',
    'Mexico': 'North America',
    'Philippines': 'Asia',
    'Togo': 'Africa',
    'United States': 'North America'
}

# CSV file of HNP country data
raw_geo_data = pd.read_csv('raw_data/HNP_CountryData.csv')
# CSV file of HNP country dimension statistics
raw_att_data = pd.read_csv('raw_data/HNP_CountryAttributes.csv')
raw_att_data = raw_att_data.drop(
    ['Series Code', 'Country Code'], axis=1
)
raw_att_data = raw_att_data.rename(
    columns={
        'Series Name': 'Series',
        'Country Name': 'Country',
        '2005 [YR2005]': 'y2005',
        '2006 [YR2006]': 'y2006',
        '2007 [YR2007]': 'y2007',
        '2008 [YR2008]': 'y2008',
        '2009 [YR2009]': 'y2009',
        '2010 [YR2010]': 'y2010',
        '2011 [YR2011]': 'y2011',
        '2012 [YR2012]': 'y2012',
        '2013 [YR2013]': 'y2013',
        '2014 [YR2014]': 'y2014',
        '2015 [YR2015]': 'y2015',
        '2016 [YR2016]': 'y2016',
        '2017 [YR2017]': 'y2017',
        '2018 [YR2018]': 'y2018',
        '2019 [YR2019]': 'y2019',
        '2020 [YR2020]': 'y2020'
    }
)

x = 1000
for country in country_list:
    country_row = {}
    # Add the country attributes that don't change
    country_row['Name'] = country
    country_row['Region'] = raw_geo_data.loc[raw_geo_data['Table Name'] == country, 'Region'].values[0]
    country_row['Continent'] = dict_continent[country]
    country_row['Currency'] = raw_geo_data.loc[raw_geo_data['Table Name'] == country, 'Currency Unit'].values[0]
    country_row['Capital'] = dict_capital[country]
    country_row['Income_group'] = raw_geo_data.loc[raw_geo_data['Table Name'] == country, 'Income Group'].values[0]

    specific_row = cp.deepcopy(country_row)
    # Add the country attributes that change every year
    for year in range(2005,2020):
        specific_row['Country_key'] = x
        x = x + 1
        specific_row['Surrogate_year'] = year
        specific_row['Total_population'] = raw_att_data.loc[((raw_att_data.Series == 'Population, total') & (raw_att_data.Country == country)), 'y' + str(year)].values[0]
        specific_row['Population_growth_percent'] = raw_att_data.loc[((raw_att_data.Series == 'Population growth (annual %)') & (raw_att_data.Country == country)), 'y' + str(year)].values[0]
        specific_row['Total_labour_force'] = raw_att_data.loc[((raw_att_data.Series == 'Labor force, total') & (raw_att_data.Country == country)), 'y' + str(year)].values[0]
        specific_row['Birth_rate'] = raw_att_data.loc[((raw_att_data.Series == 'Birth rate, crude (per 1,000 people)') & (raw_att_data.Country == country)), 'y' + str(year)].values[0]
        specific_row['Death_rate'] = raw_att_data.loc[((raw_att_data.Series == 'Death rate, crude (per 1,000 people)') & (raw_att_data.Country == country)), 'y' + str(year)].values[0]
        specific_row['GNI_per_capita'] = raw_att_data.loc[((raw_att_data.Series == 'GNI per capita, Atlas method (current US$)') & (raw_att_data.Country == country)), 'y' + str(year)].values[0]
        # Adding the row into the country dimension dataframe
        country_dimension_df = country_dimension_df.append(pd.DataFrame([specific_row]), ignore_index=True)

country_dimension_df.head(50)

Unnamed: 0,Country_key,Surrogate_year,Name,Region,Continent,Currency,Capital,Total_population,Population_growth_percent,Total_labour_force,Birth_rate,Death_rate,GNI_per_capita,Income_group
0,1000,2005,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,139035505.0,1.484747,53123178.0,24.053,6.205,550.0,Lower middle income
1,1001,2006,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,140921154.0,1.347121,54083636.0,23.384,6.092,570.0,Lower middle income
2,1002,2007,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,142660381.0,1.226631,54954286.0,22.747,5.987,610.0,Lower middle income
3,1003,2008,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,144304164.0,1.145647,55799460.0,22.153,5.892,660.0,Lower middle income
4,1004,2009,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,145924795.0,1.116806,56639145.0,21.605,5.807,730.0,Lower middle income
5,1005,2010,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,147575433.0,1.124807,57493263.0,21.107,5.734,800.0,Lower middle income
6,1006,2011,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,149273134.0,1.143829,58436990.0,20.657,5.673,890.0,Lower middle income
7,1007,2012,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,151005733.0,1.154006,59377977.0,20.245,5.625,970.0,Lower middle income
8,1008,2013,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,152761413.0,1.155951,60318372.0,19.862,5.589,1040.0,Lower middle income
9,1009,2014,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,154517385.0,1.14293,61262254.0,19.501,5.565,1110.0,Lower middle income


## Month

In [11]:
month_values = ["January","February","March","April","May","June","July","August","September","October","November","December"]
first_quarter = ["January","February","March"]
secound_quarter = ["April","May","June"]
third_quarter = ["July","August","September"]
forth_quarter = ["October","November","December"]
year_values = list(range(2005,2021))#this gives a list of values starting at 2005 to 2020
decade_values = [1,2]

#create list of dicts that represent each row
key_value = 2000
month_row_list = []
for year in year_values:
    for month in month_values:

        key_value += 1
        mount_row = {}
        mount_row["Month_Key"] = key_value
        mount_row["Month"] = month
        mount_row["Year"] = year

        if month in first_quarter:
            mount_row["Quarter"] = 1
        elif month in secound_quarter:
            mount_row["Quarter"] = 2
        elif month in third_quarter:
            mount_row["Quarter"] = 3
        else:
            mount_row["Quarter"] = 4
        
        if year <= 2010:
            mount_row["Decade"] = 1
        else:
            mount_row["Decade"] = 2

        month_row_list.append(mount_row)

month_df = pd.DataFrame(month_row_list)
print(month_df.head())

   Month_Key     Month  Year  Quarter  Decade
0       2001   January  2005        1       1
1       2002  February  2005        1       1
2       2003     March  2005        1       1
3       2004     April  2005        2       1
4       2005       May  2005        2       1


## Education

In [3]:
hnp_data = pd.read_csv("raw_data/education.csv")
hnp_data.head()

Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Africa Eastern and Southern,AFE,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..
1,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Africa Western and Central,AFW,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..
2,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Arab World,ARB,58.8121681213379,58.6921691894531,59.1082191467285,60.4463195800781,58.9698181152344,60.2376518249512,62.4384498596191,66.0942764282227,65.2387313842773,67.7367477416992,65.5670394897461,67.3514862060547,68.6543579101563,65.5303192138672,65.8576126098633,66.1968994140625
3,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Caribbean small states,CSS,91.4608535766602,91.6320877075195,92.3097915649414,92.5576705932617,92.7684631347656,92.9917678833008,93.1499099731445,93.0394897460938,92.8167495727539,92.9547271728516,93.0601425170898,..,..,..,..,..
4,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Central Europe and the Baltics,CEB,98.1644287109375,98.184440612793,98.2189102172852,98.2749710083008,98.5478668212891,98.5772323608398,98.6045684814453,98.6247100830078,98.6415405273438,98.6618728637695,98.6808471679688,98.7023162841797,..,..,..,..


In [4]:
def get_country_rows(list_country, data):
    country_idx = {}
    for country in list_country:
        country_idx[country] = []
    for idx, row in data.iterrows():
        if row.CountryName in list_country:
            country_idx[row.CountryName].append(idx)
    return country_idx

x = get_country_rows(country_list,hnp_data)
print(x)

{'Bangladesh': [64, 330, 596, 862, 1128, 1394, 1660, 1926, 2192, 2458, 2724, 2990, 3256, 3522, 3788, 4054, 4320, 4586, 4852, 5118, 5384, 5650], 'Cambodia': [82, 348, 614, 880, 1146, 1412, 1678, 1944, 2210, 2476, 2742, 3008, 3274, 3540, 3806, 4072, 4338, 4604, 4870, 5136, 5402, 5668], 'Canada': [84, 350, 616, 882, 1148, 1414, 1680, 1946, 2212, 2478, 2744, 3010, 3276, 3542, 3808, 4074, 4340, 4606, 4872, 5138, 5404, 5670], 'Chad': [87, 353, 619, 885, 1151, 1417, 1683, 1949, 2215, 2481, 2747, 3013, 3279, 3545, 3811, 4077, 4343, 4609, 4875, 5141, 5407, 5673], 'Eritrea': [110, 376, 642, 908, 1174, 1440, 1706, 1972, 2238, 2504, 2770, 3036, 3302, 3568, 3834, 4100, 4366, 4632, 4898, 5164, 5430, 5696], 'Mexico': [176, 442, 708, 974, 1240, 1506, 1772, 2038, 2304, 2570, 2836, 3102, 3368, 3634, 3900, 4166, 4432, 4698, 4964, 5230, 5496, 5762], 'Philippines': [204, 470, 736, 1002, 1268, 1534, 1800, 2066, 2332, 2598, 2864, 3130, 3396, 3662, 3928, 4194, 4460, 4726, 4992, 5258, 5524, 5790], 'South Sudan

In [6]:

education_df = pd.DataFrame(
    columns = ["education_key", "literacy_rate_percent_m_15-24","literacy_rate_percent_15-24","literacy_rate_percent_f_15-plus","literacy_rate_percent_m_15-plus","literacy_percent_15-plus","primary_completion_rate_percent_f","primary_completion_rate_percent_m","primary_completion_rate_percent","primary_enroll_rate_percent_gross","pimary_enroll_rate_percent_gross_f","pimary_enroll_rate_percent_gross_m","primary_enroll_rate_percent_net","primary_enroll_rate_percent_net_f","primary_enroll_rate_percent_net_m","secondary_enroll_rate_percent_gross","secondary_enroll_rate_percent_gross_f","secondary_enroll_rate_percent_gross_m","secondary_enroll_rate_percent_net","secondary_enroll_rate_percent_net_f","secondary_enroll_rate_percent_net_m","tertiary_enroll_rate_percent_gross","spending_education_percent_gdp"]
)
hnp_data.head()



Unnamed: 0,SeriesName,SeriesCode,CountryName,CountryCode,2005 [YR2005],2006 [YR2006],2007 [YR2007],2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020]
0,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Africa Eastern and Southern,AFE,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..
1,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Africa Western and Central,AFW,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..,..
2,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Arab World,ARB,58.8121681213379,58.6921691894531,59.1082191467285,60.4463195800781,58.9698181152344,60.2376518249512,62.4384498596191,66.0942764282227,65.2387313842773,67.7367477416992,65.5670394897461,67.3514862060547,68.6543579101563,65.5303192138672,65.8576126098633,66.1968994140625
3,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Caribbean small states,CSS,91.4608535766602,91.6320877075195,92.3097915649414,92.5576705932617,92.7684631347656,92.9917678833008,93.1499099731445,93.0394897460938,92.8167495727539,92.9547271728516,93.0601425170898,..,..,..,..,..
4,"Literacy rate, adult female (% of females ages...",SE.ADT.LITR.FE.ZS,Central Europe and the Baltics,CEB,98.1644287109375,98.184440612793,98.2189102172852,98.2749710083008,98.5478668212891,98.5772323608398,98.6045684814453,98.6247100830078,98.6415405273438,98.6618728637695,98.6808471679688,98.7023162841797,..,..,..,..


In [7]:
x = get_country_rows(country_list, hnp_data)
year_list = ["2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020"]

ed_country_dim_data = {}
for country in country_list:
    ed_country_dim_data[country]={}
    for year in year_list:
        ed_country_dim_data[country][year] = {}
counter = 1  

year_idx = {}
idx = 4
for year in year_list:
    year_idx[year] = idx
    idx=idx+1
    
def populate_dim_data(dim_data,country,column,row):
    for key, value in year_idx.items():
        dim_data[country][key][column] = row[value]


hnp_data.values.tolist()
for row in hnp_data.values.tolist():
    if row[2] in country_list:
        if row[0] == "Literacy rate, adult female (% of females ages 15 and above)":
            populate_dim_data(ed_country_dim_data,row[2],"literacy_rate_percent_f_15-plus",row)
        elif row[0] == "Literacy rate, adult male (% of males ages 15 and above)":
            populate_dim_data(ed_country_dim_data,row[2],"literacy_rate_percent_m_15-plus",row)
        elif row[0] == "Literacy rate, adult total (% of people ages 15 and above)":
            populate_dim_data(ed_country_dim_data,row[2],"literacy_percent_15-plus",row)
        elif row[0] == "Literacy rate, youth total (% of people ages 15-24)":
            populate_dim_data(ed_country_dim_data,row[2],"literacy_rate_percent_15-24",row)
        elif row[0] == "Literacy rate, youth male (% of males ages 15-24)":
            populate_dim_data(ed_country_dim_data,row[2],"literacy_rate_percent_m_15-24",row)
        elif row[0] == "Primary completion rate, female (% of relevant age group)":
            populate_dim_data(ed_country_dim_data,row[2],"primary_completion_rate_percent_f",row)
        elif row[0] == "Primary completion rate, male (% of relevant age group)":
            populate_dim_data(ed_country_dim_data,row[2],"primary_completion_rate_percent_m",row)
        elif row[0] == "Primary completion rate, total (% of relevant age group)":
            populate_dim_data(ed_country_dim_data,row[2],"primary_completion_rate_percent",row)
        elif row[0] == "School enrollment, primary (% gross)":
            populate_dim_data(ed_country_dim_data,row[2],"primary_enroll_rate_percent_gross",row)
        elif row[0] == "School enrollment, primary, female (% gross)":
            populate_dim_data(ed_country_dim_data,row[2],"pimary_enroll_rate_percent_gross_f",row)
        elif row[0] == "School enrollment, primary, male (% gross)":
            populate_dim_data(ed_country_dim_data,row[2],"pimary_enroll_rate_percent_gross_m",row)
        elif row[0] == "School enrollment, primary (% net)":
            populate_dim_data(ed_country_dim_data,row[2],"primary_enroll_rate_percent_net",row)
        elif row[0] == "School enrollment, primary, female (% net)":
            populate_dim_data(ed_country_dim_data,row[2],"primary_enroll_rate_percent_net_f",row)
        elif row[0] == "School enrollment, primary, male (% net)":
            populate_dim_data(ed_country_dim_data,row[2],"primary_enroll_rate_percent_net_m",row)
        elif row[0] == "School enrollment, secondary (% gross)":
            populate_dim_data(ed_country_dim_data,row[2],"secondary_enroll_rate_percent_gross",row)
        elif row[0] == "School enrollment, secondary, female (% gross)":
            populate_dim_data(ed_country_dim_data,row[2],"secondary_enroll_rate_percent_gross_f",row)
        elif row[0] == "School enrollment, secondary, male (% gross)":
            populate_dim_data(ed_country_dim_data,row[2],"secondary_enroll_rate_percent_gross_m",row)
        elif row[0] == "School enrollment, secondary (% net)":
            populate_dim_data(ed_country_dim_data,row[2],"secondary_enroll_rate_percent_net",row)
        elif row[0] == "School enrollment, secondary, female (% net)":
            populate_dim_data(ed_country_dim_data,row[2],"secondary_enroll_rate_percent_net_f",row)
        elif row[0] == "School enrollment, secondary, male (% net)":
            populate_dim_data(ed_country_dim_data,row[2],"secondary_enroll_rate_percent_net_m",row)
        elif row[0] == "School enrollment, tertiary (% gross)":
            populate_dim_data(ed_country_dim_data,row[2],"tertiary_enroll_rate_percent_gross",row)
        elif row[0] == "Government expenditure on education, total (% of GDP)":
            populate_dim_data(ed_country_dim_data,row[2],"spending_education_percent_gdp",row)

list_data = []
for country in country_list:
    for year in year_list:
        list_data.append([ed_country_dim_data[country][year]])
#         education_df.append(pd.DataFrame(), ignore_index=True)


for i in list_data:
    education_df = education_df.append(pd.DataFrame(i), ignore_index=True)



In [102]:
education_df.head()

Unnamed: 0,education_key,literacy_rate_percent_m_15-24,literacy_rate_percent_15-24,literacy_rate_percent_f_15-plus,literacy_rate_percent_m_15-plus,literacy_percent_15-plus,primary_completion_rate_percent_f,primary_completion_rate_percent_m,primary_completion_rate_percent,primary_enroll_rate_percent_gross,...,primary_enroll_rate_percent_net_f,primary_enroll_rate_percent_net_m,secondary_enroll_rate_percent_gross,secondary_enroll_rate_percent_gross_f,secondary_enroll_rate_percent_gross_m,secondary_enroll_rate_percent_net,secondary_enroll_rate_percent_net_f,secondary_enroll_rate_percent_net_m,tertiary_enroll_rate_percent_gross,spending_education_percent_gdp
0,,..,..,..,..,..,..,..,..,97.3645706176758,...,..,..,101.391189575195,100.234382629395,102.487060546875,..,..,..,..,
1,,..,..,..,..,..,..,..,..,98.8806304931641,...,..,..,101.728248596191,100.669502258301,102.732360839844,..,..,..,..,
2,,..,..,..,..,..,..,..,..,98.9664764404297,...,..,..,101.90348815918,100.74836730957,103.001899719238,..,..,..,63.6000595092773,
3,,..,..,..,..,..,..,..,..,98.013313293457,...,..,..,102.344436645508,101.104248046875,103.525932312012,..,..,..,63.7673988342285,
4,,..,..,..,..,..,..,..,..,99.0087966918945,...,..,..,102.734443664551,101.503608703613,103.908081054688,..,..,..,63.0657691955566,


## Quality of Life

In [None]:
# add CSV stuff here
hnp_data = pd.read_csv("raw_data/education.csv")#change to proper csv


In [None]:
#needs to contain
#Quality of Life (QualityofLifeKey, Quality of Services e.g., {Access to Drinking
#Water, Access to Sanitation, Access to Basic Handwashing Facilities},
#Unemployment rate attributes e.g., {Female, Male, Total}, Maternal Leave benefits,
#…)