In [10]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from email.iterators import body_line_iterator
from tkinter import END
import pandas as pd
import numpy as np
from IPython.display import display

# Extract the data
#
# Layout of Health_Data_Raw.csv assumed by the positional indexing below
# (NOTE(review): confirm against the actual export):
#   * 4 leading descriptive columns, then one column per year starting at 1960;
#   * one row per (country, attribute), countries in `country_list` order and,
#     within each country, attributes in `attribute_names` order.

country_list = ["Bolivia", "Burundi", "Canada", "Chad", "Indonesia", "Mexico", "Niger", "Philippines", "USA"]
begin_date = 2005
end_date = 2020
nb_dates = end_date - begin_date + 1

attribute_names = ["Adults_And_Children_Living_With_HIV",
                    "Adults_Newly_Infected_With_HIV",
                    "%Births_By_Skilled_Health_Staff",
                    "Capital_Health_Expenditure_%GDP",
                    "DeathByNonCommunicableDiseasePercent",
                    "Domestic_General_Health_Expenditure_%GDP",
                    "Hospital_Beds_Per_1000ppl",
                    "Immunization_HepB3_%OneYearOld",
                    "ImmunizationMeasles%Children12To23Months",
                    "Number_Under-five_Deaths",
                    "Nurses_And_Midwives_Per_1000ppl",
                    "Physicians_Per_1000ppl",
                    "Prevalence_of_Anemia_%Non-pregnant",
                    "Prevalence_of_Anemia_%Pregnant",
                    "Prevalence_of_Overweight_%Adult",
                    "Suicide_Mortality_Rate_per_100000ppl",
                    "Survival_To_Age_65_Female",
                    "Survival_To_Age_65_Male",
                    "Total_Alcohol_Consumption_Per_Capita",
                    "Tuberculosis_Death_Rate_per_100000ppl"
                    ]

nb_attributes = len(attribute_names)


def build_health_dimension(raw, countries, first_year, last_year, attributes,
                           base_year=1960, nb_meta_columns=4):
    """Reshape the wide raw table into one row per (country, year).

    Parameters
    ----------
    raw : pandas.DataFrame
        Wide table with `nb_meta_columns` leading descriptive columns followed
        by one column per year starting at `base_year`, and
        ``len(attributes)`` consecutive rows per country.
    countries : sequence of str
        Country labels, in the order their row blocks appear in `raw`.
    first_year, last_year : int
        Inclusive range of years to extract.
    attributes : sequence of str
        Output column names, in the order the rows appear inside each block.
    base_year : int, default 1960
        Year held by the first year column of `raw`.
    nb_meta_columns : int, default 4
        Number of columns that precede the first year column.

    Returns
    -------
    pandas.DataFrame
        Columns ``Health_Key, Country_Ref, Date, *attributes``. Missing values
        are left as NaN. Row labels run from 1 while ``Health_Key`` runs
        from 0.

    Raises
    ------
    ValueError
        If the year range is invalid or `raw` is too small to contain the
        requested countries/years.
    """
    if first_year < base_year or last_year < first_year:
        raise ValueError(
            f"invalid year range {first_year}-{last_year} for base year {base_year}"
        )

    nb_attr = len(attributes)
    col_start = nb_meta_columns + first_year - base_year
    col_stop = nb_meta_columns + last_year - base_year + 1
    needed_rows = len(countries) * nb_attr
    if raw.shape[0] < needed_rows or raw.shape[1] < col_stop:
        raise ValueError(
            f"raw table has shape {raw.shape}; need at least "
            f"{needed_rows} rows and {col_stop} columns"
        )

    years = np.arange(first_year, last_year + 1)
    blocks = []
    for position, country in enumerate(countries):
        row_start = position * nb_attr
        # Transposing only the year columns keeps their numeric dtype:
        # years become rows, attributes become columns.
        block = raw.iloc[row_start:row_start + nb_attr, col_start:col_stop].T
        block.columns = list(attributes)
        block.insert(0, "Date", years)
        block.insert(0, "Country_Ref", country)
        blocks.append(block)

    # Single concat instead of growing a frame inside the loop.
    table = pd.concat(blocks, ignore_index=True)
    table.insert(0, "Health_Key", np.arange(len(table)))
    # Row labels start at 1; they are not written to the CSV (index=False).
    table.index = pd.RangeIndex(1, len(table) + 1)
    return table


def fill_missing_by_country(table, value_columns, group_column="Country_Ref"):
    """Return a copy of `table` with gaps in `value_columns` filled.

    Values are carried forward and then backward *within each country*, so a
    gap at the start or end of one country's series is never filled with a
    neighbouring country's figure. Only attributes with no observation at all
    for a country fall back to a fill across the whole table, which keeps the
    output free of NaN.
    NOTE(review): that fallback borrows another country's value; consider
    leaving those cells empty if the database accepts NULLs.
    """
    value_columns = list(value_columns)
    filled = table.copy()
    filled[value_columns] = (
        filled.groupby(group_column, sort=False)[value_columns].ffill()
    )
    filled[value_columns] = (
        filled.groupby(group_column, sort=False)[value_columns].bfill()
    )
    # Fallback for attributes a country never reports.
    filled[value_columns] = filled[value_columns].ffill().bfill()
    return filled


# The file I/O only runs when the cell/script is executed (true inside a
# Jupyter kernel), so the helpers above can be imported and tested on their own.
if __name__ == "__main__":
    health_data_raw = pd.read_csv("Health_Data_Raw.csv")

    health_dimension = build_health_dimension(
        health_data_raw, country_list, begin_date, end_date, attribute_names
    )

    # fill in the missing data
    health_dimension = fill_missing_by_country(health_dimension, attribute_names)

    # Save it into a new csv file
    health_dimension.to_csv('Health_dimension.csv', index = False, header=True)
    display(health_dimension.head())


Unnamed: 0,Health_Key,Country_Ref,Date,Adults_And_Children_Living_With_HIV,Adults_Newly_Infected_With_HIV,%Births_By_Skilled_Health_Staff,Capital_Health_Expenditure_%GDP,DeathByNonCommunicableDiseasePercent,Domestic_General_Health_Expenditure_%GDP,Hospital_Beds_Per_1000ppl,...,Nurses_And_Midwives_Per_1000ppl,Physicians_Per_1000ppl,Prevalence_of_Anemia_%Non-pregnant,Prevalence_of_Anemia_%Pregnant,Prevalence_of_Overweight_%Adult,Suicide_Mortality_Rate_per_100000ppl,Survival_To_Age_65_Female,Survival_To_Age_65_Male,Total_Alcohol_Consumption_Per_Capita,Tuberculosis_Death_Rate_per_100000ppl
1,0,Bolivia,2005,10000.0,1000.0,71.1,0.500505,64.616274,2.753696,1.1,...,0.6084,0.3316,32.3,36.3,49.2,6.3,69.465577,61.458391,4.37,20.0
2,1,Bolivia,2006,10000.0,1000.0,71.1,0.551467,64.616274,2.611063,1.1,...,0.6084,0.3316,32.0,36.3,49.9,5.9,70.139673,62.037042,4.37,19.0
3,2,Bolivia,2007,11000.0,1000.0,71.1,0.57667,64.616274,2.775543,1.1,...,0.6084,0.3316,31.4,36.2,50.5,6.1,70.813769,62.615693,4.37,18.0
4,3,Bolivia,2008,11000.0,1000.0,71.1,0.498547,64.616274,2.701888,1.1,...,0.6084,0.3316,30.8,36.3,51.2,6.0,71.406911,63.135941,4.37,17.0
5,4,Bolivia,2009,11000.0,1000.0,71.1,0.613951,64.616274,3.158307,1.1,...,0.6084,0.3316,30.2,36.2,51.8,6.0,72.000052,63.656188,4.37,16.0


Health Data for DB

In [11]:
# Database export: the descriptive Country_Ref and Date columns are removed,
# leaving Health_Key plus the health attributes.
dropped_for_db = ["Country_Ref", "Date"]
final_table = health_dimension.drop(dropped_for_db, axis=1)

# Preview the first rows, then write the table without its row labels.
display(final_table.head())
final_table.to_csv("Health_Dimension_DB.csv", header=True, index=False)

Unnamed: 0,Health_Key,Adults_And_Children_Living_With_HIV,Adults_Newly_Infected_With_HIV,%Births_By_Skilled_Health_Staff,Capital_Health_Expenditure_%GDP,DeathByNonCommunicableDiseasePercent,Domestic_General_Health_Expenditure_%GDP,Hospital_Beds_Per_1000ppl,Immunization_HepB3_%OneYearOld,ImmunizationMeasles%Children12To23Months,...,Nurses_And_Midwives_Per_1000ppl,Physicians_Per_1000ppl,Prevalence_of_Anemia_%Non-pregnant,Prevalence_of_Anemia_%Pregnant,Prevalence_of_Overweight_%Adult,Suicide_Mortality_Rate_per_100000ppl,Survival_To_Age_65_Female,Survival_To_Age_65_Male,Total_Alcohol_Consumption_Per_Capita,Tuberculosis_Death_Rate_per_100000ppl
1,0,10000.0,1000.0,71.1,0.500505,64.616274,2.753696,1.1,85.0,89.0,...,0.6084,0.3316,32.3,36.3,49.2,6.3,69.465577,61.458391,4.37,20.0
2,1,10000.0,1000.0,71.1,0.551467,64.616274,2.611063,1.1,83.0,88.0,...,0.6084,0.3316,32.0,36.3,49.9,5.9,70.139673,62.037042,4.37,19.0
3,2,11000.0,1000.0,71.1,0.57667,64.616274,2.775543,1.1,84.0,86.0,...,0.6084,0.3316,31.4,36.2,50.5,6.1,70.813769,62.615693,4.37,18.0
4,3,11000.0,1000.0,71.1,0.498547,64.616274,2.701888,1.1,88.0,92.0,...,0.6084,0.3316,30.8,36.3,51.2,6.0,71.406911,63.135941,4.37,17.0
5,4,11000.0,1000.0,71.1,0.613951,64.616274,3.158307,1.1,93.0,93.0,...,0.6084,0.3316,30.2,36.2,51.8,6.0,72.000052,63.656188,4.37,16.0
