Import Packages

In [223]:
import pandas as pd
import numpy as np
import warnings

from datetime import datetime, timedelta
from IPython.display import display

# Silence every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides pandas/NumPy deprecation
# warnings raised by the cells below; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

Read Data with Pandas

In [224]:
# Read the raw export and flip it so that every original CSV row becomes a column.
country_data_raw = pd.read_csv("Country_Data_Raw.csv")
country_transposed = country_data_raw.transpose()

# Layout of the transposed frame as used by the indexing below:
# 13 indicator columns are taken per country, spaced 9 columns apart, for 9 countries.
# Presumably the CSV lists each indicator for all 9 countries before moving on to
# the next indicator, and the first 4 transposed rows are descriptive fields rather
# than yearly observations -- TODO confirm against Country_Data_Raw.csv.
N_COUNTRIES = 9
N_INDICATORS = 13
N_LEADING_ROWS = 4

# Positional columns (after renumbering to 0..12) that receive special treatment.
# Per the labels assigned in the "Add Column Labels" cell: 7 and 8 are age at first
# marriage (male/female), 11 is completeness of birth registration, 0 and 2 are the
# crude birth and death rates.
MEAN_IMPUTED_COLS = (7, 8, 11)
FORWARD_FILLED_COLS = (0, 2)


def _indicator_positions(country_offset):
    """Return the 13 column positions, 9 apart, that start at `country_offset`."""
    return [country_offset + N_COUNTRIES * k for k in range(N_INDICATORS)]


# The first transposed row supplies the indicator names; it is kept as row 0 of
# the result, exactly as before.
header_row = country_transposed.iloc[:1, _indicator_positions(0)].copy()
header_row.columns = range(header_row.columns.size)

# Collect one block per country and concatenate once at the end instead of
# growing `final_table` inside the loop.
blocks = [header_row]
for country_offset in range(N_COUNTRIES):
    # .copy() makes the block an independent frame, so the column assignments
    # below are guaranteed to stick (no chained in-place edits on a slice).
    country_block = country_transposed.iloc[
        N_LEADING_ROWS:, _indicator_positions(country_offset)
    ].copy()
    country_block.columns = range(country_block.columns.size)

    # These indicators vary up and down within a limited band over time without a
    # clear trend, the data has no outliers and is tightly packed, so gaps are
    # filled with the per-country mean. Non-numeric cells (e.g. '..') become NaN first.
    for col in MEAN_IMPUTED_COLS:
        numeric = pd.to_numeric(country_block[col], errors="coerce")
        country_block[col] = numeric.fillna(numeric.mean())

    # Birth and death rates change little from one year to the next, so a gap is
    # filled with the previous year's value. This is done per country so that a
    # gap in a country's first year is never filled from the previous country's
    # last year (or from the header row); such a leading gap stays NaN.
    for col in FORWARD_FILLED_COLS:
        country_block[col] = country_block[col].replace("..", np.nan).ffill()

    blocks.append(country_block)

final_table = pd.concat(blocks, axis=0, ignore_index=True)

# Mark every remaining '..' placeholder as NaN.
final_table = final_table.replace({"..": np.nan})

Add Column Labels to Dataframe

In [225]:

# Descriptive names for the 13 positional indicator columns (0..12), in order.
indicator_labels = [
    "Birth_Rate_Crude_Per_1000",
    "Population_Total_Count",
    "Death_Rate_Crude_Per_1000",
    "GNI_Per_Capita_USD",
    "Labour_Force_Female_Percent",
    "Labour_Force_Total_Count",
    "Age_Dependency_Ratio_Percent_Working_Age",
    "Age_First_Marriage_Male",
    "Age_First_Marriage_Female",
    "Age_Dependency_Ratio_Old",
    "Age_Dependency_Ratio_Young",
    "Completeness_Birth_Registration_Percent",
    "Completeness_Death_Registration_Percent",
]
final_table.columns = indicator_labels

# Preview: row 0 still holds the original indicator names, row 1 is the first observation.
final_table.head(2)

Unnamed: 0,Birth_Rate_Crude_Per_1000,Population_Total_Count,Death_Rate_Crude_Per_1000,GNI_Per_Capita_USD,Labour_Force_Female_Percent,Labour_Force_Total_Count,Age_Dependency_Ratio_Percent_Working_Age,Age_First_Marriage_Male,Age_First_Marriage_Female,Age_Dependency_Ratio_Old,Age_Dependency_Ratio_Young,Completeness_Birth_Registration_Percent,Completeness_Death_Registration_Percent
0,"Birth rate, crude (per 1,000 people)","Population, total","Death rate, crude (per 1,000 people)","GNI per capita, Atlas method (current US$)","Labor force, female (% of total labor force)","Labor force, total",Age dependency ratio (% of working-age populat...,"Age at first marriage, male","Age at first marriage, female","Age dependency ratio, old","Age dependency ratio, young",Completeness of birth registration (%),Completeness of death registration with cause-...
1,10.6,32243753,7.1,34810,46.63737609,17821886,44.495093,28.7,26.98,18.95421925,25.54087375,100.0,


Add Country Name, Year, Continent, Region, Capital, Currency columns

In [226]:
# Build the per-row descriptive columns (country, year, key, continent, region,
# capital, currency): one row per country per year, preceded by a placeholder row.

# Years covered for every country (inclusive): 16 years x 9 countries = 144 rows.
FIRST_YEAR = 2005
LAST_YEAR = 2020

# NOTE(review): the rows are labelled purely by position, so this order must match
# the order of the country blocks in the indicator table. The last rows printed by
# the "Remove Columns and Rows" cell are labelled Indonesia but show a total
# population of about 127 million -- verify the order against Country_Data_Raw.csv.
countries = ["Canada", "USA", "Mexico", "Burundi", "Chad", "Niger", "Bolivia", "Philippines", "Indonesia"]

# regions from https://www.thoughtco.com/official-listing-of-countries-world-region-1435153
regions = ["North America", "North America", "North America", "Sub-Saharan Africa", "Sub-Saharan Africa", "Sub-Saharan Africa", "South America", "Asia", "Asia"]
continents = ["North America", "North America", "North America", "Africa", "Africa", "Africa", "South America", "Asia", "Asia"]

# currencies from https://www.countries-ofthe-world.com/world-currencies.html
currencies = ["CAD", "USD", "MXN", "BIF", "XAF", "XOF", "BOB", "PHP", "IDR"]

# capital cities from https://www.countries-ofthe-world.com/capitals-of-the-world.html
capitals = ["Ottawa", "Washington, D.C.", "Mexico City", "Gitega", "N'Djamena", "Niamey", "Sucre, La Paz", "Manila", "Jakarta"]

# Row 0 is a placeholder so that this frame lines up row-for-row with the
# indicator table, whose row 0 holds the indicator names.
records = [{"Country": "Blank", "Country_Key": -1}]

# Country_Key is a running row number (0..143) across all country/year pairs.
country_years = [
    (country_idx, year)
    for country_idx in range(len(countries))
    for year in range(FIRST_YEAR, LAST_YEAR + 1)
]
for country_key, (country_idx, year) in enumerate(country_years):
    records.append(
        {
            "Country": countries[country_idx],
            "Year": year,
            "Country_Key": country_key,
            "Continent": continents[country_idx],
            "Region": regions[country_idx],
            "Capital": capitals[country_idx],
            "Currency": currencies[country_idx],
        }
    )

# Build the frame once from the collected records (DataFrame.append no longer
# exists in pandas 2.x, and appending row by row is quadratic).
country_gen_df = pd.DataFrame(
    records,
    columns=["Country", "Year", "Country_Key", "Continent", "Region", "Capital", "Currency"],
)

country_gen_df.head(5)


Unnamed: 0,Country,Year,Country_Key,Continent,Region,Capital,Currency
0,Blank,,-1.0,,,,
1,Canada,2005.0,0.0,North America,North America,Ottawa,CAD
2,Canada,2006.0,1.0,North America,North America,Ottawa,CAD
3,Canada,2007.0,2.0,North America,North America,Ottawa,CAD
4,Canada,2008.0,3.0,North America,North America,Ottawa,CAD


Join the two dataframes

In [227]:
# Put the descriptive columns to the left of the indicator columns. The two frames
# are aligned on their row index, so row 0 pairs the "Blank" placeholder with the
# indicator-name row.
# Note: this cell is not idempotent -- running it twice adds the descriptive
# columns a second time.
frames_side_by_side = [country_gen_df, final_table]
final_table = pd.concat(frames_side_by_side, axis="columns")
display(final_table.head(2))

Unnamed: 0,Country,Year,Country_Key,Continent,Region,Capital,Currency,Birth_Rate_Crude_Per_1000,Population_Total_Count,Death_Rate_Crude_Per_1000,GNI_Per_Capita_USD,Labour_Force_Female_Percent,Labour_Force_Total_Count,Age_Dependency_Ratio_Percent_Working_Age,Age_First_Marriage_Male,Age_First_Marriage_Female,Age_Dependency_Ratio_Old,Age_Dependency_Ratio_Young,Completeness_Birth_Registration_Percent,Completeness_Death_Registration_Percent
0,Blank,,-1.0,,,,,"Birth rate, crude (per 1,000 people)","Population, total","Death rate, crude (per 1,000 people)","GNI per capita, Atlas method (current US$)","Labor force, female (% of total labor force)","Labor force, total",Age dependency ratio (% of working-age populat...,"Age at first marriage, male","Age at first marriage, female","Age dependency ratio, old","Age dependency ratio, young",Completeness of birth registration (%),Completeness of death registration with cause-...
1,Canada,2005.0,0.0,North America,North America,Ottawa,CAD,10.6,32243753,7.1,34810,46.63737609,17821886,44.495093,28.7,26.98,18.95421925,25.54087375,100.0,


Remove Columns and Rows That Will Not Be Used

In [228]:
final_table = (
    final_table
    # Row 0 was only a placeholder / indicator-name row and is no longer needed
    .drop(index=0)
    # Death registration completeness had too little data to be usable
    .drop(columns=["Completeness_Death_Registration_Percent"])
    # With the placeholder row gone, the key can be stored as a plain integer
    .astype({"Country_Key": int})
)

# Display the final table
final_table.head(200)


Unnamed: 0,Country,Year,Country_Key,Continent,Region,Capital,Currency,Birth_Rate_Crude_Per_1000,Population_Total_Count,Death_Rate_Crude_Per_1000,GNI_Per_Capita_USD,Labour_Force_Female_Percent,Labour_Force_Total_Count,Age_Dependency_Ratio_Percent_Working_Age,Age_First_Marriage_Male,Age_First_Marriage_Female,Age_Dependency_Ratio_Old,Age_Dependency_Ratio_Young,Completeness_Birth_Registration_Percent
1,Canada,2005,0,North America,North America,Ottawa,CAD,10.6,32243753,7.1,34810,46.63737609,17821886,44.495093,28.7,26.98,18.95421925,25.54087375,100.0
2,Canada,2006,1,North America,North America,Ottawa,CAD,10.9,32571174,7,38510,46.8751721,18030705,44.23699432,28.6,26.6,19.13834588,25.09864844,100.0
3,Canada,2007,2,North America,North America,Ottawa,CAD,11.2,32889025,7.1,43090,47.0367127,18379605,44.03477573,28.7,26.98,19.35344295,24.68133278,100.0
4,Canada,2008,3,North America,North America,Ottawa,CAD,11.3,33247118,7.2,45650,46.97475419,18667574,43.93390259,28.7,26.98,19.62097814,24.31292445,100.0
5,Canada,2009,4,North America,North America,Ottawa,CAD,11.3,33628895,7.1,43230,47.20037683,18799155,43.97675318,28.7,26.98,19.96659698,24.0101562,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,Indonesia,2016,139,Asia,Asia,Jakarta,IDR,18.245,123333379,5.851,9390,37.60167967,53507498,51.9253744,25.9,23.2,10.44231738,41.48305702,94.2
141,Indonesia,2017,140,Asia,Asia,Jakarta,IDR,17.918,124777326,5.933,8920,37.3923288,54096291,51.46924186,25.885714,23.285714,10.6646015,40.80464036,94.2
142,Indonesia,2018,141,Asia,Asia,Jakarta,IDR,17.602,126190782,6.01,9180,37.71694562,55190039,51.01299485,25.9,23.2,10.90870355,40.1042913,94.2
143,Indonesia,2019,142,Asia,Asia,Jakarta,IDR,17.297,127575529,6.082,9470,38.43918557,56596004,50.60644416,25.9,23.2,11.16996881,39.43647534,94.2


Generate CSV File

In [229]:
# Write the processed table (with a header row, without the row index) to disk.
processed_table_path = "Country_Processed_Table.csv"
final_table.to_csv(processed_table_path, header=True, index=False)

Generate CSV File for the Database

In [230]:
# The database file omits the Year column and leads with Country_Key.
final_table = final_table.drop(columns=["Year"])

# Reorder so that Country_Key comes first; all other columns keep their relative order.
key_first_order = ["Country_Key"] + [
    column for column in final_table.columns if column != "Country_Key"
]
final_table = final_table[key_first_order]

display(final_table.head())

# Write the database version (with a header row, without the row index) to disk.
db_table_path = "Country_Processed_DB.csv"
final_table.to_csv(db_table_path, header=True, index=False)

Unnamed: 0,Country_Key,Country,Continent,Region,Capital,Currency,Birth_Rate_Crude_Per_1000,Population_Total_Count,Death_Rate_Crude_Per_1000,GNI_Per_Capita_USD,Labour_Force_Female_Percent,Labour_Force_Total_Count,Age_Dependency_Ratio_Percent_Working_Age,Age_First_Marriage_Male,Age_First_Marriage_Female,Age_Dependency_Ratio_Old,Age_Dependency_Ratio_Young,Completeness_Birth_Registration_Percent
1,0,Canada,North America,North America,Ottawa,CAD,10.6,32243753,7.1,34810,46.63737609,17821886,44.495093,28.7,26.98,18.95421925,25.54087375,100.0
2,1,Canada,North America,North America,Ottawa,CAD,10.9,32571174,7.0,38510,46.8751721,18030705,44.23699432,28.6,26.6,19.13834588,25.09864844,100.0
3,2,Canada,North America,North America,Ottawa,CAD,11.2,32889025,7.1,43090,47.0367127,18379605,44.03477573,28.7,26.98,19.35344295,24.68133278,100.0
4,3,Canada,North America,North America,Ottawa,CAD,11.3,33247118,7.2,45650,46.97475419,18667574,43.93390259,28.7,26.98,19.62097814,24.31292445,100.0
5,4,Canada,North America,North America,Ottawa,CAD,11.3,33628895,7.1,43230,47.20037683,18799155,43.97675318,28.7,26.98,19.96659698,24.0101562,100.0
