In [1]:
import pandas as pd
import numpy as np

In [None]:
# See database_helpers.py
run database_helpers.py

In [6]:
# CAREFULLY read this file, it may be a "csv" but it is formatted
# like a report...
hdi_data = pd.read_csv("../Data/Human Development Index (HDI).csv", skiprows=5, nrows=189,
                      encoding_errors="ignore", usecols=["Country"]+[str(year) for year in range(1990,2020)])
hdi_data

Unnamed: 0,Country,1990,1991,1992,1993,1994,1995,1996,1997,1998,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,0.302,0.307,0.316,0.312,0.307,0.331,0.335,0.339,0.344,...,0.472,0.477,0.489,0.496,0.500,0.500,0.502,0.506,0.509,0.511
1,Albania,0.650,0.631,0.615,0.618,0.624,0.637,0.646,0.645,0.655,...,0.745,0.764,0.775,0.782,0.787,0.788,0.788,0.790,0.792,0.795
2,Algeria,0.572,0.576,0.582,0.586,0.590,0.595,0.602,0.611,0.621,...,0.721,0.728,0.728,0.729,0.736,0.740,0.743,0.745,0.746,0.748
3,Andorra,..,..,..,..,..,..,..,..,..,...,0.837,0.836,0.858,0.856,0.863,0.862,0.866,0.863,0.867,0.868
4,Angola,..,..,..,..,..,..,..,..,..,...,0.517,0.533,0.544,0.555,0.565,0.572,0.578,0.582,0.582,0.581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,Venezuela (Bolivarian Republic of),0.644,0.654,0.660,0.662,0.662,0.666,0.668,0.670,0.672,...,0.757,0.769,0.772,0.777,0.775,0.769,0.759,0.743,0.733,0.711
185,Viet Nam,0.483,0.493,0.504,0.514,0.525,0.537,0.548,0.547,0.567,...,0.661,0.671,0.676,0.681,0.683,0.688,0.693,0.696,0.700,0.704
186,Yemen,0.401,0.401,0.404,0.406,0.408,0.414,0.421,0.426,0.431,...,0.506,0.506,0.504,0.509,0.502,0.483,0.474,0.467,0.468,0.470
187,Zambia,0.421,0.417,0.416,0.419,0.414,0.415,0.416,0.416,0.416,...,0.527,0.534,0.549,0.557,0.561,0.569,0.571,0.578,0.582,0.584


In [7]:
# Now that we've read the data, process it for upload
# first, convert this "wide" format to a "long" format
hdi_data_long = pd.melt(hdi_data, id_vars=["Country"], var_name="HDIYear", value_name="HDI")
# Next, replace the NA values coded as ".." with better substitutes for python
hdi_data_long.replace({"HDI":{"..":None}}, inplace=True)
# Cast the numbers as numeric values
hdi_data_long["HDI"] = hdi_data_long["HDI"].astype("float", errors="ignore")
# Strip whitespace from country names
hdi_data_long["Country"] = hdi_data_long["Country"].str.strip()

# We also want to create the "current" flag. Since I know the data ends in 2019, I will
# make that the "current" year
hdi_data_long["HDICurrent"] = False
hdi_data_long.loc[hdi_data_long["HDIYear"] == "2019", "HDICurrent"] = True

# Create the IDs
hdi_data_long.reset_index(inplace=True)
hdi_data_long.rename(columns={"index":"Id"})

# In preparation for database inserts, replace nan values with None
hdi_data_long.replace(np.NaN, None, inplace=True)
hdi_data_long

Unnamed: 0,index,Country,HDIYear,HDI,HDICurrent
0,0,Afghanistan,1990,0.302,False
1,1,Albania,1990,0.650,False
2,2,Algeria,1990,0.572,False
3,3,Andorra,1990,0.572,False
4,4,Angola,1990,0.572,False
...,...,...,...,...,...
5665,5665,Venezuela (Bolivarian Republic of),2019,0.711,True
5666,5666,Viet Nam,2019,0.704,True
5667,5667,Yemen,2019,0.470,True
5668,5668,Zambia,2019,0.584,True


In [None]:
# Now we can insert the values into our table
hdiInsertQuery="""
INSERT INTO "Dim_HDI"
("Id", "HDICountryName", "HDIYear", "HDI", "HDICurrent")
VALUES
%s
"""

bulk_insert(hdiInsertQuery, hdi_data_long)