In [1]:
import pandas as pd

# Read every cleaned input table in one pass. The targets on the left line up
# positionally with the paths below (HDI, schooling, GNI, population, code map).
idh, schooling, gni, population, alpha3 = map(
    pd.read_csv,
    (
        "datasets/human-development-index-cleaned.csv",
        "datasets/expected-years-of-schooling-cleaned.csv",
        "datasets/gross-national-income-per-capita-cleaned.csv",
        "datasets/population_total_long-cleaned.csv",
        "datasets/iso_noc-merged.csv",
    ),
)

In [2]:
# Display the population frame loaded above to inspect its columns and row range.
population

Unnamed: 0,Country Name,Year,Count,Code
0,Aruba,1960,54211,ABW
1,Afghanistan,1960,8996973,AFG
2,Angola,1960,5454933,AGO
3,Albania,1960,1608800,ALB
4,Andorra,1960,13411,AND
...,...,...,...,...
12590,Kosovo,2017,1830700,XKX
12591,"Yemen, Rep.",2017,27834821,YEM
12592,South Africa,2017,57000451,ZAF
12593,Zambia,2017,16853688,ZMB


In [3]:
# Left-join the three indicator tables onto the population table: every
# population row is kept, and an indicator with no matching (Year, Code) key
# is left as NaN in that row.
MERGE_KEYS = ["Year", "Code"]

data_merged = (
    population
    .merge(
        idh[MERGE_KEYS + ["Human Development Index (UNDP)"]],
        on=MERGE_KEYS,
        how="left",
    )
    .merge(
        schooling[MERGE_KEYS + ["Expected Years of Schooling (years)"]],
        on=MERGE_KEYS,
        how="left",
    )
    .merge(
        gni[MERGE_KEYS + ["GNI per capita, PPP (constant 2017 international $)"]],
        on=MERGE_KEYS,
        how="left",
    )
)

# A left join silently duplicates left-hand rows whenever the right-hand table
# repeats a key. Fail loudly here rather than exporting an inflated table.
assert len(data_merged) == len(population), (
    "merge changed the row count: an indicator table repeats a (Year, Code) key"
)

In [4]:
# Collect every merged row whose "Code" value is missing; an empty result
# means each row carries a usable merge key.
nan_codes = data_merged.loc[lambda frame: frame["Code"].isna()]
nan_codes

Unnamed: 0,Country Name,Year,Count,Code,Human Development Index (UNDP),Expected Years of Schooling (years),"GNI per capita, PPP (constant 2017 international $)"


In [5]:
# Display the merged frame: the population columns plus the three indicator columns.
data_merged

Unnamed: 0,Country Name,Year,Count,Code,Human Development Index (UNDP),Expected Years of Schooling (years),"GNI per capita, PPP (constant 2017 international $)"
0,Aruba,1960,54211,ABW,,,
1,Afghanistan,1960,8996973,AFG,,,
2,Angola,1960,5454933,AGO,,,
3,Albania,1960,1608800,ALB,,,
4,Andorra,1960,13411,AND,,,
...,...,...,...,...,...,...,...
12590,Kosovo,2017,1830700,XKX,,,
12591,"Yemen, Rep.",2017,27834821,YEM,0.452,9.0,
12592,South Africa,2017,57000451,ZAF,0.699,13.3,12320.426838
12593,Zambia,2017,16853688,ZMB,0.588,12.5,3330.552717


In [6]:
# Subset of the merged frame with Year strictly earlier than 1990.
before90 = data_merged.loc[lambda frame: frame["Year"] < 1990]
before90


Unnamed: 0,Country Name,Year,Count,Code,Human Development Index (UNDP),Expected Years of Schooling (years),"GNI per capita, PPP (constant 2017 international $)"
0,Aruba,1960,54211,ABW,,,
1,Afghanistan,1960,8996973,AFG,,,
2,Angola,1960,5454933,AGO,,,
3,Albania,1960,1608800,ALB,,,
4,Andorra,1960,13411,AND,,,
...,...,...,...,...,...,...,...
6475,Kosovo,1989,1827000,XKX,,,
6476,"Yemen, Rep.",1989,11189177,YEM,,,
6477,South Africa,1989,35930050,ZAF,,,
6478,Zambia,1989,7820205,ZMB,,,


In [7]:
# Print per-column non-null counts and dtypes for the pre-1990 subset.
before90.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6480 entries, 0 to 6479
Data columns (total 7 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Country Name                                         6480 non-null   object 
 1   Year                                                 6480 non-null   int64  
 2   Count                                                6480 non-null   int64  
 3   Code                                                 6480 non-null   object 
 4   Human Development Index (UNDP)                       216 non-null    float64
 5   Expected Years of Schooling (years)                  0 non-null      float64
 6   GNI per capita, PPP (constant 2017 international $)  0 non-null      float64
dtypes: float64(3), int64(2), object(2)
memory usage: 405.0+ KB


In [8]:
# Subset of the merged frame from 1990 onwards (Year >= 1990, so 1990 itself
# is included).
after90 = data_merged.loc[lambda frame: frame["Year"] >= 1990]
after90

Unnamed: 0,Country Name,Year,Count,Code,Human Development Index (UNDP),Expected Years of Schooling (years),"GNI per capita, PPP (constant 2017 international $)"
6480,Aruba,1990,62149,ABW,,,
6481,Afghanistan,1990,12412308,AFG,,2.6,
6482,Angola,1990,11848386,AGO,,3.8,
6483,Albania,1990,3286542,ALB,0.645,11.6,
6484,Andorra,1990,54509,AND,,10.8,
...,...,...,...,...,...,...,...
12590,Kosovo,2017,1830700,XKX,,,
12591,"Yemen, Rep.",2017,27834821,YEM,0.452,9.0,
12592,South Africa,2017,57000451,ZAF,0.699,13.3,12320.426838
12593,Zambia,2017,16853688,ZMB,0.588,12.5,3330.552717


In [9]:
# Print per-column non-null counts and dtypes for the 1990-onwards subset.
after90.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6115 entries, 6480 to 12594
Data columns (total 7 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Country Name                                         6115 non-null   object 
 1   Year                                                 6115 non-null   int64  
 2   Count                                                6115 non-null   int64  
 3   Code                                                 6115 non-null   object 
 4   Human Development Index (UNDP)                       4776 non-null   float64
 5   Expected Years of Schooling (years)                  5115 non-null   float64
 6   GNI per capita, PPP (constant 2017 international $)  3503 non-null   float64
dtypes: float64(3), int64(2), object(2)
memory usage: 382.2+ KB


In [12]:
def _percent_of(part: int, whole: int) -> float:
    """Return ``part`` as a percentage of ``whole``.

    Yields NaN instead of raising ``ZeroDivisionError`` when ``whole`` is 0
    (i.e. when the merged frame is empty).
    """
    return (part / whole) * 100 if whole else float("nan")


# Row counts of the full merged frame and of the two period subsets.
# `before90` (Year < 1990) and `after90` (Year >= 1990) are reused from the
# cells that define them, so the year threshold lives in exactly one place.
total_rows = len(data_merged)
before90_rows = len(before90)
after90_rows = len(after90)

# Share of the merged frame's rows held by each frame (100% for the frame itself).
data_merged_percentage = _percent_of(total_rows, total_rows)
before90_percentage = _percent_of(before90_rows, total_rows)
after90_percentage = _percent_of(after90_rows, total_rows)

# Print the results
# NOTE: the "After 1990" label covers Year >= 1990, i.e. it includes 1990.
print(f"data_merged: {total_rows} rows, {data_merged_percentage:.2f}% of the total")
print(f"Before 1990: {before90_rows} rows, {before90_percentage:.2f}% of the total")
print(f"After 1990: {after90_rows} rows, {after90_percentage:.2f}% of the total")


data_merged: 12595 rows, 100.00% of the total
Before 1990: 6480 rows, 51.45% of the total
After 1990: 6115 rows, 48.55% of the total


In [11]:
def _nan_stats(frame: pd.DataFrame) -> tuple:
    """Return ``(total_cells, nan_cells, nan_percentage)`` for ``frame``.

    ``total_cells`` is ``frame.size`` (rows x columns), ``nan_cells`` is the
    number of cells flagged by ``frame.isna()``, and ``nan_percentage`` is
    their ratio times 100. For a frame with no cells the percentage is NaN
    rather than the result of a division by zero.
    """
    total_cells = frame.size
    nan_cells = frame.isna().sum().sum()
    if total_cells:
        nan_percentage = (nan_cells / total_cells) * 100
    else:
        nan_percentage = float("nan")
    return total_cells, nan_cells, nan_percentage


# Missing-value statistics for the full merged frame and both period subsets.
total_data_merged, data_merged_nan, data_merged_nan_percentage = _nan_stats(data_merged)
total_before90, before90_nan, before90_nan_percentage = _nan_stats(before90)
total_after90, after90_nan, after90_nan_percentage = _nan_stats(after90)

# Print the results
print(
    f"Data_merged: {total_data_merged} total values, {data_merged_nan} of them are NaN values ({data_merged_nan_percentage:.2f}% of the total)"
)
print(
    f"Before 1990: {total_before90} total values, {before90_nan} of them are NaN values ({before90_nan_percentage:.2f}% of the total)"
)
print(
    f"After 1990: {total_after90} total values, {after90_nan} of them are NaN values ({after90_nan_percentage:.2f}% of the total)"
)


Data_merged: 88165 total values, 24175 of them are NaN values (27.42% of the total)
Before 1990: 45360 total values, 19224 of them are NaN values (42.38% of the total)
After 1990: 42805 total values, 4951 of them are NaN values (11.57% of the total)


In [7]:
# Persist the merged frame as CSV; index=False keeps the row index out of the file.
data_merged.to_csv(
    path_or_buf="datasets/country-data-merged.csv",
    index=False,
)