In [53]:
#Importing libraries
import pandas as pd
import numpy as np

In [54]:
#Loading fertility dataset
df_fertility = pd.read_csv('./fertility_rate.csv')
df_fertility.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,_World,4.98,5.0,5.03,5.05,5.06,5.04,4.99,4.97,4.92,...,2.5,2.49,2.47,2.46,2.46,2.44,2.43,2.41,2.4,2.39
1,Afghanistan,7.45,7.45,7.45,7.45,7.45,7.45,7.45,7.45,7.45,...,5.77,5.56,5.36,5.16,4.98,4.8,4.63,4.47,4.32,4.18
2,Albania,6.49,6.4,6.28,6.13,5.96,5.77,5.58,5.39,5.22,...,1.67,1.68,1.69,1.69,1.68,1.66,1.64,1.62,1.6,1.58
3,Algeria,7.52,7.57,7.61,7.65,7.67,7.68,7.68,7.67,7.67,...,2.91,2.95,2.99,3.02,3.04,3.05,3.05,3.02,2.99,2.94
4,Angola,6.71,6.79,6.87,6.95,7.04,7.12,7.19,7.27,7.33,...,6.12,6.04,5.95,5.86,5.77,5.69,5.6,5.52,5.44,5.37


In [55]:
#Loading schooling dataset
df_schooling = pd.read_csv('mean-years-of-schooling-long-run.csv', sep=';')
df_schooling.head()

Unnamed: 0,Entity,Code,Year,avg_years_of_schooling
0,Afghanistan,AFG,1870,0.01
1,Afghanistan,AFG,1875,0.01
2,Afghanistan,AFG,1880,0.01
3,Afghanistan,AFG,1885,0.01
4,Afghanistan,AFG,1890,0.01


In [56]:
#Renaming "Entity" column to "Country" in the schooling dataset
df_schooling = df_schooling.rename(columns={'Entity': 'Country'})
df_schooling

Unnamed: 0,Country,Code,Year,avg_years_of_schooling
0,Afghanistan,AFG,1870,0.01
1,Afghanistan,AFG,1875,0.01
2,Afghanistan,AFG,1880,0.01
3,Afghanistan,AFG,1885,0.01
4,Afghanistan,AFG,1890,0.01
...,...,...,...,...
7758,Zimbabwe,ZWE,2013,8.00
7759,Zimbabwe,ZWE,2014,8.20
7760,Zimbabwe,ZWE,2015,8.20
7761,Zimbabwe,ZWE,2016,8.20


In [60]:
#Transforming the years columns to a single "Year" column containing all years in the fertility dataset
df_fertility = pd.melt(df_fertility,
                       id_vars=["Country",],
                       var_name="Year", value_name="fertility")
df_fertility.head()

Unnamed: 0,Country,Year,fertility
0,Afghanistan,1960,7.45
1,Albania,1960,6.49
2,Algeria,1960,7.52
3,Angola,1960,6.71
4,Antigua and Barbuda,1960,4.43


In [61]:
#Sorting values by year in the fertility dataset
df_fertility = df_fertility.sort_values(["Country"])
df_fertility.sort_values("Year")

Unnamed: 0,Country,Year,fertility
0,Afghanistan,1960,7.45
14,Bangladesh,1960,6.73
20,Bhutan,1960,6.64
121,New Zealand,1960,4.03
166,Timor-Leste,1960,6.32
...,...,...,...
11277,Euro area,2020,1.49
11321,Madagascar,2020,3.98
11304,Israel,2020,2.90
11305,Italy,2020,1.24


In [69]:
#Transforming the "Year" column values into integers and removing everything below the year 1960
df_fertility = df_fertility.astype({'Year':'int'})
df_fertility.drop(df_fertility[df_fertility['Year']<=1959].index, inplace = True)
df_fertility

Unnamed: 0,Country,Year,fertility
0,Afghanistan,1960,7.45
10285,Afghanistan,2015,4.98
1122,Afghanistan,1966,7.45
2244,Afghanistan,1972,7.45
9163,Afghanistan,2009,6.18
...,...,...,...
747,_World,1963,5.05
7292,_World,1998,2.75
4113,_World,1981,3.66
5609,_World,1989,3.32


In [70]:
#Removing every "_World" value in the "Country" column since we won't be using them
df_fertility.drop(df_fertility[df_fertility['Country'] == '_World'].index, inplace = True)
df_fertility

Unnamed: 0,Country,Year,fertility
0,Afghanistan,1960,7.45
10285,Afghanistan,2015,4.98
1122,Afghanistan,1966,7.45
2244,Afghanistan,1972,7.45
9163,Afghanistan,2009,6.18
...,...,...,...
3551,Zimbabwe,1978,7.20
6356,Zimbabwe,1993,4.35
6917,Zimbabwe,1996,3.99
1868,Zimbabwe,1969,7.42


In [73]:
#Merging both datasets into one, each row and column with their respective matching values
df = df_fertility.merge(df_schooling, how='inner', on=('Country', 'Year'))
df

Unnamed: 0,Country,Year,fertility,Code,avg_years_of_schooling
0,Afghanistan,1960,7.45,AFG,0.22
1,Afghanistan,2015,4.98,AFG,3.60
2,Afghanistan,2009,6.18,AFG,3.10
3,Afghanistan,1992,7.50,AFG,1.60
4,Afghanistan,2017,4.63,AFG,3.80
...,...,...,...,...,...
5121,Zimbabwe,1985,5.84,ZWE,3.99
5122,Zimbabwe,2013,4.03,ZWE,8.00
5123,Zimbabwe,1993,4.35,ZWE,5.10
5124,Zimbabwe,1996,3.99,ZWE,5.70


In [74]:
#Checking for dataset shape
df.shape

(5126, 5)

In [75]:
#Sorting dataset values by year
df.sort_values('Year')

Unnamed: 0,Country,Year,fertility,Code,avg_years_of_schooling
0,Afghanistan,1960,7.45,AFG,0.22
1553,Eswatini,1960,6.75,SWZ,1.41
1612,Fiji,1960,6.46,FJI,3.85
1641,Finland,1960,2.72,FIN,4.04
1680,France,1960,2.85,FRA,4.10
...,...,...,...,...,...
2087,Iceland,2017,1.71,ISL,12.40
2046,Hungary,2017,1.54,HUN,11.90
2013,Honduras,2017,2.50,HND,6.50
2470,Kenya,2017,3.57,KEN,6.50


In [80]:
#Checking for NaN values
df.isna().sum()

Country                   0
Year                      0
fertility                 0
Code                      0
avg_years_of_schooling    0
dtype: int64

In [82]:
#Checking for duplicates
df.duplicated().sum()

0

In [86]:
#Grouping by country
df.groupby('Country').count()

Unnamed: 0_level_0,Year,fertility,Code,avg_years_of_schooling
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,34,34,34,34
Albania,34,34,34,34
Algeria,34,34,34,34
Angola,19,19,19,19
Antigua and Barbuda,18,18,18,18
...,...,...,...,...
Venezuela,34,34,34,34
Vietnam,34,34,34,34
Yemen,34,34,34,34
Zambia,34,34,34,34
