# EDA

## Preparing the data

In [1]:
#Importing libraries 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the education dataset; the file is semicolon-separated.
df_education = pd.read_csv('./raw_data/education.csv', sep=';')

# Preview the first rows to check that the columns parsed correctly.
df_education.head()

Unnamed: 0,Entity,Code,Year,avg_years_of_schooling
0,Afghanistan,AFG,1870,0.01
1,Afghanistan,AFG,1875,0.01
2,Afghanistan,AFG,1880,0.01
3,Afghanistan,AFG,1885,0.01
4,Afghanistan,AFG,1890,0.01


In [3]:
# Dimensions of the raw frame as (rows, columns).
df_education.shape

(7763, 4)

In [4]:
# Count missing (NaN) values per column; duplicates are checked in the next cell.
df_education.isna().sum()

Entity                    0
Code                      0
Year                      0
avg_years_of_schooling    0
dtype: int64

In [5]:
# Number of rows that are exact duplicates of an earlier row.
df_education.duplicated().sum()

0

In [6]:
# Keep only observations from 1960 through 2010 (both bounds inclusive).
df_educ = df_education[df_education['Year'].between(1960, 2010)]

In [7]:
# Non-null counts per country code, with the sparsest countries listed first.
(
    df_educ
    .groupby('Code')
    .count()
    .sort_values(by='Year')
)

Unnamed: 0_level_0,Entity,Year,avg_years_of_schooling
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TKM,1,1,1
SOM,1,1,1
SSD,3,3,3
KNA,6,6,6
BTN,6,6,6
...,...,...,...
IDN,27,27,27
IND,27,27,27
IRL,27,27,27
IRQ,27,27,27


In the table above, 27 is the <b>maximum number of observations any single country has</b> between 1960 and 2010.

With that in mind, we count how many countries have 27 or more observations (i.e. a complete series):

In [8]:
# Per-country count of non-null values in every other column of df_educ.
df_countries = df_educ.groupby('Entity').count()

In [9]:
# Countries that have at least 27 observations between 1960 and 2010;
# the first element of the resulting shape is the number of such countries.
df_countries[df_countries['Code']>=27].shape

(143, 3)

## Schooling improvement

### First, we want to see by how many years each country's average years of schooling has improved

In [10]:
# Rebuild df_educ from the raw frame with only a lower bound on the year,
# so observations after 2010 are included again.
from_1960 = df_education['Year'] >= 1960
df_educ = df_education.loc[from_1960]

In [11]:
# Take one snapshot per endpoint year so the two can be compared country by country.
is_1960 = df_educ['Year'] == 1960
is_2017 = df_educ['Year'] == 2017
df_1960 = df_educ.loc[is_1960]
df_2017 = df_educ.loc[is_2017]

In [12]:
# Left-join the 2017 snapshot onto the 1960 one (a 1960 country with no 2017 row
# gets NaN in the *_y columns), then compute the gain in average schooling years
# as the 2017 value (_y) minus the 1960 value (_x).
df_education_improved = (
    df_1960
    .merge(df_2017, how='left', on='Entity')
    .assign(
        years_improved=lambda merged: (
            merged['avg_years_of_schooling_y'] - merged['avg_years_of_schooling_x']
        )
    )
)

In [13]:
# Discard every row that still contains a missing value (e.g. no 2017 match).
df_education_improved = df_education_improved.dropna()

In [14]:
# Sanity check: list the distinct years present in the 1960 snapshot.
df_1960['Year'].unique()

array([1960])

In [15]:
# Rank countries by schooling gained, largest improvement first.
df_education_improved.sort_values(by='years_improved', ascending=False)

Unnamed: 0,Entity,Code_x,Year_x,avg_years_of_schooling_x,Code_y,Year_y,avg_years_of_schooling_y,years_improved
137,United Arab Emirates,ARE,1960,1.38,ARE,2017.0,10.8,9.42
58,Iran,IRN,1960,0.60,IRN,2017.0,9.8,9.20
75,Lithuania,LTU,1960,3.94,LTU,2017.0,13.0,9.06
65,Jordan,JOR,1960,1.42,JOR,2017.0,10.4,8.98
119,South Korea,KOR,1960,3.12,KOR,2017.0,12.1,8.98
...,...,...,...,...,...,...,...,...
80,Maldives,MDV,1960,3.42,MDV,2017.0,6.3,2.88
18,Burundi,BDI,1960,0.38,BDI,2017.0,3.0,2.62
81,Mali,MLI,1960,0.13,MLI,2017.0,2.3,2.17
96,Niger,NER,1960,0.39,NER,2017.0,2.0,1.61


In [16]:
#df_education_improved.to_csv('df_tratado_lucas')

### df_education_improved.nunique()

In [17]:
# Rebuild df_countries from the current df_educ: non-null counts per country,
# ordered so the countries with the most observations come first.
df_countries = (
    df_educ
    .groupby(['Entity'])
    .count()
    .sort_values(by='Code', ascending=False)
)

In [18]:
# Spot-check a single country to see which years it has rows for.
df_educ[df_educ['Entity']=='Algeria']

Unnamed: 0,Entity,Code,Year,avg_years_of_schooling
122,Algeria,DZA,1960,0.72
123,Algeria,DZA,1965,0.64
124,Algeria,DZA,1970,0.78
125,Algeria,DZA,1975,1.18
126,Algeria,DZA,1980,1.72
127,Algeria,DZA,1985,2.51
128,Algeria,DZA,1990,3.38
129,Algeria,DZA,1991,3.8
130,Algeria,DZA,1992,4.0
131,Algeria,DZA,1993,4.3


In [19]:
# NOTE(review): despite its name, df_1960 is rebound here to every row with
# Year >= 1960 (not only 1960), replacing the single-year snapshot from the
# earlier cell — confirm this is intended.
df_1960 = df_educ[df_educ['Year']>=1960]
# Rows for the year 2017 only.
df_2017 = df_educ[df_educ['Year']==2017]

In [20]:
# Left-join each df_1960 row with the same country's 2017 row.
# NOTE(review): `df_` is not referenced again in the visible cells — confirm it is still needed.
df_ = df_1960.merge(df_2017, how='left', on='Entity')
# Display df_1960 (not the merged frame).
df_1960

Unnamed: 0,Entity,Code,Year,avg_years_of_schooling
18,Afghanistan,AFG,1960,0.22
19,Afghanistan,AFG,1965,0.29
20,Afghanistan,AFG,1970,0.35
21,Afghanistan,AFG,1975,0.62
22,Afghanistan,AFG,1980,0.78
...,...,...,...,...
7758,Zimbabwe,ZWE,2013,8.00
7759,Zimbabwe,ZWE,2014,8.20
7760,Zimbabwe,ZWE,2015,8.20
7761,Zimbabwe,ZWE,2016,8.20


In [21]:
# Keep only the countries with 34 or more observations
# (34 is the maximum number of samples between 1960 and 2017).
df_countries = df_countries.loc[lambda counts: counts['Code'] >= 34]

In [22]:
# Number of non-null schooling values per country in df_educ.
df_educ[['Entity','avg_years_of_schooling']].groupby(['Entity']).count()

Unnamed: 0_level_0,avg_years_of_schooling
Entity,Unnamed: 1_level_1
Afghanistan,34
Albania,34
Algeria,34
Andorra,18
Angola,19
...,...
Venezuela,34
Vietnam,34
Yemen,34
Zambia,34


In [23]:
#sns.lineplot(data=df_educ, x='Year', y='avg_years_of_schooling', hue='Entity')

### df = pd.read_csv('./raw_data/treated.csv')

df.tail()

In [24]:
#sns.scatterplot(data=df[df['Year']==1990], x='fertility', y='avg_years_of_schooling', alpha=0.5)

In [26]:
# Load the fertility table; per the preview it has one row per country and one column per year.
data = pd.read_csv('./raw_data/fertility.csv')
data.head()

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,_World,4.98,5.0,5.03,5.05,5.06,5.04,4.99,4.97,4.92,...,2.5,2.49,2.47,2.46,2.46,2.44,2.43,2.41,2.4,2.39
1,Afghanistan,7.45,7.45,7.45,7.45,7.45,7.45,7.45,7.45,7.45,...,5.77,5.56,5.36,5.16,4.98,4.8,4.63,4.47,4.32,4.18
2,Albania,6.49,6.4,6.28,6.13,5.96,5.77,5.58,5.39,5.22,...,1.67,1.68,1.69,1.69,1.68,1.66,1.64,1.62,1.6,1.58
3,Algeria,7.52,7.57,7.61,7.65,7.67,7.68,7.68,7.67,7.67,...,2.91,2.95,2.99,3.02,3.04,3.05,3.05,3.02,2.99,2.94
4,Angola,6.71,6.79,6.87,6.95,7.04,7.12,7.19,7.27,7.33,...,6.12,6.04,5.95,5.86,5.77,5.69,5.6,5.52,5.44,5.37


In [27]:
# Order countries by their 1960 value, lowest first (display only; `data` is not modified).
data.sort_values(by='1960')

Unnamed: 0,Country,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
96,Latvia,1.94,1.94,1.91,1.85,1.79,1.74,1.76,1.80,1.83,...,1.33,1.44,1.52,1.65,1.70,1.74,1.69,1.60,1.61,1.55
55,Estonia,1.98,1.98,1.95,1.89,1.94,1.88,1.87,1.90,2.03,...,1.61,1.56,1.52,1.54,1.58,1.60,1.59,1.67,1.66,1.58
88,Japan,2.00,2.05,2.01,2.02,2.05,2.14,1.58,2.02,2.13,...,1.39,1.41,1.43,1.42,1.45,1.44,1.43,1.42,1.36,1.34
78,Hungary,2.02,1.94,1.79,1.82,1.82,1.82,1.89,2.01,2.06,...,1.23,1.34,1.35,1.44,1.45,1.53,1.54,1.55,1.55,1.56
46,Czechia,2.09,2.11,2.12,2.31,2.34,2.18,2.02,1.91,1.84,...,1.43,1.45,1.46,1.53,1.57,1.63,1.69,1.71,1.71,1.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,Cote d'Ivoire,7.69,7.72,7.75,7.78,7.81,7.84,7.87,7.89,7.91,...,5.03,4.97,4.92,4.87,4.81,4.76,4.70,4.65,4.59,4.54
89,Jordan,7.69,7.80,7.90,7.98,8.03,8.06,8.05,8.03,8.00,...,3.54,3.44,3.32,3.20,3.07,2.95,2.85,2.76,2.69,2.64
184,Yemen,7.94,7.96,7.99,8.03,8.07,8.11,8.17,8.22,8.28,...,4.55,4.44,4.33,4.21,4.10,3.99,3.89,3.79,3.70,3.61
91,Kenya,7.95,8.00,8.04,8.08,8.10,8.12,8.13,8.13,8.12,...,4.25,4.12,4.00,3.88,3.77,3.66,3.57,3.49,3.42,3.37


In [None]:
# Load the schooling file; header=2 takes the column names from the third line
# of the file, so the lines above it are not treated as data.
school = pd.read_csv('./raw_data/schooling.csv', header=2)
school.head()

In [None]:
# Remove the ten yearly columns '1960' through '1969' from `school`, in place.
cols_1960s = [str(year) for year in range(1960, 1970)]
school.drop(columns=cols_1960s, inplace=True)

In [None]:
# Average number of missing values per row.
school.isnull().sum(axis=1).mean()

In [None]:
# Number of missing values in the first row.
school.iloc[0,].isna().sum()

# --- new dataset ---

In [None]:
# Load the EdStats file and preview its first rows.
dfedu = pd.read_csv('./raw_data/edustats.csv')
dfedu.head()

In [None]:
# List every distinct indicator name available in the dataset.
dfedu['Indicator Name'].unique().tolist()

In [None]:
# Indicator selected for analysis: share of the population aged 15+ with no education (Barro-Lee).
ind = 'Barro-Lee: Percentage of population age 15+ with no education'

In [None]:
# Keep only the rows belonging to the selected indicator.
educ_df = dfedu[dfedu['Indicator Name'] == ind]

In [None]:
# Dimensions of the single-indicator subset as (rows, columns).
educ_df.shape

In [None]:
# Inspect the values recorded in the '1999' column.
educ_df['1999']

In [None]:
# Missing-value count for each column of educ_df, wrapped in a one-column DataFrame.
years_with_na = pd.DataFrame(educ_df.isna().sum())

In [None]:
# Tidy the missing-value table in one chain:
#   1. name the count column 'nan';
#   2. move the original column labels out of the index into an 'index' column;
#   3. drop the first four rows (labels 0-3) — presumably the non-year
#      descriptor columns of educ_df; TODO confirm.
years_with_na = (
    years_with_na
    .rename(columns={0: 'nan'})
    .reset_index()
    .drop(labels=[0, 1, 2, 3])
)

years_with_na.head()

In [None]:
# Entries with 240 or more missing values.
# NOTE(review): the 240 threshold is not explained in the visible cells — document where it comes from.
years_with_na[years_with_na['nan']>=240]

More than

In [None]:
# Missing-value counts for the country name and four selected years.
educ_df[['Country Name','1980', '2000', '2010', '2020']].isna().sum()

In [29]:
# Load the treated dataset (fertility and average years of schooling per country and year, per the preview).
test = pd.read_csv('./raw_data/treated.csv')
test.head()

Unnamed: 0.1,Unnamed: 0,Country,Year,fertility,Code,avg_years_of_schooling
0,0,Afghanistan,2007,6.56,AFG,2.9
1,1,Afghanistan,1995,7.61,AFG,1.9
2,2,Afghanistan,2008,6.37,AFG,3.0
3,3,Afghanistan,2017,4.63,AFG,3.8
4,4,Afghanistan,1994,7.57,AFG,1.8


In [30]:
# Discard the 'Unnamed: 0' column (it mirrors the row index in the preview,
# presumably a saved index — TODO confirm) and preview the result.
test = test.drop(columns=['Unnamed: 0'])
test.head()

Unnamed: 0,Country,Year,fertility,Code,avg_years_of_schooling
0,Afghanistan,2007,6.56,AFG,2.9
1,Afghanistan,1995,7.61,AFG,1.9
2,Afghanistan,2008,6.37,AFG,3.0
3,Afghanistan,2017,4.63,AFG,3.8
4,Afghanistan,1994,7.57,AFG,1.8


In [31]:
# Distinct country names, in order of first appearance
# ("paises" is Portuguese for "countries").
paises = test['Country'].unique().tolist()

In [53]:
# One-row placeholder frame (values 1..5) carrying the five column names used by `test`.
world_columns = ['Country', 'Year', 'fertility', 'Code', 'avg_years_of_schooling']
df_world = pd.DataFrame(data=np.array([[1, 2, 3, 4, 5]]), columns=world_columns)

df_world.head()

Unnamed: 0,Country,Year,fertility,Code,avg_years_of_schooling
0,1,2,3,4,5


In [59]:
# Start from an empty list (the next cell resets it again before filling it).
df_list = list()

In [60]:
# Build one DataFrame per country by filtering `test` on each name in `paises`.
df_list = [test[test['Country'] == pais] for pais in paises]

# `new_df` ends up as the last country's frame (or an empty frame when there
# are no countries); a later cell still reads it.
new_df = df_list[-1] if df_list else pd.DataFrame()

# TODO: order the dataset
# TODO: keep only fertility and avg_years_of_schooling in X (with year as index)
# TODO: y = only avg_years_of_schooling (with year as index)

In [70]:
# Display the list of per-country frames.
# NOTE(review): the saved output is an AttributeError about `.shape` on a list,
# which this line cannot raise — the output looks stale; re-run to refresh it.
df_list

AttributeError: 'list' object has no attribute 'shape'

In [55]:
# Number of distinct values in df_world's Country column.
# NOTE(review): the saved result (166) does not match the one-row placeholder
# built in the visible df_world cell; it presumably comes from an earlier,
# since-changed run — re-run the notebook top to bottom to confirm.
df_world.Country.nunique()

166

In [57]:
# Dimensions of df_world as (rows, columns).
# NOTE(review): the saved result (5126, 5) does not match the one-row placeholder
# built in the visible df_world cell — the output looks stale; re-run to confirm.
df_world.shape

(5126, 5)

In [None]:
# Stack df_world on top of new_df row-wise; the result is only displayed, not stored.
pd.concat([df_world, new_df])

In [None]:
# Display df_world in full.
df_world

In [None]:
# TODO: get the list of all countries
# TODO: run a for loop that filters the original df and stores each filtered result in a new df

In [None]:
# NOTE(review): these imports sit mid-notebook rather than in the import cell at the top.
from tensorflow.keras import Sequential, layers
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; random_state=3 fixes the shuffle so the split is repeatable.
# NOTE(review): `X` and `y` are not defined in any visible cell, so this line
# raises NameError on a fresh kernel — define them before this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
# Empty Sequential model; no layers are added in the visible cells.
model = Sequential()
