In [44]:
#Import libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

In [60]:
# Read in the raw csv data file for public education expenditure
csvfile = pd.read_csv("data_source/raw_data/public_expenditure_education.csv")
# The first data row holds the real column names, so promote it to the header
# and keep only the rows below it.
new_header = csvfile.iloc[0]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the parsed file (chained assignment).
raw_df = csvfile[1:].copy()
raw_df.columns = new_header
# Fill in the blank column header
raw_df.columns = raw_df.columns.fillna('Country Name')
# Convert Value to float rounded to 2 decimals, and Year to int
raw_df['Value'] = raw_df['Value'].astype(float).round(2)
raw_df['Year'] = raw_df['Year'].astype(int)
# Keep only the columns that are needed (drops Source, Footnote)
clean_df = raw_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
clean_df = clean_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'Education Spend (%)'})

# Series values to keep, mapped to the short labels used from here on
series_labels = {
    'Public expenditure on education (% of government expenditure)': 'Government Expenditure',
    'Public expenditure on education (% of GDP)': 'GDP',
}
# Keep only the rows for those two series, then shorten their labels
clean_df = clean_df.loc[clean_df['Series'].isin(list(series_labels))].copy()
clean_df['Series'] = clean_df['Series'].replace(series_labels)
clean_df = clean_df.reset_index(drop=True)

# Display the data frame
clean_df.head(10)


Unnamed: 0,Country ID,Series,Country Name,Year,Education Spend (%)
0,4,Government Expenditure,Afghanistan,2010,17.07
1,4,GDP,Afghanistan,2010,3.46
2,4,Government Expenditure,Afghanistan,2015,12.51
3,4,GDP,Afghanistan,2015,3.24
4,4,Government Expenditure,Afghanistan,2017,15.66
5,4,GDP,Afghanistan,2017,3.93
6,8,Government Expenditure,Albania,2005,11.36
7,8,GDP,Albania,2005,3.24
8,8,Government Expenditure,Albania,2007,11.18
9,8,GDP,Albania,2007,3.27


In [61]:
# Save the cleaned education data to output_data: column names kept, row index omitted
clean_df.to_csv(
    'data_source/output_data/public_education_clean.csv',
    header=True,
    index=False,
)

In [62]:
# Read in the raw csv data file for the Consumer Price Index
csvfile2 = pd.read_csv("data_source/raw_data/consumer_price_index.csv", encoding='latin-1')
# The first data row holds the real column names, so promote it to the header
# and keep only the rows below it.
raw2_header = csvfile2.iloc[0]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the parsed file (chained assignment).
raw2_df = csvfile2[1:].copy()
# Use this file's own header row, so the cell does not depend on a variable
# left behind by the education cell.
raw2_df.columns = raw2_header
# Fill in the blank column header
raw2_df.columns = raw2_df.columns.fillna('Country Name')
# Convert Value to float rounded to 2 decimals, and Year to int
raw2_df['Value'] = raw2_df['Value'].astype(float).round(2)
raw2_df['Year'] = raw2_df['Year'].astype(int)
# Keep only the columns that are needed (drops Source, Footnote)
cpi_df = raw2_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
cpi_df = cpi_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'CPI'})

# Shorten the Series labels; values not listed here are left unchanged
cpi_df['Series'] = cpi_df['Series'].replace({
    'Consumer price index: General': 'General',
    'Consumer price index: Food': 'Food',
})
cpi_df = cpi_df.reset_index(drop=True)
# Display the data frame
cpi_df.head(10)

Unnamed: 0,Country ID,Series,Country Name,Year,CPI
0,728,General,South Sudan,2017,4583.71
1,728,Food,South Sudan,2017,4574.02
2,728,Food,South Sudan,2016,1660.87
3,728,General,South Sudan,2016,1592.38
4,862,Food,Venezuela (Boliv. Rep. of),2015,1487.2
5,760,Food,Syrian Arab Republic,2016,807.3
6,862,General,Venezuela (Boliv. Rep. of),2015,772.02
7,760,General,Syrian Arab Republic,2016,662.9
8,90,Food,Solomon Islands,2016,555.4
9,90,Food,Solomon Islands,2015,552.52


In [63]:
# Save the cleaned CPI data to output_data: column names kept, row index omitted
cpi_df.to_csv(
    'data_source/output_data/cpi_clean.csv',
    header=True,
    index=False,
)

In [92]:
# Read in the raw csv data file for GDP and GDP per capita
csvfile3 = pd.read_csv("data_source/raw_data/gdp_and_gdp_per_capita.csv", encoding='latin-1')
# The first data row holds the real column names, so promote it to the header
# and keep only the rows below it.
raw3_header = csvfile3.iloc[0]
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the parsed file (chained assignment).
raw3_df = csvfile3[1:].copy()
# Use this file's own header row, so the cell does not depend on a variable
# left behind by the education cell.
raw3_df.columns = raw3_header
# Fill in the blank column header
raw3_df.columns = raw3_df.columns.fillna('Country Name')
# Convert Value to numeric (entries that cannot be parsed become NaN) and Year to int
raw3_df['Value'] = pd.to_numeric(raw3_df['Value'], errors='coerce')
raw3_df['Year'] = raw3_df['Year'].astype(int)
# Keep only the columns that are needed (drops Source, Footnote)
gdp_df = raw3_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
gdp_df = gdp_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'GDP'})

# Remove Country ID 1 (Totals); the ID is compared against the string '1'
gdp_df = gdp_df.loc[gdp_df['Country ID'] != '1'].copy()

# Rename values in Series column; values not listed here are left unchanged
gdp_df['Series'] = gdp_df['Series'].replace({
    'GDP in current prices (millions of US dollars)': 'GDP (millions of $)',
    'GDP per capita (US dollars)': 'GDP per Capita ($)',
    'GDP in constant 2010 prices (millions of US dollars)': 'GDP in 2010 dollars ($)',
})
gdp_df = gdp_df.reset_index(drop=True)

# Split the growth-rate (percent) rows into their own frame and remove them
# from gdp_df. Both frames keep the index labels assigned by reset_index above.
is_growth_rate = gdp_df['Series'] == 'GDP real rates of growth (percent)'
gdp_percent_df = gdp_df.loc[is_growth_rate].copy()
gdp_df = gdp_df.loc[~is_growth_rate].copy()

# Display the data frame
gdp_df.head()


Unnamed: 0,Country ID,Series,Country Name,Year,GDP
0,2,GDP (millions of $),Africa,1985,517301.0
1,2,GDP (millions of $),Africa,1995,582440.0
2,2,GDP (millions of $),Africa,2005,1128718.0
3,2,GDP (millions of $),Africa,2010,1966483.0
4,2,GDP (millions of $),Africa,2015,2293977.0


In [7]:
# Read in the raw csv data file for R&D expenditure as a % of GDP

#Convert raw data files to Data Frame

#Remove columns that are not needed (Series, Source, Footnote)

#Display the data frame

In [8]:
# Merge data frames to Public education data frame

#Drop N/A values

#Display clean DataFrame

In [9]:
# Write Data Frame to csv file and save to data_source folder

In [10]:
# Create Data Frame grouped by Country

In [None]:
#Create Data Frame grouped by Year