In [94]:
#Import libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

In [108]:
# Read in the raw csv data file for public education expenditure
csvfile = pd.read_csv("data_source/raw_data/public_expenditure_education.csv")

# The first row of the file holds the real column names: keep it as the header
# and take an explicit copy of the remaining rows, so the column assignments
# below act on an independent frame rather than on a slice of the raw data
new_header = csvfile.iloc[0]
raw_df = csvfile.iloc[1:].copy()
raw_df.columns = new_header

# Fill in blank column header
raw_df.columns = raw_df.columns.fillna('Country Name')

# Convert Value to float (rounded to 2 decimals) and Year to int
raw_df['Value'] = raw_df['Value'].astype(float).round(2)
raw_df['Year'] = raw_df['Year'].astype(int)

# Keep only the needed columns (drops Source, Footnote) and rename them
clean_df = raw_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
clean_df = clean_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'Education Spend (%)'})

# Keep only the two education expenditure series and shorten their labels
education_series_labels = {
    'Public expenditure on education (% of government expenditure)': 'Government Expenditure',
    'Public expenditure on education (% of GDP)': 'GDP',
}
clean_df = clean_df.loc[clean_df['Series'].isin(list(education_series_labels))].copy()
clean_df['Series'] = clean_df['Series'].replace(education_series_labels)
clean_df = clean_df.reset_index(drop=True)

# Display the data frame
clean_df.head(10)


Unnamed: 0,Country ID,Series,Country Name,Year,Education Spend (%)
0,4,Government Expenditure,Afghanistan,2010,17.07
1,4,GDP,Afghanistan,2010,3.46
2,4,Government Expenditure,Afghanistan,2015,12.51
3,4,GDP,Afghanistan,2015,3.24
4,4,Government Expenditure,Afghanistan,2017,15.66
5,4,GDP,Afghanistan,2017,3.93
6,8,Government Expenditure,Albania,2005,11.36
7,8,GDP,Albania,2005,3.24
8,8,Government Expenditure,Albania,2007,11.18
9,8,GDP,Albania,2007,3.27


In [96]:
# Write to csv in output_data; create the folder first because to_csv does not
# create missing directories
os.makedirs('data_source/output_data', exist_ok=True)
clean_df.to_csv('data_source/output_data/public_education_clean.csv', index=False, header=True)

In [97]:
# Read in the raw csv data file for Consumer Price Index
csvfile2 = pd.read_csv("data_source/raw_data/consumer_price_index.csv", encoding='latin-1')

# The first row of the file holds the real column names. Use THIS file's own
# first row as the header (not the header captured from the education file in
# an earlier cell), and copy the remaining rows so the column assignments
# below act on an independent frame
raw2_header = csvfile2.iloc[0]
raw2_df = csvfile2.iloc[1:].copy()
raw2_df.columns = raw2_header

# Fill in blank column header
raw2_df.columns = raw2_df.columns.fillna('Country Name')

# Convert Value to float (rounded to 2 decimals) and Year to int
raw2_df['Value'] = raw2_df['Value'].astype(float).round(2)
raw2_df['Year'] = raw2_df['Year'].astype(int)

# Keep only the needed columns (drops Source, Footnote) and rename them
cpi_df = raw2_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
cpi_df = cpi_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'CPI'})

# Rename values in Series
cpi_series_labels = {
    'Consumer price index: General': 'General',
    'Consumer price index: Food': 'Food',
}
cpi_df['Series'] = cpi_df['Series'].replace(cpi_series_labels)
cpi_df = cpi_df.reset_index(drop=True)

# Display the data frame
cpi_df.head(10)

Unnamed: 0,Country ID,Series,Country Name,Year,CPI
0,728,General,South Sudan,2017,4583.71
1,728,Food,South Sudan,2017,4574.02
2,728,Food,South Sudan,2016,1660.87
3,728,General,South Sudan,2016,1592.38
4,862,Food,Venezuela (Boliv. Rep. of),2015,1487.2
5,760,Food,Syrian Arab Republic,2016,807.3
6,862,General,Venezuela (Boliv. Rep. of),2015,772.02
7,760,General,Syrian Arab Republic,2016,662.9
8,90,Food,Solomon Islands,2016,555.4
9,90,Food,Solomon Islands,2015,552.52


In [98]:
# Write to csv in output_data; create the folder first because to_csv does not
# create missing directories
os.makedirs('data_source/output_data', exist_ok=True)
cpi_df.to_csv('data_source/output_data/cpi_clean.csv', index=False, header=True)

In [99]:
# Read in the raw csv data file for GDP & GDP per Capita
csvfile3 = pd.read_csv("data_source/raw_data/gdp_and_gdp_per_capita.csv", encoding='latin-1')

# The first row of the file holds the real column names. Use THIS file's own
# first row as the header (not the header captured from the education file in
# an earlier cell), and copy the remaining rows so the column assignments
# below act on an independent frame
raw3_header = csvfile3.iloc[0]
raw3_df = csvfile3.iloc[1:].copy()
raw3_df.columns = raw3_header

# Fill in blank column header
raw3_df.columns = raw3_df.columns.fillna('Country Name')

# Convert Value to numeric (entries that cannot be parsed become NaN) and Year to int
raw3_df['Value'] = pd.to_numeric(raw3_df['Value'], errors='coerce')
raw3_df['Year'] = raw3_df['Year'].astype(int)

# Keep only the needed columns (drops Source, Footnote) and rename them
gdp_df = raw3_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
gdp_df = gdp_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'GDP'})

# Remove Country ID 1 (Totals); IDs are compared as the string '1'
totals = gdp_df[gdp_df['Country ID'] == '1'].index
gdp_df = gdp_df.drop(totals)

# Rename values in Series column
# NOTE(review): the short label for the constant-2010 series no longer says
# "millions" although the source label does - confirm before changing, since
# downstream code may key on these labels
gdp_series_labels = {
    'GDP in current prices (millions of US dollars)': 'GDP (millions of $)',
    'GDP per capita (US dollars)': 'GDP per Capita ($)',
    'GDP in constant 2010 prices (millions of US dollars)': 'GDP in 2010 dollars ($)',
}
gdp_df['Series'] = gdp_df['Series'].replace(gdp_series_labels)
gdp_df = gdp_df.reset_index(drop=True)

# Split the growth-rate (percent) rows into their own DataFrame and remove
# them from gdp_df; neither frame is re-indexed after the split
percents = gdp_df[gdp_df['Series'] == 'GDP real rates of growth (percent)'].index
gdp_percent_df = gdp_df.loc[percents].copy()
gdp_df = gdp_df.drop(percents)

# Display the data frame
gdp_df.head()


Unnamed: 0,Country ID,Series,Country Name,Year,GDP
0,2,GDP (millions of $),Africa,1985,517301.0
1,2,GDP (millions of $),Africa,1995,582440.0
2,2,GDP (millions of $),Africa,2005,1128718.0
3,2,GDP (millions of $),Africa,2010,1966483.0
4,2,GDP (millions of $),Africa,2015,2293977.0


In [100]:
# Write both Data Frames to csv in output_data folder; create the folder first
# because to_csv does not create missing directories
os.makedirs('data_source/output_data', exist_ok=True)
gdp_df.to_csv('data_source/output_data/gdp_clean.csv', index=False, header=True)
gdp_percent_df.to_csv('data_source/output_data/gdp_percent_clean.csv', index=False, header=True)

In [103]:
# Read in the raw csv data file for expenditure on R&D
csvfile4 = pd.read_csv("data_source/raw_data/gdp_r&d.csv", encoding='latin-1')

# The first row of the file holds the real column names. Use THIS file's own
# first row as the header (not the header captured from the education file in
# an earlier cell), and copy the remaining rows so the column assignments
# below act on an independent frame
raw4_header = csvfile4.iloc[0]
raw4_df = csvfile4.iloc[1:].copy()
raw4_df.columns = raw4_header

# Fill in blank column header
raw4_df.columns = raw4_df.columns.fillna('Country Name')

# Convert Value to numeric (entries that cannot be parsed become NaN) and Year to int
raw4_df['Value'] = pd.to_numeric(raw4_df['Value'], errors='coerce')
raw4_df['Year'] = raw4_df['Year'].astype(int)

# Keep only the needed columns (drops Source, Footnote) and rename them
rnd_df = raw4_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
rnd_df = rnd_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'Expenditure on R&D (%)'})

# Remove Country ID 1 (Totals); IDs are compared as the string '1'
totals = rnd_df[rnd_df['Country ID'] == '1'].index
rnd_df = rnd_df.drop(totals)

# Rename values in Series column
rnd_series_labels = {
    'Gross domestic expenditure on R & D: as a percentage of GDP (%)': 'GDP(%)',
    'Gross domestic expenditure on R & D: Business enterprises (%)': 'Business Enterprises (%)',
    'Gross domestic expenditure on R & D: Government (%)': 'Government (%)',
    'Gross domestic expenditure on R & D: Higher education (%)': 'Higher education (%)',
    'Gross domestic expenditure on R & D: Funds from abroad (%)': 'Funds from abroad (%)',
    'Gross domestic expenditure on R & D: Private non-profit (%)': 'Private non-profit (%)',
    'Gross domestic expenditure on R & D: Not distributed (%)': 'Not distributed (%)',
}
rnd_df['Series'] = rnd_df['Series'].replace(rnd_series_labels)
rnd_df = rnd_df.reset_index(drop=True)

# Display the data frame
rnd_df.head()


Unnamed: 0,Country ID,Series,Country Name,Year,Expenditure on R&D (%)
0,15,GDP(%),Northern Africa,2005,0.3
1,15,GDP(%),Northern Africa,2010,0.4
2,15,GDP(%),Northern Africa,2015,0.5
3,202,GDP(%),Sub-Saharan Africa,2005,0.4
4,202,GDP(%),Sub-Saharan Africa,2010,0.4


In [104]:
# Write Data Frame to csv in output_data folder; create the folder first
# because to_csv does not create missing directories
os.makedirs('data_source/output_data', exist_ok=True)
rnd_df.to_csv('data_source/output_data/rnd_gdp_clean.csv', index=False, header=True)

In [137]:
# Read in the raw csv data file for Crime Rate
csvfile5 = pd.read_csv("data_source/raw_data/crimes.csv", encoding='latin-1')

# The first row of the file holds the real column names. Use THIS file's own
# first row as the header (not the header captured from the education file in
# an earlier cell), and copy the remaining rows so the column assignments
# below act on an independent frame
raw5_header = csvfile5.iloc[0]
raw5_df = csvfile5.iloc[1:].copy()
raw5_df.columns = raw5_header

# Fill in blank column header
raw5_df.columns = raw5_df.columns.fillna('Country Name')

# Convert Value to numeric (entries that cannot be parsed become NaN) and Year to int
raw5_df['Value'] = pd.to_numeric(raw5_df['Value'], errors='coerce')
raw5_df['Year'] = raw5_df['Year'].astype(int)

# Keep only the needed columns (drops Source, Footnote) and rename them
crime_df = raw5_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
crime_df = crime_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'Rate per 100,000'})

# Remove Country ID 1 (Totals); IDs are compared as the string '1'
totals = crime_df[crime_df['Country ID'] == '1'].index
crime_df = crime_df.drop(totals)

# Rename values in Series column
# NOTE(review): the source has a single series label covering both male and
# female homicide victims. It is mapped to 'Male Victim (%)'; a second replace
# of the same label to 'Female Victim (%)' could never match once the first had
# run, so it has been removed. Telling the two apart presumably needs another
# column (e.g. Footnote) - TODO confirm against the raw file.
crime_series_labels = {
    'Intentional homicide rates per 100,000': 'Homicide (%)',
    'Percentage of male and female intentional homicide victims': 'Male Victim (%)',
    'Theft at the national level, rate per 100,000 population': 'Theft (%)',
    'Assault rate per 100,000 population': 'Assault (%)',
    'Robbery at the national level, rate per 100,000 population': 'Robbery (%)',
    'Total Sexual Violence at the national level, rate per 100,000': 'Sexual Violence (%)',
    'Kidnapping at the national level, rate per 100,000': 'Kidnapping (%)',
}
crime_df['Series'] = crime_df['Series'].replace(crime_series_labels)
crime_df = crime_df.reset_index(drop=True)

# Display the data frame
crime_df.head()

Unnamed: 0,Country ID,Series,Country Name,Year,"Rate per 100,000"
0,202,Homicide (%),Sub-Saharan Africa,2015,9.6
1,419,Homicide (%),Latin America & the Caribbean,2015,22.3
2,62,Homicide (%),South-central Asia,2015,3.7
3,9,Homicide (%),Oceania,2015,8.8
4,53,Homicide (%),Australia and New Zealand,2015,1.0


In [138]:
# Write Data Frame to csv in output_data folder; create the folder first
# because to_csv does not create missing directories
os.makedirs('data_source/output_data', exist_ok=True)
crime_df.to_csv('data_source/output_data/crime_clean.csv', index=False, header=True)

In [140]:
# Read in the raw csv data file for Labour Force & Unemployment
csvfile6 = pd.read_csv("data_source/raw_data/labour_force_unemployment.csv", encoding='latin-1')

# The first row of the file holds the real column names. Use THIS file's own
# first row as the header (not the header captured from the education file in
# an earlier cell), and copy the remaining rows so the column assignments
# below act on an independent frame
raw6_header = csvfile6.iloc[0]
raw6_df = csvfile6.iloc[1:].copy()
raw6_df.columns = raw6_header

# Fill in blank column header
raw6_df.columns = raw6_df.columns.fillna('Country Name')

# Convert Value to numeric (entries that cannot be parsed become NaN) and Year to int
raw6_df['Value'] = pd.to_numeric(raw6_df['Value'], errors='coerce')
raw6_df['Year'] = raw6_df['Year'].astype(int)

# Keep only the needed columns (drops Source, Footnote) and rename them
labor_df = raw6_df[['Region/Country/Area', 'Series', 'Country Name', 'Year', 'Value']]
labor_df = labor_df.rename(columns={'Region/Country/Area': 'Country ID', 'Value': 'Participation/Rate (%)'})

# Remove Country ID 1 (Totals); IDs are compared as the string '1'
totals = labor_df[labor_df['Country ID'] == '1'].index
labor_df = labor_df.drop(totals)

# Rename values in Series column
labor_series_labels = {
    'Labour force participation - Total': 'Total Labour Force',
    'Unemployment rate - Total': 'Total Unemployment',
    'Labour force participation - Male': 'Male Labour Force',
    'Labour force participation - Female': 'Female Labour Force',
    'Unemployment rate - Male': 'Male Unemployment',
    'Unemployment rate - Female': 'Female Unemployment',
}
labor_df['Series'] = labor_df['Series'].replace(labor_series_labels)
labor_df = labor_df.reset_index(drop=True)

# Display the data frame
labor_df.head()

Unnamed: 0,Country ID,Series,Country Name,Year,Participation/Rate (%)
0,2,Total Labour Force,Africa,2005,64.0
1,2,Total Unemployment,Africa,2005,8.4
2,2,Male Labour Force,Africa,2005,74.8
3,2,Male Unemployment,Africa,2005,7.2
4,2,Female Labour Force,Africa,2005,53.5


In [141]:
# Write Data Frame to csv in output_data folder; create the folder first
# because to_csv does not create missing directories
os.makedirs('data_source/output_data', exist_ok=True)
labor_df.to_csv('data_source/output_data/labor_clean.csv', index=False, header=True)

In [None]:
#Create Data Frame grouped by Year