# EDA and Cleaning USAID Data

In [64]:
# Importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [65]:
# Reading in USAID data
us_aid = pd.read_csv('./aid_data/usaid/us_foreign_aid_complete.csv')

In [66]:
# Looking at the data frame
us_aid.head()

Unnamed: 0,country_id,country_code,country_name,region_id,region_name,income_group_id,income_group_name,income_group_acronym,implementing_agency_id,implementing_agency_acronym,...,activity_start_date,activity_end_date,transaction_type_id,transaction_type_name,fiscal_year,current_amount,constant_amount,USG_sector_id,USG_sector_name,submission_id
0,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,...,,,2,Obligations,2011,9941000000,11172173522,3,Stabilization Operations and Security Sector R...,28
1,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,...,,,2,Obligations,2012,9243000000,10195234944,3,Stabilization Operations and Security Sector R...,28
2,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,...,,,3,Disbursements,2011,7840175215,8811165672,3,Stabilization Operations and Security Sector R...,28
3,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,...,,,3,Disbursements,2013,7764310985,8409304652,3,Stabilization Operations and Security Sector R...,28
4,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,...,,,2,Obligations,2013,6928000000,7503519983,3,Stabilization Operations and Security Sector R...,28


In [67]:
# Looking at the regions in the data 
us_aid['region_name'].unique()

array(['South and Central Asia', 'Middle East and North Africa',
       'Europe and Eurasia', 'East Asia and Oceania', 'World',
       'Western Hemisphere', 'Sub-Saharan Africa'], dtype=object)

In [68]:
# I am adding Turkey as seperate because it is under Europe in this data
turkey = us_aid[us_aid['country_name'] == 'Turkey'] 

In [69]:
# Including Middle East and North Africa in my data
us_aid1 = us_aid[us_aid['region_name'] == 'Middle East and North Africa'] 

In [None]:
# Including Sub-Saharan Africa in my data 
us_aid2 = us_aid[us_aid['region_name'] == 'Sub-Saharan Africa'] 

In [None]:
# Combining all of data frames
us_aid = pd.concat([us_aid1, us_aid2, turkey])

In [None]:
# Looking at the data types and null values 
us_aid.info()

In [None]:
# Looking at the counts by sector names
us_aid['dac_sector_name'].value_counts()

In [None]:
# Looking at the values for this. Not a very useful column, so I will not keep it. 
us_aid['dac_purpose_name'].head()

In [None]:
# Looking at the values 
us_aid['funding_agency_name'].value_counts()

In [None]:
# Saving the columns I want to save for modeling/analysis 
us_aid = us_aid[['country_name', 'activity_name', 'dac_sector_name', 'funding_agency_name', 'fiscal_year', 'constant_amount']]

In [None]:
# Looking at my new dataframe
us_aid.head()

In [None]:
# Looking at the info to check for null values and data types
# No null value, but need to change fiscal year to numeric type
us_aid.info()

In [None]:
# Changing fiscal year to a numeric data type 
us_aid['fiscal_year'] = pd.to_numeric(us_aid['fiscal_year'], errors='coerce')

In [None]:
# Checking to see if the numeric change took
us_aid.info()

In [None]:
# Looking at the range of values, min, max, etc. 
us_aid['fiscal_year'].describe()

In [None]:
# Getting rid of any values before 2000
us_aid = us_aid[us_aid['fiscal_year'] >= 2000] 

In [None]:
# Looking at my new range of values 
us_aid['fiscal_year'].describe()

In [None]:
# Removing values above 2014
us_aid = us_aid[us_aid['fiscal_year'] <= 2014] 

In [None]:
# Looking at my new range of values
us_aid['fiscal_year'].describe()

In [None]:
# Looking at my new value counts
us_aid.info()

In [None]:
# Looking at unique country names 
us_aid['country_name'].unique()

In [None]:
# Changing the names to be consistent 
us_aid.replace({'country_name' : {'West Bank/Gaza' : 'Palestine',
                    'Sudan (former)' : 'Sudan',
                    "Cote d'Ivoire" : "Cote D'Ivoire",
                    'Congo (Kinshasa)': 'Democratic Republic of Congo',
                    'Congo (Brazzaville)' : 'Congo',
                    }}, inplace=True)

In [None]:
# Grouping by country names and amounts to make sums for each country
usaid_sums = us_aid.groupby('country_name')['constant_amount'].sum()

In [None]:
# Lookng at my new data
usaid_sums.head(5)

In [None]:
# Saving my data to a data frame
usaid_sums = usaid_sums.to_frame()

In [None]:
# Renaming the column
usaid_sums.rename(columns={'constant_amount': 'usaid_totals'}, inplace=True)

In [None]:
# Saving my data frame to a csv
usaid_sums.to_csv('./aid_data/usaid/usaid_sums.csv', index=True)

In [45]:
usaid_yearly_sums = us_aid.groupby(['country_name', 'fiscal_year'])['constant_amount'].sum()

In [46]:
usaid_yearly_sums = usaid_yearly_sums.to_frame()

In [47]:
usaid_yearly_sums.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,constant_amount
country_name,fiscal_year,Unnamed: 2_level_1
Algeria,2000.0,1696369
Algeria,2001.0,4678370
Algeria,2002.0,10008641
Algeria,2003.0,9894601
Algeria,2004.0,10618022
Algeria,2005.0,6266475
Algeria,2006.0,6411946
Algeria,2007.0,10812148
Algeria,2008.0,26320757
Algeria,2009.0,24412834


In [48]:
usaid_yearly_sums.rename(columns={'constant_amount': 'usaid_totals'}, inplace=True)

In [49]:
usaid_yearly_sums.rename(columns={'fiscal_year': 'year'}, inplace=True)

In [50]:
usaid_yearly_sums.rename(columns={'constant_amount' : 'usaid_totals', 'fiscal_year' : 'year'}, inplace=True)

In [51]:
usaid_yearly_sums.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,usaid_totals
country_name,fiscal_year,Unnamed: 2_level_1
Algeria,2000.0,1696369
Algeria,2001.0,4678370
Algeria,2002.0,10008641
Algeria,2003.0,9894601
Algeria,2004.0,10618022


In [52]:
usaid_yearly_sums.to_csv('./aid_data/usaid/usaid_yearly_sums.csv', index=True)