In [1]:
import pandas as pd
import numpy as np
import copy
import itertools as itr
import wbdata
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
%store -r wb_codes_df
%store -r world_1994
%store -r wb_data_df
%store -r world_wbdata_2016
%store -r fips_df

pd.options.display.max_columns = 150
path = 'c:/Users/silas/MIDS/jupyternotebooks/USAID/'

In [None]:
'''FA COLUMN HEADINGS
Index(['DateYear', 'country_id', 'country_code', 'country_name', 'region_id',
       'region_name', 'income_group_id', 'income_group_name',
       'income_group_acronym', 'implementing_agency_id',
       'implementing_agency_acronym', 'implementing_agency_name',
       'implementing_subagency_id', 'subagency_acronym', 'subagency_name',
       'channel_category_id', 'channel_category_name',
       'channel_subcategory_id', 'channel_subcategory_name', 'channel_id',
       'channel_name', 'dac_category_id', 'dac_category_name',
       'dac_sector_code', 'dac_sector_name', 'dac_purpose_code',
       'dac_purpose_name', 'funding_account_id', 'funding_account_name',
       'funding_agency_id', 'funding_agency_name', 'funding_agency_acronym',
       'assistance_category_id', 'assistance_category_name',
       'aid_type_group_id', 'aid_type_group_name', 'activity_id',
       'activity_name', 'activity_project_number', 'activity_start_date',
       'activity_end_date', 'transaction_type_id', 'transaction_type_name',
       'fiscal_year', 'current_amount', 'constant_amount', 'USG_sector_id',
       'USG_sector_name', 'framework', 'submission_id', 'numeric_year']
'''

'''Free (1.0 to 2.5), Partly Free (3.0 to 5.0), or Not Free (5.5 to 7.0)
https://freedomhouse.org/report/methodology-freedom-world-2017
'''

In [2]:
# read in foreign aid (fa) data and copy to data frame
fa_data_in = pd.read_csv('{}us_foreign_aid_complete (1).csv'.format(path), low_memory=False)

In [43]:
df_fa = copy.copy(fa_data_in.reset_index())

In [4]:
# read in freedom house (fh) data and copy to data frame
fh_data_in = pd.read_csv('{}FH_Country_and_Territory_Ratings_and_Statuses_1972-2016.csv'.format(path))

In [22]:
fh_df = copy.copy(fh_data_in)

### Freedom House dataframe preparation

In [23]:
# prepare for melt fh df so that every country has for each year its cl, pr, status
fh_df = fh_df.T.reset_index(drop=True)
fh_df.columns = fh_df.loc[0]
fh_df = fh_df.drop(0)
fh_df['Year(s) Under Review'] = fh_df['Year(s) Under Review'].ffill()

In [24]:
# corrects headers for FH dataset and prepares entries for merge with fa data
fh_c_list = list(fh_df.columns)
fh_c_list = [i.replace('&', 'and') for i in fh_c_list[2:]]
fh_c_list = pd.Series(fh_c_list).str.replace('Yugoslavia (Serbia and Montenegro)', 'Serbia and Montenegro')
fh_c_list = pd.Series(fh_c_list).str.replace(', The', '')
fh_c_list = list(fh_c_list)
fh_c_list.insert(0, 'FH_Score')
fh_c_list.insert(0, 'Year')
fh_df.columns = fh_c_list

In [27]:
# correcting year to single year note:a hack here where a single regex could work
fh_df['new'], fh_df['newer'] = fh_df['Year'].str.split('-').str
fh_df['new'] = fh_df['new'].str.replace(r'\D', '')
fh_df['Year'] = fh_df['new']
del fh_df['new']
del fh_df['newer']
# correct db for overlapping year
fh_df = fh_df.append(fh_df[27:30], ignore_index=True)

In [28]:
# create a new entry to get 1982 (because 1981 and 1982 were done in same year)
new_Y82 = fh_df.Year[132:]
fh_df.Year[132:] = list(map(lambda x: int(x)+1, new_Y82))

In [29]:
# bump each year up by one from '82 to '88
new_80s = fh_df.Year[30:51]
fh_df.Year[30:51] = list(map(lambda x: int(x)+1, new_80s))

In [30]:
fh_df.Year = fh_df.Year.astype(int)

In [31]:
fh_df = fh_df.sort_values(by='Year').reset_index(drop=True)

In [33]:
# switches columns FH score with Year
fh_melt = fh_df.set_index('FH_Score').reset_index()
# melts df on fh score and year
fh_melted = pd.melt(fh_melt, id_vars=['FH_Score', 'Year'], value_vars=list(fh_melt)[2:])
fh_melted.rename(columns={'variable':'country', 'value':'FH_value', 'Year': 'year'}, inplace=True)
# prep for selection
fh_melted.FH_Score = fh_melted['FH_Score'].str.strip()

### optional selection of FH category

In [34]:
# selection here example CL
fh_CL = fh_melted.loc[fh_melted['FH_Score'] == 'CL'].sort_values(by=['year', 'country'])
fh_CL = fh_CL.replace('-', fh_CL.replace(['-'], [np.nan]))

In [35]:
# caught one odd value
# fh_CL.iloc[162]['FH_value'] = '3'

In [36]:
# one value kept parenthesis - deleting parans
# fh_df = fh_df.set_value(162, 'FH_value', '3')

In [37]:
fh_CL = fh_CL.set_value(21871, 'FH_value', '3')

In [38]:
# prep FH dtypes for merge with FA
fh_CL.year = fh_CL.year.astype(int)
fh_CL.FH_value = fh_CL.FH_value.astype(float)

In [39]:
fh_CL_df = pd.DataFrame(fh_CL.groupby(['country', 'year'])['FH_value'].mean())
fh_CL_df.reset_index(inplace=True)
fh_CL_df.rename(columns={'Year':'year' }, inplace=True)

In [None]:
%store fh_CL_df

### Foreign Aid dataframe preparation

In [40]:
# prepare FA df for merge with FH on countries
'''
d = dict([('Cabo Verde', 'Cape Verde'),('China (P.R.C.)', 'China'),('China, Republic of (Taiwan)', 'Taiwan'),('Czechia', 'Czech Republic'),
('Germany (former East)', 'Germany, E.'),('Korea, Democratic Republic', 'North Korea'),('Korea Republic', 'South Korea'),
('Vietnam (former South)', 'Vietnam, S.'),('Yemen (former P.D.R.)', 'Yemen, S.')])
'''
# df_fa.country_name = pd.Series([d.get(e, e) for e in list(df_fa.country_name)])
# df_fa.country_name = df_fa.country_name.str.replace(r'\(.*', '')
# prepare FA df for merge with FH on year
# df_fa['index'] = df_fa.numeric_year.astype(int)
df_fa.rename(columns={'index': 'year','country_name': 'country' }, inplace=True) 
del df_fa['numeric_year']

In [42]:
df_fa.head()

Unnamed: 0,year,country_id,country_code,country,region_id,region_name,income_group_id,income_group_name,income_group_acronym,implementing_agency_id,implementing_agency_acronym,implementing_agency_name,implementing_subagency_id,subagency_acronym,subagency_name,channel_category_id,channel_category_name,channel_subcategory_id,channel_subcategory_name,channel_id,channel_name,dac_category_id,dac_category_name,dac_sector_code,dac_sector_name,dac_purpose_code,dac_purpose_name,funding_account_id,funding_account_name,funding_agency_id,funding_agency_name,funding_agency_acronym,assistance_category_id,assistance_category_name,aid_type_group_id,aid_type_group_name,activity_id,activity_name,activity_project_number,activity_start_date,activity_end_date,transaction_type_id,transaction_type_name,fiscal_year,current_amount,constant_amount,USG_sector_id,USG_sector_name,framework,submission_id
0,0,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,Department of Defense,999,,not applicable,1.0,Government,1.0,Government - United States,1000003,U.S. Government - Department of Defense,3,Governance,152,"Conflict, Peace, and Security",15210,Security system management and reform,21x2091,"Department of the Army, Afghanistan Security F...",8,Department of the Army,ARMY,2,Military,3,Project-Type,25078,Afghanistan Security Force Fund,,,,2,Obligations,2011,9941000000,10731991839,3,Stabilization Operations and Security Sector R...,,28
1,1,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,Department of Defense,999,,not applicable,1.0,Government,1.0,Government - United States,1000003,U.S. Government - Department of Defense,3,Governance,152,"Conflict, Peace, and Security",15210,Security system management and reform,21x2091,"Department of the Army, Afghanistan Security F...",8,Department of the Army,ARMY,2,Military,3,Project-Type,25078,Afghanistan Security Force Fund,,,,2,Obligations,2012,9243000000,9799467226,3,Stabilization Operations and Security Sector R...,,28
2,2,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,Department of Defense,999,,not applicable,1.0,Government,1.0,Government - United States,1000003,U.S. Government - Department of Defense,3,Governance,152,"Conflict, Peace, and Security",15210,Security system management and reform,21x2091,"Department of the Army, Afghanistan Security F...",8,Department of the Army,ARMY,2,Military,3,Project-Type,25078,Afghanistan Security Force Fund,,,,3,Disbursements,2011,7840175215,8464007285,3,Stabilization Operations and Security Sector R...,,28
3,3,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,Department of Defense,999,,not applicable,1.0,Government,1.0,Government - United States,1000003,U.S. Government - Department of Defense,3,Governance,152,"Conflict, Peace, and Security",15210,Security system management and reform,21x2091,"Department of the Army, Afghanistan Security F...",8,Department of the Army,ARMY,2,Military,3,Project-Type,25078,Afghanistan Security Force Fund,,,,3,Disbursements,2013,7764310985,8095048196,3,Stabilization Operations and Security Sector R...,,28
4,4,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,Department of Defense,999,,not applicable,1.0,Government,1.0,Government - United States,1000003,U.S. Government - Department of Defense,3,Governance,152,"Conflict, Peace, and Security",15210,Security system management and reform,21x2091,"Department of the Army, Afghanistan Security F...",8,Department of the Army,ARMY,2,Military,3,Project-Type,25078,Afghanistan Security Force Fund,,,,2,Obligations,2013,6928000000,7223112779,3,Stabilization Operations and Security Sector R...,,28


In [None]:
# select feature (here its amounts), reduce(agg years), select/add FH score
df_fa_prep = df_fa.groupby(['country', 'year','country_code', 'country_id'])['current_amount', 'constant_amount'].mean()
df_fa_prep['current_diff'] = df_fa_prep.current_amount.diff()
df_fa_prep['constant_diff'] = df_fa_prep.constant_amount.diff()
df_fa_prep['current_pct_chg'] = df_fa_prep.current_amount.pct_change()
df_fa_prep['constant_pct_chg'] = df_fa_prep.constant_amount.pct_change()
df_fa_prep.reset_index(inplace=True)
df_fa_prep.rename(columns={'country_name':'country'}, inplace=True)
%store df_fa_prep

In [None]:
df_fa_prep.head(2)

In [None]:
# df_fa_prep = df_fa.groupby(['country_name', 'Year'])['current_amount', 'constant_amount'].mean()

### Merges DataFrames: Freedom House, Foreign Aid, World Bank

In [None]:
%store -r fh_CL_df
%store -r df_fa_prep
%store -r wb_data_df

In [None]:
# merge FH and FA here
mgd_df = pd.merge(fh_CL_df, df_fa_prep, how='left', on=['country', 'year'])

In [None]:
# filling nan country codes
mgd_df.country_code = mgd_df.country_code.bfill()

In [None]:
# merge WB and FHFA here
FHFAWB_df = pd.merge(mgd_df, wb_data_df, how='left', on=['country', 'year'])

In [None]:
%store FHFAWB_df

In [None]:
FHFAWB_df.head(2)

### Calcs
- calculate diffs, pct change, and rolling mean of y2y pct change (5 years) for FH score


In [None]:
# make sure not to calculate null values for FH
FHFAWB_df = FHFAWB_df[FHFAWB_df['FH_value'].isnull() == False]

# calculate diffs, pct change, and rolling mean of y2y pct change (5 years) for FH score
FHFAWB_df['FH_diff'] = FHFAWB_df.FH_value.diff()
FHFAWB_df['FH_pct_change'] = FHFAWB_df.FH_value.pct_change()
FHFAWB_df = FHFAWB_df.sort_values(by=['country', 'year'])
FHFAWB_df['FH_pct_rolling5'] = FHFAWB_df['FH_value'].rolling(5).mean()

# calculate covariance for FH pct change
FHFAWB_df.constant_pct_chg.cov(FHFAWB_df.FH_diff)

%store FHFAWB_df

FHFAWB_df.head()

# apply fips codes

In [None]:
wb_codes = wbdata.api.get_country(country_id=None)

In [None]:
fh_melted
%store -r wb_codes_df
%store -r world_1994
%store -r wb_data_df
%store -r world_wbdata_2016
%store -r fips_df

In [None]:
fips_df.head(1)

In [None]:
fh_melted.head(1)

In [None]:
df_fa.head(1)

In [None]:
wb_data_df.head(1)

In [None]:
world_wbdata_2016.head(1)

In [None]:
world_1994.head(1)

In [52]:
def merge_dfs(dfl,dfr, i_on):
    merged_df = dfl.merge(dfr, how='left', on=i_on)
    print('DF shape: ', merged_df.shape)
    return merged_df

In [51]:
def pre_merge_dfs(df1_lst,df2_lst):
    print('length list L: ', len(df1_lst))
    print('length list R: ', len(df2_lst))
    print('length if merged: ', len(set(df2_lst).intersection(df1_lst)))
    not_in = [x for x in df2_lst if x not in df1_lst]
    return not_in

# add fips data to wb so that there are all fips countries on wb db

In [None]:
wb_w_wbcodes_df.head(1)

In [None]:
fips_df.rename(columns={'ISO3166_C':'wb_code'}, inplace=True)

In [None]:
wb_w_wbcodes_df = merge_dfs(wb_data_df, wb_codes_df, 'country')

In [None]:
wb_fipscoded = merge_dfs(wb_w_wbcodes_df, fips_df, 'wb_code')

In [None]:
del wb_fipscoded['country_y']
del wb_fipscoded['STANAG']

In [None]:
wb_fipscoded.head(1)

In [None]:
wb_fipscoded.rename(columns={'country_x':'country'}, inplace=True)

# fh to wb

In [None]:
'''
pre_merge_dfs(wb_fipscoded.country.unique(), fh_melted.country.unique())

length list L:  273
length list R:  205
length if merged:  171
'''

In [None]:
chg_FH_WB = dict([('Bahamas','Bahamas, The'),
 ('Bosnia-Herzegovina','Bosnia and Herzegovina'),
 ('Brunei','Brunei Darussalam'),
 ('Cape Verde','Cabo Verde'),
 ('Congo (Brazzaville)', 'Congo, Rep.'),
 ('Congo (Kinshasa)', 'Congo, Dem. Rep.'),
 ('Egypt','Egypt, Arab Rep.'),
 ('Gambia','Gambia, The'),
 ('Iran', 'Iran, Islamic Rep.'),
 ('Kyrgyzstan','Kyrgyz Republic'),
 ('Laos','Lao PDR'),
 ('Macedonia','Macedonia, FYR'),
 ('Micronesia','Micronesia, Fed. Sts.'),
 ('North Korea','Korea, Dem. People���s Rep.'),
 ('Russia','Russian Federation'),
 ('Saint Kitts and Nevis','St. Kitts and Nevis'),
 ('Saint Lucia','St. Lucia'),
 ('Saint Vincent and Grenadines','St. Vincent and the Grenadines'),
 ('Slovakia','Slovak Republic'),
 ('South Korea','Korea, Rep.'),
 ('Syria', 'Syrian Arab Republic'),
 ('Taiwan','Taiwan, China'),
 ('Venezuela', 'Venezuela, RB'),
 ('Yemen','Yemen, Rep.')])

In [None]:
fh_melted.shape

In [None]:
# pre_merge_dfs(fh_melted.country.unique(), wb_fipscoded.country.unique())

In [None]:
# change non-fit country names here
fh_melted.country = pd.Series([chg_FH_WB.get(e, e) for e in list(fh_melted.country)])

In [None]:
# l merge wb into fh
fh_w_wb_df = merge_dfs(fh_melted, wb_w_fips_df,  ['country', 'year'])

In [None]:
fh_w_all_wb_df.head()

In [None]:
fh_to_add_to_wb = ['Czechoslovakia',
 'Germany, E. ',
 'Germany, W. ',
 'USSR',
 'Vietnam, N.',
 'Vietnam, S.',
 'Yemen, N.',
 'Yemen, S.',
 'Yugoslavia',
 'Yugoslavia (Serbia and Montenegro)']

In [None]:
 %store fh_w_wb_df

In [None]:
pickle.dump(fh_w_wb_df, open( "{}fh_w_wb_df.pickle".format(path), "wb" ))
# pickle.load( open( "{}fh_w_wb_df.pickle".format(path), "rb" ) )

# FHWB to FA

In [49]:
df_fa.numeric_year = df_fa.numeric_year.astype(int)

In [59]:
df_fa.rename(columns={'country_code':'ISO3166_C', 'numeric_year':'year'}, inplace=True)

In [61]:
fhfawb = fh_w_wb_df.merge( df_fa, how='left', on=['ISO3166_C', 'year'])

In [64]:
%store fhfawb

Stored 'fhfawb' (DataFrame)


In [62]:
pickle.dump(fhfawb, open( "{}fhfawb.pickle".format(path), "wb" ))