In [1]:
import pandas as pd
import numpy as np
import copy
import itertools as itr
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 150

In [2]:
'''FA COLUMN HEADINGS
Index(['DateYear', 'country_id', 'country_code', 'country_name', 'region_id',
       'region_name', 'income_group_id', 'income_group_name',
       'income_group_acronym', 'implementing_agency_id',
       'implementing_agency_acronym', 'implementing_agency_name',
       'implementing_subagency_id', 'subagency_acronym', 'subagency_name',
       'channel_category_id', 'channel_category_name',
       'channel_subcategory_id', 'channel_subcategory_name', 'channel_id',
       'channel_name', 'dac_category_id', 'dac_category_name',
       'dac_sector_code', 'dac_sector_name', 'dac_purpose_code',
       'dac_purpose_name', 'funding_account_id', 'funding_account_name',
       'funding_agency_id', 'funding_agency_name', 'funding_agency_acronym',
       'assistance_category_id', 'assistance_category_name',
       'aid_type_group_id', 'aid_type_group_name', 'activity_id',
       'activity_name', 'activity_project_number', 'activity_start_date',
       'activity_end_date', 'transaction_type_id', 'transaction_type_name',
       'fiscal_year', 'current_amount', 'constant_amount', 'USG_sector_id',
       'USG_sector_name', 'framework', 'submission_id', 'numeric_year']
'''

'''Free (1.0 to 2.5), Partly Free (3.0 to 5.0), or Not Free (5.5 to 7.0)
https://freedomhouse.org/report/methodology-freedom-world-2017
'''

'Free (1.0 to 2.5), Partly Free (3.0 to 5.0), or Not Free (5.5 to 7.0)\nhttps://freedomhouse.org/report/methodology-freedom-world-2017\n'

In [3]:
# read in foreign aid (fa) data and copy to data frame
fa_data_in = pd.read_csv('us_foreign_aid_complete (1).csv', low_memory=False)

In [4]:
df_fa = copy.copy(fa_data_in.reset_index())

In [5]:
# read in freedom house (fh) data and copy to data frame
fh_data_in = pd.read_csv('FH_Country_and_Territory_Ratings_and_Statuses_1972-2016.csv')

In [6]:
fh_df = copy.copy(fh_data_in)

### Freedom House dataframe preparation

In [7]:
# prepare for melt fh df so that every country has for each year its cl, pr, status
fh_df = fh_df.T.reset_index(drop=True)
fh_df.columns = fh_df.loc[0]
fh_df = fh_df.drop(0)
fh_df['Year(s) Under Review'] = fh_df['Year(s) Under Review'].ffill()

In [8]:
# corrects headers for FH dataset and prepares entries for merge with fa data
fh_c_list = list(fh_df.columns)
fh_c_list = [i.replace('&', 'and') for i in fh_c_list[2:]]
fh_c_list = pd.Series(fh_c_list).str.replace('Yugoslavia (Serbia and Montenegro)', 'Serbia and Montenegro')
fh_c_list = pd.Series(fh_c_list).str.replace(', The', '')
fh_c_list = list(fh_c_list)
fh_c_list.insert(0, 'FH_Score')
fh_c_list.insert(0, 'Year')
fh_df.columns = fh_c_list

In [9]:
# correcting year to single year note:a hack here where a single regex could work
fh_df['new'], fh_df['newer'] = fh_df['Year'].str.split('-').str
fh_df['new'] = fh_df['new'].str.replace(r'\D', '')
fh_df['Year'] = fh_df['new']
del fh_df['new']
del fh_df['newer']
# correct db for overlapping year
fh_df = fh_df.append(fh_df[27:30], ignore_index=True)

In [10]:
# create a new entry to get 1982 (because 1981 and 1982 were done in same year)
new_Y82 = fh_df.Year[132:]
fh_df.Year[132:] = list(map(lambda x: int(x)+1, new_Y82))

In [11]:
# bump each year up by one from '82 to '88
new_80s = fh_df.Year[30:51]
fh_df.Year[30:51] = list(map(lambda x: int(x)+1, new_80s))

In [12]:
fh_df.Year = fh_df.Year.astype(int)

In [13]:
fh_df = fh_df.sort_values(by='Year').reset_index(drop=True)

In [14]:
# switches columns FH score with Year
fh_melt = fh_df.set_index('FH_Score').reset_index()
# melts df on fh score and year
fh_melted = pd.melt(fh_melt, id_vars=['FH_Score', 'Year'], value_vars=list(fh_melt)[2:])
fh_melted.rename(columns={'variable':'country_name', 'value':'FH_value'}, inplace=True)
# prep for selection
fh_melted.FH_Score = fh_melted['FH_Score'].str.strip()

In [15]:
# selection here example CL
fh_CL = fh_melted.loc[fh_melted['FH_Score'] == 'CL'].sort_values(by=['Year', 'country_name'])
fh_CL = fh_CL.replace('-', fh_CL.replace(['-'], [np.nan]))

In [16]:
# caught one odd value
# fh_CL.iloc[162]['FH_value'] = '3'

In [17]:
# one value kept parenthesis - deleting parans
# fh_df = fh_df.set_value(162, 'FH_value', '3')

In [18]:
fh_CL = fh_CL.set_value(21871, 'FH_value', '3')

In [19]:
# prep FH dtypes for merge with FA
fh_CL.Year = fh_CL.Year.astype(int)
fh_CL.FH_value = fh_CL.FH_value.astype(float)

In [20]:
fh_CL_df = pd.DataFrame(fh_CL.groupby(['country_name', 'Year'])['FH_value'].mean())
fh_CL_df.reset_index(inplace=True)
fh_CL_df.rename(columns={'country_name':'country','Year':'year' }, inplace=True)

In [21]:
%store fh_CL_df

Stored 'fh_CL_df' (DataFrame)


### Foreign Aid dataframe preparation

In [22]:
# prepare FA df for merge with FH on countries
d = dict([('Cabo Verde', 'Cape Verde'),('China (P.R.C.)', 'China'),('China, Republic of (Taiwan)', 'Taiwan'),('Czechia', 'Czech Republic'),
('Germany (former East)', 'Germany, E.'),('Korea, Democratic Republic', 'North Korea'),('Korea Republic', 'South Korea'),
('Vietnam (former South)', 'Vietnam, S.'),('Yemen (former P.D.R.)', 'Yemen, S.')])
df_fa.country_name = pd.Series([d.get(e, e) for e in list(df_fa.country_name)])
df_fa.country_name = df_fa.country_name.str.replace(r'\(.*', '')
# prepare FA df for merge with FH on year
df_fa['index'] = df_fa.numeric_year.astype(int)
df_fa.rename(columns={'index': 'year','country_name': 'country' }, inplace=True) 
del df_fa['numeric_year']

In [24]:
df_fa.head(2)

Unnamed: 0,year,country_id,country_code,country,region_id,region_name,income_group_id,income_group_name,income_group_acronym,implementing_agency_id,implementing_agency_acronym,implementing_agency_name,implementing_subagency_id,subagency_acronym,subagency_name,channel_category_id,channel_category_name,channel_subcategory_id,channel_subcategory_name,channel_id,channel_name,dac_category_id,dac_category_name,dac_sector_code,dac_sector_name,dac_purpose_code,dac_purpose_name,funding_account_id,funding_account_name,funding_agency_id,funding_agency_name,funding_agency_acronym,assistance_category_id,assistance_category_name,aid_type_group_id,aid_type_group_name,activity_id,activity_name,activity_project_number,activity_start_date,activity_end_date,transaction_type_id,transaction_type_name,fiscal_year,current_amount,constant_amount,USG_sector_id,USG_sector_name,framework,submission_id
0,2011,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,Department of Defense,999,,not applicable,1.0,Government,1.0,Government - United States,1000003,U.S. Government - Department of Defense,3,Governance,152,"Conflict, Peace, and Security",15210,Security system management and reform,21x2091,"Department of the Army, Afghanistan Security F...",8,Department of the Army,ARMY,2,Military,3,Project-Type,25078,Afghanistan Security Force Fund,,,,2,Obligations,2011,9941000000,10731991839,3,Stabilization Operations and Security Sector R...,,28
1,2012,4,AFG,Afghanistan,4,South and Central Asia,1.0,Low Income Country,LIC,7,DOD,Department of Defense,999,,not applicable,1.0,Government,1.0,Government - United States,1000003,U.S. Government - Department of Defense,3,Governance,152,"Conflict, Peace, and Security",15210,Security system management and reform,21x2091,"Department of the Army, Afghanistan Security F...",8,Department of the Army,ARMY,2,Military,3,Project-Type,25078,Afghanistan Security Force Fund,,,,2,Obligations,2012,9243000000,9799467226,3,Stabilization Operations and Security Sector R...,,28


In [25]:
# select feature (here its amounts), reduce(agg years), select/add FH score
df_fa_prep = df_fa.groupby(['country', 'year','country_code', 'country_id'])['current_amount', 'constant_amount'].mean()
df_fa_prep['current_diff'] = df_fa_prep.current_amount.diff()
df_fa_prep['constant_diff'] = df_fa_prep.constant_amount.diff()
df_fa_prep['current_pct_chg'] = df_fa_prep.current_amount.pct_change()
df_fa_prep['constant_pct_chg'] = df_fa_prep.constant_amount.pct_change()
df_fa_prep.reset_index(inplace=True)
df_fa_prep.rename(columns={'country_name':'country'}, inplace=True)
%store df_fa_prep

Stored 'df_fa_prep' (DataFrame)


In [27]:
df_fa_prep.head(2)

Unnamed: 0,country,year,country_code,country_id,current_amount,constant_amount,current_diff,constant_diff,current_pct_chg,constant_pct_chg
0,Afghanistan,1950,AFG,4,1000.0,8201.0,,,,
1,Afghanistan,1951,AFG,4,100000.0,778151.0,99000.0,769950.0,99.0,93.884892


In [None]:
# df_fa_prep = df_fa.groupby(['country_name', 'Year'])['current_amount', 'constant_amount'].mean()

### Merges DataFrames: Freedom House, Foreign Aid, World Bank

In [28]:
%store -r fh_CL_df
%store -r df_fa_prep
%store -r wb_data_df

In [29]:
# merge FH and FA here
mgd_df = pd.merge(fh_CL_df, df_fa_prep, how='left', on=['country', 'year'])

In [30]:
# filling nan country codes
mgd_df.country_code = mgd_df.country_code.bfill()

In [31]:
# merge WB and FHFA here
FHFAWB_df = pd.merge(mgd_df, wb_data_df, how='left', on=['country', 'year'])

In [32]:
%store FHFAWB_df

Stored 'FHFAWB_df' (DataFrame)


In [34]:
FHFAWB_df.head(2)

Unnamed: 0,country,year,FH_value,country_code,country_id,current_amount,constant_amount,current_diff,constant_diff,current_pct_chg,constant_pct_chg,date,gdppc,gini,nat,pop,voice
0,Afghanistan,1972,5.0,AFG,4.0,5134143.0,23238530.0,2363143.0,10102290.0,0.852812,0.76904,1972-01-01,,,0.397964,11721940.0,
1,Afghanistan,1973,6.0,AFG,4.0,6468000.0,28053250.0,1333857.0,4814714.0,0.259801,0.207187,1973-01-01,,,0.768116,12027822.0,


### Calcs
- calculate diffs, pct change, and rolling mean of y2y pct change (5 years) for FH score


In [35]:
# make sure not to calculate null values for FH
FHFAWB_df = FHFAWB_df[FHFAWB_df['FH_value'].isnull() == False]

# calculate diffs, pct change, and rolling mean of y2y pct change (5 years) for FH score
FHFAWB_df['FH_diff'] = FHFAWB_df.FH_value.diff()
FHFAWB_df['FH_pct_change'] = FHFAWB_df.FH_value.pct_change()
FHFAWB_df = FHFAWB_df.sort_values(by=['country', 'year'])
FHFAWB_df['FH_pct_rolling5'] = FHFAWB_df['FH_value'].rolling(5).mean()

# calculate covariance for FH pct change
FHFAWB_df.constant_pct_chg.cov(FHFAWB_df.FH_diff)

%store FHFAWB_df

FHFAWB_df.head()

Stored 'FHFAWB_df' (DataFrame)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,country,year,FH_value,country_code,country_id,current_amount,constant_amount,current_diff,constant_diff,current_pct_chg,constant_pct_chg,date,gdppc,gini,nat,pop,voice,FH_diff,FH_pct_change,FH_pct_rolling5
0,Afghanistan,1972,5.0,AFG,4.0,5134143.0,23238530.0,2363143.0,10102290.0,0.852812,0.76904,1972-01-01,,,0.397964,11721940.0,,,,
1,Afghanistan,1973,6.0,AFG,4.0,6468000.0,28053250.0,1333857.0,4814714.0,0.259801,0.207187,1973-01-01,,,0.768116,12027822.0,,1.0,0.2,
2,Afghanistan,1974,6.0,AFG,4.0,2848400.0,11538980.0,-3619600.0,-16514270.0,-0.559617,-0.588676,1974-01-01,,,0.910773,12321541.0,,0.0,0.0,
3,Afghanistan,1975,6.0,AFG,4.0,4298200.0,15778530.0,1449800.0,4239546.0,0.508988,0.367411,1975-01-01,,,1.328856,12590286.0,,0.0,0.0,
4,Afghanistan,1976,6.0,AFG,4.0,1495556.0,5084007.0,-2802644.0,-10694520.0,-0.652051,-0.677789,1976-01-01,,,1.32918,12840299.0,,0.0,0.0,5.8
