# EDA and Cleaning William and Mary Datasets

In [382]:
# Importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [383]:
# Load the Chinese official finance workbook (.xlsm) into a DataFrame
raw_finance_path = './aid_data/aid_data_wm/chinese_official_finance.xlsm'
chinese_official_finance = pd.read_excel(raw_finance_path)

From the ReadMe: "The dataset captures the known universe of officially-financed Chinese projects in 5 regions of the world from 2000-2014 (including Africa, the Middle East, Asia and the Pacific, Latin America and the Caribbean, and Central and Eastern Europe). Chinese Official Finance refers to concessional and non-concessional sources of funding from Chinese government institutions (including central, state or local government institutions) with development, commercial, or representational intent. More specifically, it captures (a) highly concessional, Chinese development projects that meet the OECD’s criteria for ODA; and (b) officially-financed Chinese projects that lack development intent or are provided with higher interest rates and lower grant elements (i.e. projects that fall within the OECD’s criteria for “Other Official Flows”, or OOF.) Chinese ODA represents “Chinese aid” in the strictest sense of the term, but Chinese official finance (ODA and Other Official Flows) is sometimes used as a broader definition of aid. AidData’s dataset allows users to disaggregate Chinese official finance into its constituent parts and determine if they wish to use a narrow or broad definition of aid. This dataset builds off of previous work to track Chinese Official Finance in Africa (versions 1.0, 1.1, and 1.2)."

In [384]:
# Previewing the first five rows of the raw frame to see what each column holds
chinese_official_finance.head()

Unnamed: 0,project_id,project_location_id,precision_code,geoname_id,place_name,latitude,longitude,location_type_code,location_type_name,gazetteer_adm_code,...,loan_type,interest_rate,maturity,grace_period,grant_element,location_details,contacts,source_triangulation,field_completeness,round_coded
0,1,1_2377450,1,2377450,Nouakchott,18.08581,-15.9785,PPLC,capital of a political entity,6295630|6255146|MR|NKC|2377450,...,,,,,,,"Zhang Xun, ; Cheikh Ould Horma,",3,6,ChinatoAfrica
1,1,1_2376719,1,2376719,S√©libaby,15.15846,-12.1843,PPLA,seat of a first-order administrative division,6295630|6255146|MR|10|2376719,...,,,,,,,"Zhang Xun, ; Cheikh Ould Horma,",3,6,ChinatoAfrica
2,1,1_2378538,1,2378538,Kiffa,16.61659,-11.40453,PPLA,seat of a first-order administrative division,6295630|6255146|MR|03|2378538,...,,,,,,,"Zhang Xun, ; Cheikh Ould Horma,",3,6,ChinatoAfrica
3,1,1_2378080,6,2378080,Mauritania,,,PCLI,independent political entity,6295630|6255146|MR,...,,,,,,,"Zhang Xun, ; Cheikh Ould Horma,",3,6,ChinatoAfrica
4,3,3_2377450,1,2377450,Nouakchott,18.08581,-15.9785,PPLC,capital of a political entity,6295630|6255146|MR|NKC|2377450,...,Concessional,2.0,20.0,5.0,54.08,Nouakchott,"Zhai Jun, ; Zhang Xun, ; Mohamed Ould Abdel Az...",8,8,ChinatoAfrica


In [385]:
# This data has a lot of columns that are not useful for my analysis.
# .info() lists every column with its non-null count and dtype; I used it
# together with the data dictionary to decide what to keep and what to discard.
chinese_official_finance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6190 entries, 0 to 6189
Data columns (total 78 columns):
project_id                   6190 non-null int64
project_location_id          6190 non-null object
precision_code               6190 non-null int64
geoname_id                   6190 non-null int64
place_name                   6190 non-null object
latitude                     4564 non-null float64
longitude                    4564 non-null float64
location_type_code           6190 non-null object
location_type_name           6190 non-null object
gazetteer_adm_code           6190 non-null object
gazetteer_adm_name           6190 non-null object
location_class               6190 non-null int64
geographic_exactness         6190 non-null int64
project_title                6190 non-null object
start_actual_isodate         1447 non-null datetime64[ns]
end_actual_isodate           1573 non-null datetime64[ns]
donors_iso3                  6190 non-null object
recipients                   6

In [386]:
# Keep only the columns used in the analysis: recipient, sector, flow type,
# funder, intent, location fields, deflated USD amount, year and title.
analysis_columns = [
    'all_recipients',
    'crs_sector_name',
    'flow',
    'flow_class',
    'funding_agency',
    'intent',
    'location_details',
    'latitude',
    'longitude',
    'place_name',
    'usd_defl_2014',
    'year',
    'project_title',
    'recipient_condensed',
    'round_coded',
]
chinese_official_finance = chinese_official_finance[analysis_columns]

In [387]:
# Looking at my new dataframe with only the relevant columns
chinese_official_finance.head()

Unnamed: 0,all_recipients,crs_sector_name,flow,flow_class,funding_agency,intent,location_details,latitude,longitude,place_name,usd_defl_2014,year,project_title,recipient_condensed,round_coded
0,Mauritania,Health,Free-standing technical assistance,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,,18.08581,-15.9785,Nouakchott,,2010,29th medical team to Mauritania to assist loca...,Mauritania,ChinatoAfrica
1,Mauritania,Health,Free-standing technical assistance,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,,15.15846,-12.1843,S√©libaby,,2010,29th medical team to Mauritania to assist loca...,Mauritania,ChinatoAfrica
2,Mauritania,Health,Free-standing technical assistance,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,,16.61659,-11.40453,Kiffa,,2010,29th medical team to Mauritania to assist loca...,Mauritania,ChinatoAfrica
3,Mauritania,Health,Free-standing technical assistance,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,,,,Mauritania,,2010,29th medical team to Mauritania to assist loca...,Mauritania,ChinatoAfrica
4,Mauritania,Transport and Storage,Loan (excluding debt rescheduling),ODA-like,"Export-Import Bank of China, Government Agency",Development,Nouakchott,18.08581,-15.9785,Nouakchott,396886331.0,2008,China issues 2 billion yuan loan to fund Port ...,Mauritania,ChinatoAfrica


In [388]:
# Checking non-null counts and data types of the reduced frame
# (in the recorded output only location_details, latitude, longitude and
# usd_defl_2014 contain nulls)
chinese_official_finance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6190 entries, 0 to 6189
Data columns (total 15 columns):
all_recipients         6190 non-null object
crs_sector_name        6190 non-null object
flow                   6190 non-null object
flow_class             6190 non-null object
funding_agency         6190 non-null object
intent                 6190 non-null object
location_details       3103 non-null object
latitude               4564 non-null float64
longitude              4564 non-null float64
place_name             6190 non-null object
usd_defl_2014          3970 non-null float64
year                   6190 non-null int64
project_title          6190 non-null object
recipient_condensed    6190 non-null object
round_coded            6190 non-null object
dtypes: float64(3), int64(1), object(11)
memory usage: 725.5+ KB


In [389]:
# Listing the distinct recipient names found in recipient_condensed
chinese_official_finance['recipient_condensed'].unique()

array(['Mauritania', 'Angola', 'Algeria', 'Botswana', 'Benin',
       'Sierra Leone', 'Guinea', 'Congo, Dem. Rep.', 'Ethiopia', 'Niger',
       'Congo, Rep.', 'Gabon', 'Equatorial Guinea', 'Burundi', 'Kenya',
       'Tanzania', 'Ghana', 'Senegal', 'Mauritius',
       'Central African Rep.', "Cote D'Ivoire", 'Sudan', 'Nigeria',
       'Cameroon', 'Togo', 'Mali', 'Cape Verde', 'Rwanda', 'Zambia',
       'Comoros', 'Chad', 'Djibouti', 'Egypt', 'Eritrea',
       'Africa, regional', 'Lesotho', 'Guinea-Bissau', 'Mozambique',
       'Madagascar', 'Malawi', 'Zimbabwe', 'Namibia', 'Seychelles',
       'Liberia', 'Morocco', 'Somalia', 'South Sudan', 'Tunisia',
       'Uganda', 'South Africa', 'Libya', 'Cambodia',
       'Southeast Asia, regional', 'Sri Lanka', 'Laos', 'Timor-Leste',
       'Myanmar', 'Bangladesh', 'Viet Nam', 'Nepal', 'Asia, regional',
       'Afghanistan', 'Pakistan', 'Iran', 'Philippines', 'Singapore',
       'Thailand', 'Brunei', 'Indonesia', 'Malaysia', 'Maldives',
       'B

In [390]:
# Narrowing the frame to recipients in Africa and the Middle East.
# The names must match the spellings used in recipient_condensed exactly;
# the regional aggregate 'Africa, regional' is kept alongside the countries.
# NOTE(review): 'Iran' appears in the raw recipient list but is not included
# here, while the World Bank filter later in this notebook keeps
# 'Iran, Islamic Republic of' -- confirm the exclusion is intentional.
africa_middle_east_recipients = [
    'Mauritania', 'Angola', 'Algeria', 'Botswana', 'Benin',
    'Sierra Leone', 'Guinea', 'Congo, Dem. Rep.', 'Ethiopia', 'Niger',
    'Congo, Rep.', 'Gabon', 'Equatorial Guinea', 'Burundi', 'Kenya',
    'Tanzania', 'Ghana', 'Senegal', 'Mauritius',
    'Central African Rep.', "Cote D'Ivoire", 'Sudan', 'Nigeria',
    'Cameroon', 'Togo', 'Mali', 'Cape Verde', 'Rwanda', 'Zambia',
    'Comoros', 'Chad', 'Djibouti', 'Egypt', 'Eritrea',
    'Africa, regional', 'Lesotho', 'Guinea-Bissau', 'Mozambique',
    'Madagascar', 'Malawi', 'Zimbabwe', 'Namibia', 'Seychelles',
    'Liberia', 'Morocco', 'Somalia', 'South Sudan', 'Tunisia',
    'Uganda', 'South Africa', 'Libya', 'Lebanon', 'Syria', 'Jordan',
    'Turkey', 'Yemen', 'Palestinian Adm. Areas', 'Iraq', 'Bahrain',
    'United Arab Emirates', 'Israel',
]
in_region = chinese_official_finance['recipient_condensed'].isin(africa_middle_east_recipients)
# Take an explicit copy: the cleaning cells that follow assign to columns of
# this frame, and an independent copy makes those writes unambiguous no matter
# whether the unfiltered parent frame is still referenced somewhere.
chinese_official_finance = chinese_official_finance[in_region].copy()

In [391]:
# Re-checking the row count, non-null counts and data types after the regional filter
chinese_official_finance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3644 entries, 0 to 6185
Data columns (total 15 columns):
all_recipients         3644 non-null object
crs_sector_name        3644 non-null object
flow                   3644 non-null object
flow_class             3644 non-null object
funding_agency         3644 non-null object
intent                 3644 non-null object
location_details       1740 non-null object
latitude               2659 non-null float64
longitude              2659 non-null float64
place_name             3644 non-null object
usd_defl_2014          2208 non-null float64
year                   3644 non-null int64
project_title          3644 non-null object
recipient_condensed    3644 non-null object
round_coded            3644 non-null object
dtypes: float64(3), int64(1), object(11)
memory usage: 455.5+ KB


In [392]:
# Replace missing free-text location details with the placeholder 'unknown'
location_details = chinese_official_finance['location_details']
chinese_official_finance['location_details'] = location_details.where(location_details.notna(), 'unknown')

In [393]:
# Counting rows per funding agency, most frequent first
chinese_official_finance['funding_agency'].value_counts()

Unspecified Chinese Government Institution, Government Agency                                                                                                           2499
Export-Import Bank of China, Government Agency                                                                                                                           928
China Development Bank (CDB), Government Agency                                                                                                                           40
Chinese Embassy/Consulate, Government Agency                                                                                                                              38
China-Africa Development Fund, Government Agency                                                                                                                          18
China Ministry of Commerce, Government Agency                                                                                          

In [394]:
# Counting rows per recipient, most frequent first
chinese_official_finance['recipient_condensed'].value_counts()

Tanzania                  313
Angola                    202
Kenya                     195
Africa, regional          193
Zambia                    151
                         ... 
Palestinian Adm. Areas      5
Iraq                        4
Bahrain                     2
Libya                       2
United Arab Emirates        1
Name: recipient_condensed, Length: 61, dtype: int64

In [395]:
# Counting rows per flow class (ODA-like, OOF-like, or vague official finance)
chinese_official_finance['flow_class'].value_counts()

ODA-like                    2774
OOF-like                     448
Vague (Official Finance)     422
Name: flow_class, dtype: int64

In [396]:
# Counting rows per project intent
chinese_official_finance['intent'].value_counts()

Development         3345
Mixed                158
Representational     112
Commercial            29
Name: intent, dtype: int64

In [397]:
# Looking at sector name value counts
chinese_official_finance['crs_sector_name'].value_counts()

Health                                                                           727
Transport and Storage                                                            611
Education                                                                        407
Communications                                                                   330
Government and Civil Society                                                     303
Energy Generation and Supply                                                     245
Agriculture, Forestry and Fishing                                                231
Other Social infrastructure and services                                         170
Water Supply and Sanitation                                                      147
Emergency Response                                                               141
Other Multisector                                                                 84
Action Relating to Debt                                          

In [398]:
# Replace missing latitudes with 0.
# NOTE(review): 0 is also a valid latitude, so filled rows cannot be told apart
# from real coordinates afterwards -- confirm downstream mapping accounts for this.
missing_latitude = chinese_official_finance['latitude'].isna()
chinese_official_finance['latitude'] = chinese_official_finance['latitude'].mask(missing_latitude, 0)

In [399]:
# Replace missing longitudes with 0.
# NOTE(review): 0 is also a valid longitude, so filled rows cannot be told apart
# from real coordinates afterwards -- confirm downstream mapping accounts for this.
longitude = chinese_official_finance['longitude']
chinese_official_finance['longitude'] = longitude.where(longitude.notna(), 0)

In [400]:
# Previewing the frame after the fills: missing location_details now read
# 'unknown' and missing coordinates read 0.0
chinese_official_finance.head()

Unnamed: 0,all_recipients,crs_sector_name,flow,flow_class,funding_agency,intent,location_details,latitude,longitude,place_name,usd_defl_2014,year,project_title,recipient_condensed,round_coded
0,Mauritania,Health,Free-standing technical assistance,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,unknown,18.08581,-15.9785,Nouakchott,,2010,29th medical team to Mauritania to assist loca...,Mauritania,ChinatoAfrica
1,Mauritania,Health,Free-standing technical assistance,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,unknown,15.15846,-12.1843,S√©libaby,,2010,29th medical team to Mauritania to assist loca...,Mauritania,ChinatoAfrica
2,Mauritania,Health,Free-standing technical assistance,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,unknown,16.61659,-11.40453,Kiffa,,2010,29th medical team to Mauritania to assist loca...,Mauritania,ChinatoAfrica
3,Mauritania,Health,Free-standing technical assistance,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,unknown,0.0,0.0,Mauritania,,2010,29th medical team to Mauritania to assist loca...,Mauritania,ChinatoAfrica
4,Mauritania,Transport and Storage,Loan (excluding debt rescheduling),ODA-like,"Export-Import Bank of China, Government Agency",Development,Nouakchott,18.08581,-15.9785,Nouakchott,396886331.0,2008,China issues 2 billion yuan loan to fund Port ...,Mauritania,ChinatoAfrica


In [401]:
# Coerce the deflated USD amounts to a numeric dtype; anything unparseable
# becomes NaN. (The recorded .info() output already reports float64 for this
# column, so this acts as a safeguard.)
deflated_amounts = chinese_official_finance['usd_defl_2014']
chinese_official_finance['usd_defl_2014'] = pd.to_numeric(deflated_amounts, errors='coerce')

In [402]:
# Checking the data types and which columns still contain nulls
chinese_official_finance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3644 entries, 0 to 6185
Data columns (total 15 columns):
all_recipients         3644 non-null object
crs_sector_name        3644 non-null object
flow                   3644 non-null object
flow_class             3644 non-null object
funding_agency         3644 non-null object
intent                 3644 non-null object
location_details       3644 non-null object
latitude               3644 non-null float64
longitude              3644 non-null float64
place_name             3644 non-null object
usd_defl_2014          2208 non-null float64
year                   3644 non-null int64
project_title          3644 non-null object
recipient_condensed    3644 non-null object
round_coded            3644 non-null object
dtypes: float64(3), int64(1), object(11)
memory usage: 455.5+ KB


In [403]:
# Add 'null_amounts_as_zero': a copy of usd_defl_2014 with missing amounts set to 0.
# The original usd_defl_2014 column is left untouched; I use the new column to
# split the data into two dataframes, one for known and one for unknown amounts.
deflated_usd = chinese_official_finance['usd_defl_2014']
chinese_official_finance['null_amounts_as_zero'] = deflated_usd.where(deflated_usd.notna(), 0)

In [404]:
# Write the cleaned frame to CSV (row index omitted)
clean_csv_path = './aid_data/aid_data_wm/chinese_official_finance_clean.csv'
chinese_official_finance.to_csv(clean_csv_path, index=False)

### At this point in the process, I decided to try a novel way to deal with my missing data. There is a lot of missing data due to the lack of transparency in Chinese government aid. Since this dataset had a lot of descriptive text columns, I decided to see if I could use NLP to predict the amount pledged for each project. This modeling is in `2_NLP_modeling.ipynb` in this repository.

In [405]:
# Load the modeled aid data, replacing the cleaned frame held under this name
modeled_csv_path = './aid_data/aid_data_wm/chinese_aid_modeled.csv'
chinese_official_finance = pd.read_csv(modeled_csv_path)

In [406]:
# Looking at the data types and null values of the modeled data
chinese_official_finance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3644 entries, 0 to 3643
Data columns (total 17 columns):
all_recipients           3644 non-null object
crs_sector_name          3644 non-null object
flow                     3644 non-null object
flow_class               3644 non-null object
funding_agency           3644 non-null object
intent                   3644 non-null object
location_details         3644 non-null object
latitude                 3644 non-null float64
longitude                3644 non-null float64
place_name               3644 non-null object
year                     3644 non-null int64
project_title            3644 non-null object
recipient_condensed      3644 non-null object
round_coded              3644 non-null object
aid_amount               3644 non-null float64
predicted_by_modeling    3644 non-null bool
aid_in_millions          3644 non-null float64
dtypes: bool(1), float64(4), int64(1), object(11)
memory usage: 459.2+ KB


In [407]:
# Looking at the distribution of aid amounts across 8 equal-width bins,
# listed from the most populated bin to the least
chinese_official_finance["aid_amount"].value_counts(bins=8)

(-2846733.244, 355863831.375]      3327
(355863831.375, 711727486.75]       142
(711727486.75, 1067591142.125]      107
(1067591142.125, 1423454797.5]       28
(2491045763.625, 2846909419.0]       22
(1423454797.5, 1779318452.875]       16
(2135182108.25, 2491045763.625]       2
(1779318452.875, 2135182108.25]       0
Name: aid_amount, dtype: int64

In [408]:
# Looking at summary statistics of the aid amounts.
# NOTE(review): rows look like project-location records (the preview of this
# frame repeats one project's amount for each of its places), so these
# statistics weight multi-location projects more heavily -- confirm if per-project
# figures are needed.
chinese_official_finance["aid_amount"].describe()

count    3.644000e+03
mean     1.169104e+08
std      3.107050e+08
min      1.760000e+02
25%      3.081550e+06
50%      1.535664e+07
75%      8.900000e+07
max      2.846909e+09
Name: aid_amount, dtype: float64

In [409]:
# Looking at the 50 recipients with the most rows in the modeled data
# (this ranks by number of records, not by total aid amount)
chinese_official_finance['recipient_condensed'].value_counts().head(50)

Tanzania                313
Angola                  202
Kenya                   195
Africa, regional        193
Zambia                  151
Ghana                   144
Zimbabwe                142
Ethiopia                140
Sudan                   139
Cameroon                112
Liberia                 108
Uganda                  106
Sierra Leone             96
Congo, Rep.              83
Namibia                  74
Nigeria                  69
Mauritius                69
Rwanda                   69
Botswana                 66
Niger                    65
Togo                     65
Mozambique               58
Burundi                  58
Equatorial Guinea        55
Lesotho                  48
Senegal                  46
Benin                    46
Congo, Dem. Rep.         45
Guinea                   44
Mali                     43
Malawi                   43
Morocco                  43
Cote D'Ivoire            34
Gabon                    31
Mauritania               31
Madagascar          

In [410]:
# Previewing the first five rows of the modeled data frame
chinese_official_finance.head()

Unnamed: 0,all_recipients,crs_sector_name,flow,flow_class,funding_agency,intent,location_details,latitude,longitude,place_name,year,project_title,recipient_condensed,round_coded,aid_amount,predicted_by_modeling,aid_in_millions
0,Mauritania,Transport and Storage,Loan (excluding debt rescheduling),ODA-like,"Export-Import Bank of China, Government Agency",Development,Nouakchott,18.08581,-15.9785,Nouakchott,2008,China issues 2 billion yuan loan to fund Port ...,Mauritania,ChinatoAfrica,396886331.0,False,396.886331
1,Angola,Emergency Response,Grant,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,Bie Province,-12.34989,17.3031,Prov√≠ncia do Bi√©,2001,"China grants $600,000 USD in food aid for floo...",Angola,ChinatoAfrica,1364094.0,False,1.364094
2,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-24.60166,24.7281,Jwaneng,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False,51.378371
3,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-25.22435,25.67728,Lobatse,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False,51.378371
4,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-24.65451,25.90859,Gaborone,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False,51.378371


In [411]:
# Load the World Bank projects table into a DataFrame
world_bank_csv_path = './aid_data/aid_data_wm/world_bank.csv'
world_bank = pd.read_csv(world_bank_csv_path)

From the ReadMe: "This geocoded dataset release represents all World Bank projects in the IBRD and IDA lending lines approved from 1995-2014. Each record in the projects table is associated with at least one record in the locations table. The files project.csv, locations.csv, and transactions.csv are structured so as to be combinable with other v1.0 geocoded dataset releases. The file projects_ancillary.csv contains all fields that are found in exports from the World Bank project database (http://www.worldbank.org/projects/advancedsearch?lang=en)."

In [412]:
# Previewing the first five rows of the World Bank data
world_bank.head()

Unnamed: 0,project_id,is_geocoded,project_title,start_actual_isodate,start_actual_type,end_actual_isodate,end_actual_type,donors,donors_iso3,recipients,recipients_iso3,ad_sector_codes,ad_sector_names,status,transactions_start_year,transactions_end_year,total_commitments,total_disbursements
0,P124054,1,ML-Strengthening Reproductive Health,2011-12-20,start-actual,2017-02-28,end-actual,World Bank,DAC,Mali,MLI,120|121,"Health, general|Health",Implementation,2011,2017,30000000.0,
1,P035626,1,Pilot Community Infrastructure Works and Capac...,1998-08-24,start-actual,2004-06-30,end-actual,World Bank,DAC,Gabon,GAB,151|160|331|210|140,Water supply and sanitation|Other social infra...,Completion,1998,2004,6628359.0,6992928.0
2,P102288,1,Afghanistan: Emergency National Solidarity Pro...,2006-12-07,start-actual,2011-09-30,end-actual,World Bank,DAC,Afghanistan,AFG,311|160|210|140,Water supply and sanitation|Other social infra...,Completion,2006,2011,131772400.0,218926100.0
3,P065954,1,Health Reform Project,2003-09-11,start-actual,2007-03-31,end-actual,World Bank,DAC,Slovak Republic,SVK,120|151|121,"Health, general|Health|Government and civil so...",Completion,2003,2007,75769050.0,83774660.0
4,P118375,1,Road Upgrading and Modernization Project,2010-11-11,start-actual,2015-11-30,end-actual,World Bank,DAC,Belarus,BLR,210,Transport and storage,Implementation,2010,2015,153197600.0,20988270.0


In [413]:
# Looking at the data types and null values of the World Bank data
world_bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5881 entries, 0 to 5880
Data columns (total 18 columns):
project_id                 5881 non-null object
is_geocoded                5881 non-null int64
project_title              5881 non-null object
start_actual_isodate       5881 non-null object
start_actual_type          5881 non-null object
end_actual_isodate         5089 non-null object
end_actual_type            5881 non-null object
donors                     5881 non-null object
donors_iso3                5881 non-null object
recipients                 5881 non-null object
recipients_iso3            5881 non-null object
ad_sector_codes            5881 non-null object
ad_sector_names            5881 non-null object
status                     5881 non-null object
transactions_start_year    5881 non-null int64
transactions_end_year      5881 non-null int64
total_commitments          5880 non-null float64
total_disbursements        4182 non-null float64
dtypes: float64(2), int64(3), 

In [414]:
# Listing the distinct World Bank recipient names (countries and regional groupings)
world_bank['recipients'].unique()

array(['Mali', 'Gabon', 'Afghanistan', 'Slovak Republic', 'Belarus',
       'Ukraine', 'Liberia', 'Rwanda', 'Mexico', 'Bosnia and Herzegovina',
       'Eastern Africa', 'Honduras', 'Georgia', 'Romania', 'Benin',
       'Peru', 'Brazil', 'Uganda', 'Chad', 'Ethiopia', 'Kyrgyz Republic',
       'Kazakhstan', 'Ghana', 'Western Africa', 'Malawi', 'Senegal',
       'Uruguay', 'Tunisia', 'Africa', 'Niger', 'Nicaragua', 'India',
       'Congo, Democratic Republic of', 'Russian Federation', 'Togo',
       'Guinea', 'Tanzania', "Lao People's Democratic Republic",
       'Congo, Republic of', 'Philippines', 'Jordan', 'Marshall Islands',
       'Armenia', 'Vietnam', 'Indonesia', 'Mozambique', 'Kenya',
       'South Sudan', 'OECS Countries', 'Poland', 'Morocco', 'Madagascar',
       'Belize', 'China', 'Bangladesh', 'Argentina', 'Uzbekistan',
       'Haiti', 'Panama', 'Sierra Leone', 'Eritrea', 'Lesotho',
       'Mauritania', 'Nepal', 'Croatia', 'Sri Lanka',
       'Venezuela, Republica Bolivariana 

In [415]:
# Capturing the unique recipients before filtering, for the comparison further down.
# Despite the variable name, .unique() returns a NumPy array rather than a list.
list1 = world_bank['recipients'].unique()

In [416]:
# Narrowing the World Bank data to recipients in Africa and the Middle East.
# Names must match the spellings in the 'recipients' column exactly; regional
# groupings ('Africa', 'Eastern Africa', 'Western Africa') are kept as well.
world_bank_region_recipients = [
    'Mali', 'Gabon', 'Liberia', 'Rwanda', 'Eastern Africa', 'Benin',
    'Uganda', 'Chad', 'Ethiopia', 'Ghana', 'Western Africa', 'Malawi',
    'Senegal', 'Tunisia', 'Africa', 'Niger',
    'Congo, Democratic Republic of', 'Togo', 'Guinea', 'Tanzania',
    'Congo, Republic of', 'Jordan', 'Mozambique', 'Kenya', 'Comoros',
    "Cote d'Ivoire", 'Zambia', 'Angola', 'South Sudan', 'Morocco',
    'Madagascar', 'Sierra Leone', 'Eritrea', 'Lesotho',
    'Sao Tome and Principe', 'Cabo Verde', 'Mauritania',
    'Yemen, Republic of', 'Egypt, Arab Republic of',
    'Central African Republic', 'Cameroon', 'Lebanon', 'Botswana',
    'Nigeria', 'Djibouti', 'Iraq', 'Burkina Faso', 'Guinea-Bissau',
    'Iran, Islamic Republic of', 'Algeria', 'Gambia, The', 'Mauritius',
    'Burundi', 'Turkey', 'Zimbabwe', 'Namibia', 'South Africa',
    'Seychelles', 'Swaziland',
]
keep_row = world_bank['recipients'].isin(world_bank_region_recipients)
world_bank = world_bank[keep_row]

In [417]:
# Looking at the unique recipients that remain after the regional filter
world_bank['recipients'].unique()

array(['Mali', 'Gabon', 'Liberia', 'Rwanda', 'Eastern Africa', 'Benin',
       'Uganda', 'Chad', 'Ethiopia', 'Ghana', 'Western Africa', 'Malawi',
       'Senegal', 'Tunisia', 'Africa', 'Niger',
       'Congo, Democratic Republic of', 'Togo', 'Guinea', 'Tanzania',
       'Congo, Republic of', 'Jordan', 'Mozambique', 'Kenya',
       'South Sudan', 'Morocco', 'Madagascar', 'Sierra Leone', 'Eritrea',
       'Lesotho', 'Mauritania', 'Zambia', 'Yemen, Republic of',
       'Egypt, Arab Republic of', 'Cabo Verde', "Cote d'Ivoire", 'Angola',
       'Central African Republic', 'Cameroon', 'Lebanon', 'Botswana',
       'Nigeria', 'Djibouti', 'Iraq', 'Burkina Faso', 'Guinea-Bissau',
       'Iran, Islamic Republic of', 'Algeria', 'Gambia, The', 'Mauritius',
       'Burundi', 'Turkey', 'Comoros', 'Sao Tome and Principe',
       'Zimbabwe', 'Namibia', 'South Africa', 'Seychelles', 'Swaziland'],
      dtype=object)

In [418]:
# Saving the recipients that remain after filtering (a NumPy array, despite the name)
list2 = world_bank['recipients'].unique()

In [419]:
# Finding the recipients dropped by the filter, to make sure I didn't miss any
# since the filter list was a copy/paste job
def Diff(li1, li2):
    """Return the distinct items of ``li1`` that do not occur in ``li2``.

    Parameters
    ----------
    li1, li2 : iterable of hashable items
        Any iterables (lists, NumPy arrays, ...) whose elements can be put in a set.

    Returns
    -------
    list
        The set difference ``li1 - li2`` as a sorted list, so that the output
        is the same on every run instead of following set iteration order.
        Duplicates in ``li1`` are collapsed.
    """
    only_in_first = set(li1) - set(li2)
    try:
        return sorted(only_in_first)
    except TypeError:
        # Items that cannot be compared with each other (e.g. str mixed with
        # float NaN): fall back to ordering by their repr so the call still
        # succeeds, as it did before results were sorted.
        return sorted(only_in_first, key=repr)
  
# Recipients present in list1 but absent from list2.
# NOTE(review): list1 is not defined in this section -- presumably the recipients before filtering; confirm.
print(Diff(list1, list2)) 

['Armenia', 'Paraguay', 'Barbados', 'Honduras', 'Nicaragua', 'Uzbekistan', 'Costa Rica', 'Maldives', 'Pakistan', 'Peru', 'Kazakhstan', 'Hungary', 'Cambodia', 'Tonga', 'Papua New Guinea', 'Slovak Republic', 'Vietnam', 'Kyrgyz Republic', 'Trinidad and Tobago', 'Sri Lanka', 'Ecuador', 'Antigua and Barbuda', 'Uruguay', 'Malaysia', 'India', 'Romania', 'Latvia', 'Belize', 'Korea, Republic of', 'Kiribati', 'Micronesia, Federated States of', 'South Asia', 'Georgia', 'Slovenia', 'Brazil', 'Central Asia', 'Philippines', 'Afghanistan', 'Colombia', 'Indonesia', 'Bhutan', 'Tajikistan', 'Serbia', 'Macedonia, former Yugoslav Republic of', 'Ukraine', 'St. Vincent and the Grenadines', 'Tuvalu', 'Azerbaijan', 'Caribbean', 'Jamaica', 'Turkmenistan', 'El Salvador', 'South Eastern Europe and Balkans', 'Bangladesh', 'Nepal', 'Samoa', 'OECS Countries', 'Estonia', 'Dominican Republic', 'Mongolia', 'Chile', 'Grenada', 'Bolivia', 'China', 'Belarus', 'Russian Federation', 'Panama', 'Haiti', 'Marshall Islands', '

In [420]:
# Counting the number of rows per recipient, most frequent first
world_bank['recipients'].value_counts()

Tanzania                         100
Ghana                             92
Ethiopia                          84
Yemen, Republic of                76
Mozambique                        71
Uganda                            71
Turkey                            71
Senegal                           70
Nigeria                           69
Morocco                           68
Kenya                             66
Madagascar                        63
Burkina Faso                      60
Malawi                            58
Mali                              54
Rwanda                            54
Zambia                            53
Africa                            53
Egypt, Arab Republic of           49
Benin                             47
Niger                             47
Congo, Democratic Republic of     47
Tunisia                           46
Sierra Leone                      45
Cameroon                          43
Cote d'Ivoire                     42
Mauritania                        40
B

In [421]:
# Checking column dtypes and non-null counts after the recipient filter
world_bank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2192 entries, 0 to 5880
Data columns (total 18 columns):
project_id                 2192 non-null object
is_geocoded                2192 non-null int64
project_title              2192 non-null object
start_actual_isodate       2192 non-null object
start_actual_type          2192 non-null object
end_actual_isodate         1811 non-null object
end_actual_type            2192 non-null object
donors                     2192 non-null object
donors_iso3                2192 non-null object
recipients                 2192 non-null object
recipients_iso3            2192 non-null object
ad_sector_codes            2192 non-null object
ad_sector_names            2192 non-null object
status                     2192 non-null object
transactions_start_year    2192 non-null int64
transactions_end_year      2192 non-null int64
total_commitments          2191 non-null float64
total_disbursements        1446 non-null float64
dtypes: float64(2), int64(3), 

In [422]:
# Putting the recipient of every World Bank row (duplicates included) into a list
recipients_world_bank = world_bank['recipients'].to_list()

In [423]:
# Putting the recipient of every Chinese official finance row (duplicates included) into a list
recipients_china = chinese_official_finance['recipient_condensed'].to_list()

In [424]:
# Recipient names that appear in the Chinese data but not in the World Bank data
print(Diff(recipients_china, recipients_world_bank)) 

['Egypt', 'Congo, Rep.', 'Somalia', 'Syria', 'Sudan', 'Palestinian Adm. Areas', "Cote D'Ivoire", 'Congo, Dem. Rep.', 'Africa, regional', 'Cape Verde', 'Libya', 'Central African Rep.', 'Yemen', 'United Arab Emirates', 'Israel', 'Equatorial Guinea', 'Bahrain']


In [425]:
# Recipient names that appear in the World Bank data but not in the Chinese data
print(Diff(recipients_world_bank, recipients_china)) 

['Yemen, Republic of', "Cote d'Ivoire", 'Sao Tome and Principe', 'Congo, Republic of', 'Burkina Faso', 'Western Africa', 'Iran, Islamic Republic of', 'Swaziland', 'Gambia, The', 'Congo, Democratic Republic of', 'Cabo Verde', 'Eastern Africa', 'Central African Republic', 'Egypt, Arab Republic of', 'Africa']


In [426]:
# Renaming countries for consistency with other data.
# The result is assigned back rather than using inplace=True: world_bank is a
# boolean-filtered subset of the loaded frame, and mutating such a subset in
# place may raise SettingWithCopyWarning and makes the cell harder to re-run.
world_bank = world_bank.replace({
    'recipients': {
        'Yemen, Republic of': 'Yemen',
        'Egypt, Arab Republic of': 'Egypt',
        "Cote d'Ivoire": "Cote D'Ivoire",
        'Congo, Republic of': 'Congo',
        'Congo, Democratic Republic of': 'Democratic Republic of Congo',
        'Iran, Islamic Republic of': 'Iran',
        'Gambia, The': 'Gambia',
    }
})

In [427]:
# Re-checking dtypes and non-null counts after the renaming
world_bank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2192 entries, 0 to 5880
Data columns (total 18 columns):
project_id                 2192 non-null object
is_geocoded                2192 non-null int64
project_title              2192 non-null object
start_actual_isodate       2192 non-null object
start_actual_type          2192 non-null object
end_actual_isodate         1811 non-null object
end_actual_type            2192 non-null object
donors                     2192 non-null object
donors_iso3                2192 non-null object
recipients                 2192 non-null object
recipients_iso3            2192 non-null object
ad_sector_codes            2192 non-null object
ad_sector_names            2192 non-null object
status                     2192 non-null object
transactions_start_year    2192 non-null int64
transactions_end_year      2192 non-null int64
total_commitments          2191 non-null float64
total_disbursements        1446 non-null float64
dtypes: float64(2), int64(3), 

In [428]:
# Rebuilding the World Bank recipient list (one entry per row) now that the names have been renamed
recipients_world_bank2 = world_bank['recipients'].to_list()

In [429]:
# Recipient names in the Chinese data that are still missing from the renamed World Bank data
print(Diff(recipients_china, recipients_world_bank2)) 

['Congo, Rep.', 'Somalia', 'Syria', 'Sudan', 'Palestinian Adm. Areas', 'Congo, Dem. Rep.', 'Africa, regional', 'Cape Verde', 'Libya', 'Central African Rep.', 'United Arab Emirates', 'Israel', 'Equatorial Guinea', 'Bahrain']


In [430]:
# Recipient names in the renamed World Bank data that are still missing from the Chinese data
print(Diff(recipients_world_bank2, recipients_china)) 

['Sao Tome and Principe', 'Gambia', 'Congo', 'Western Africa', 'Swaziland', 'Cabo Verde', 'Iran', 'Democratic Republic of Congo', 'Eastern Africa', 'Central African Republic', 'Burkina Faso', 'Africa']


In [431]:
# Changing the names that are different between the two.
# The result is assigned back rather than using inplace=True so the cell does
# not mutate the frame behind earlier outputs and matches the World Bank
# renaming step.
chinese_official_finance = chinese_official_finance.replace({
    'recipient_condensed': {
        'Palestinian Adm. Areas': 'Palestine',
        'Congo, Rep.': 'Congo',
        'Congo, Dem. Rep.': 'Democratic Republic of Congo',
        'Central African Rep.': 'Central African Republic',
        'Cape Verde': 'Cabo Verde',
    }
})

In [432]:
# Rebuilding the Chinese recipient list (one entry per row) with the renamed values
recipients_china2 = chinese_official_finance['recipient_condensed'].to_list()

In [433]:
# Recipient names in the renamed Chinese data that do not appear in the renamed World Bank data
print(Diff(recipients_china2, recipients_world_bank2)) 

['Somalia', 'Palestine', 'Syria', 'Sudan', 'Africa, regional', 'Libya', 'United Arab Emirates', 'Israel', 'Equatorial Guinea', 'Bahrain']


In [434]:
# Recipient names in the renamed World Bank data that do not appear in the renamed Chinese data
print(Diff(recipients_world_bank2, recipients_china2)) 

['Sao Tome and Principe', 'Gambia', 'Western Africa', 'Swaziland', 'Iran', 'Eastern Africa', 'Burkina Faso', 'Africa']


In [435]:
# Previewing the first rows of the World Bank data frame after renaming
world_bank.head()

Unnamed: 0,project_id,is_geocoded,project_title,start_actual_isodate,start_actual_type,end_actual_isodate,end_actual_type,donors,donors_iso3,recipients,recipients_iso3,ad_sector_codes,ad_sector_names,status,transactions_start_year,transactions_end_year,total_commitments,total_disbursements
0,P124054,1,ML-Strengthening Reproductive Health,2011-12-20,start-actual,2017-02-28,end-actual,World Bank,DAC,Mali,MLI,120|121,"Health, general|Health",Implementation,2011,2017,30000000.0,
1,P035626,1,Pilot Community Infrastructure Works and Capac...,1998-08-24,start-actual,2004-06-30,end-actual,World Bank,DAC,Gabon,GAB,151|160|331|210|140,Water supply and sanitation|Other social infra...,Completion,1998,2004,6628359.0,6992928.0
6,P102915,1,Re-engagement and Reform Support Program,2007-12-04,start-actual,2008-09-30,end-actual,World Bank,DAC,Liberia,LBR,151,"Government and civil society, general",Completion,2007,2008,458870300.0,432441700.0
7,P057294,1,Economic Recovery Credit Project,1999-03-30,start-actual,2001-03-31,end-actual,World Bank,DAC,Rwanda,RWA,151|311|321|160,Other social infrastructure and services|Indus...,Completion,1999,2001,97899960.0,110436100.0
10,P111556,1,AFCC2/RI-East Africa Public Health Laboratory ...,2010-05-25,start-actual,2020-03-30,end-actual,World Bank,DAC,Eastern Africa,Unspecified,120|220,Communications|Health,Implementation,2010,2020,65017080.0,10818880.0


In [436]:
# Looking at the donor value counts. The column is not among those kept in the column selection that follows.
world_bank['donors'].value_counts()

World Bank    2192
Name: donors, dtype: int64

In [437]:
# Keeping only the columns chosen from the data dictionary
world_bank = world_bank.loc[:, [
    'project_title',
    'recipients',
    'ad_sector_names',
    'transactions_start_year',
    'total_commitments',
]]

In [438]:
# Checking dtypes and non-null counts of the five remaining columns
world_bank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2192 entries, 0 to 5880
Data columns (total 5 columns):
project_title              2192 non-null object
recipients                 2192 non-null object
ad_sector_names            2192 non-null object
transactions_start_year    2192 non-null int64
total_commitments          2191 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 102.8+ KB


In [439]:
# Dropping rows that contain a null in any remaining column
world_bank = world_bank.dropna()

In [440]:
# Looking at the start years in 10 bins before restricting the window to 2000-2014
world_bank['transactions_start_year'].value_counts(bins=10)

(2012.1, 2014.0]     303
(2008.3, 2010.2]     284
(2010.2, 2012.1]     255
(2006.4, 2008.3]     237
(2002.6, 2004.5]     208
(1998.8, 2000.7]     203
(2004.5, 2006.4]     181
(2000.7, 2002.6]     177
(1994.98, 1996.9]    175
(1996.9, 1998.8]     168
Name: transactions_start_year, dtype: int64

In [441]:
# Keeping only start years inside the 2000-2014 window (both ends inclusive),
# the period covered by the Chinese official finance data. The upper bound is
# enforced explicitly so later years cannot slip in if the source is refreshed.
world_bank = world_bank[world_bank['transactions_start_year'].between(2000, 2014)]

In [442]:
# Looking at the distribution of start years (10 bins) after the year filter
world_bank['transactions_start_year'].value_counts(bins=10)

(2012.6, 2014.0]                303
(2009.8, 2011.2]                289
(2005.6, 2007.0]                220
(1999.9850000000001, 2001.4]    213
(2002.8, 2004.2]                208
(2008.4, 2009.8]                131
(2011.2, 2012.6]                119
(2007.0, 2008.4]                118
(2001.4, 2002.8]                 81
(2004.2, 2005.6]                 80
Name: transactions_start_year, dtype: int64

In [443]:
# Checking how many rows are left after the year filter
world_bank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1762 entries, 0 to 5874
Data columns (total 5 columns):
project_title              1762 non-null object
recipients                 1762 non-null object
ad_sector_names            1762 non-null object
transactions_start_year    1762 non-null int64
total_commitments          1762 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 82.6+ KB


In [444]:
# Saving the cleaned World Bank data to a csv; index=False leaves the row index out of the file
world_bank.to_csv('./aid_data/aid_data_wm/world_bank_clean.csv', index=False)

In [445]:
# Previewing the first rows of the Chinese official finance data again
chinese_official_finance.head()

Unnamed: 0,all_recipients,crs_sector_name,flow,flow_class,funding_agency,intent,location_details,latitude,longitude,place_name,year,project_title,recipient_condensed,round_coded,aid_amount,predicted_by_modeling,aid_in_millions
0,Mauritania,Transport and Storage,Loan (excluding debt rescheduling),ODA-like,"Export-Import Bank of China, Government Agency",Development,Nouakchott,18.08581,-15.9785,Nouakchott,2008,China issues 2 billion yuan loan to fund Port ...,Mauritania,ChinatoAfrica,396886331.0,False,396.886331
1,Angola,Emergency Response,Grant,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,Bie Province,-12.34989,17.3031,Prov√≠ncia do Bi√©,2001,"China grants $600,000 USD in food aid for floo...",Angola,ChinatoAfrica,1364094.0,False,1.364094
2,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-24.60166,24.7281,Jwaneng,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False,51.378371
3,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-25.22435,25.67728,Lobatse,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False,51.378371
4,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-24.65451,25.90859,Gaborone,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False,51.378371


In [446]:
# Total aid_amount per recipient, as a Series indexed by recipient_condensed.
# NOTE(review): the preview of this frame shows one project repeated on
# several rows (one per place_name) with the same aid_amount; if aid_amount is
# the whole project value, this sum counts such projects more than once --
# TODO confirm.
chinese_aid_sums = (
    chinese_official_finance
    .groupby('recipient_condensed')['aid_amount']
    .sum()
)

In [447]:
# Inspecting the per-recipient totals
chinese_aid_sums

recipient_condensed
Africa, regional        1.554420e+10
Algeria                 5.674971e+08
Angola                  3.824927e+10
Bahrain                 2.108542e+07
Benin                   1.212040e+09
                            ...     
Uganda                  3.797628e+09
United Arab Emirates    3.177106e+07
Yemen                   4.056454e+08
Zambia                  1.067845e+10
Zimbabwe                2.340166e+10
Name: aid_amount, Length: 61, dtype: float64

In [448]:
# Converting the Series of totals into a single-column dataframe
chinese_aid_sums = chinese_aid_sums.to_frame()

In [449]:
# Giving the totals column a name that identifies the donor
chinese_aid_sums = chinese_aid_sums.rename(
    {'aid_amount': 'chinese_aid_totals'},
    axis='columns',
)

In [450]:
# Previewing the renamed totals frame
chinese_aid_sums.head()

Unnamed: 0_level_0,chinese_aid_totals
recipient_condensed,Unnamed: 1_level_1
"Africa, regional",15544200000.0
Algeria,567497100.0
Angola,38249270000.0
Bahrain,21085420.0
Benin,1212040000.0


In [451]:
# Saving the totals to a csv; index=True writes the recipient_condensed index as a column
chinese_aid_sums.to_csv('./aid_data/aid_data_wm/chinese_aid_sums.csv', index=True)

In [452]:
# Total commitments per recipient, as a Series indexed by recipients
world_bank_sums = (
    world_bank
    .groupby('recipients')['total_commitments']
    .sum()
)

In [453]:
# Converting the Series of totals into a single-column dataframe
world_bank_sums = world_bank_sums.to_frame()

In [454]:
# Giving the totals column a name that identifies the donor
world_bank_sums = world_bank_sums.rename(
    {'total_commitments': 'world_bank_totals'},
    axis='columns',
)

In [455]:
# Previewing the World Bank totals frame
world_bank_sums.head()

Unnamed: 0_level_0,world_bank_totals
recipients,Unnamed: 1_level_1
Africa,3875336000.0
Algeria,438050500.0
Angola,803087300.0
Benin,1107820000.0
Botswana,385871900.0


In [456]:
# Checking the shape: one row per recipient, one totals column
world_bank_sums.shape

(58, 1)

In [457]:
# Saving the World Bank totals to a csv; index=True writes the recipients index as a column
world_bank_sums.to_csv('./aid_data/aid_data_wm/world_bank_sums.csv', index=True)

### Exploratory yearly totals (everything below this heading might be removed)

In [179]:
# Previewing the Chinese official finance data before building yearly totals
chinese_official_finance.head()

Unnamed: 0,all_recipients,crs_sector_name,flow,flow_class,funding_agency,intent,location_details,latitude,longitude,place_name,year,project_title,recipient_condensed,round_coded,aid_amount,predicted_by_modeling
0,Mauritania,Transport and Storage,Loan (excluding debt rescheduling),ODA-like,"Export-Import Bank of China, Government Agency",Development,Nouakchott,18.08581,-15.9785,Nouakchott,2008,China issues 2 billion yuan loan to fund Port ...,Mauritania,ChinatoAfrica,396886331.0,False
1,Angola,Emergency Response,Grant,ODA-like,"Unspecified Chinese Government Institution, Go...",Development,Bie Province,-12.34989,17.3031,Prov√≠ncia do Bi√©,2001,"China grants $600,000 USD in food aid for floo...",Angola,ChinatoAfrica,1364094.0,False
2,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-24.60166,24.7281,Jwaneng,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False
3,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-25.22435,25.67728,Lobatse,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False
4,Botswana,Other Social infrastructure and services,Loan (excluding debt rescheduling),ODA-like,"Unspecified Chinese Government Institution, Go...",Development,"Maun, Jwaneng, Gaborone, Lobatse, Francistown ...",-24.65451,25.90859,Gaborone,2004,China loans 117 million BWP for medium and low...,Botswana,ChinatoAfrica,51378371.0,False


In [180]:
# Total aid_amount per recipient and year, as a Series with a two-level index.
# NOTE(review): the preview of this frame shows one project repeated on
# several rows (one per place_name) with the same aid_amount; if aid_amount is
# the whole project value, this sum counts such projects more than once --
# TODO confirm.
chinese_yearly_sums = (
    chinese_official_finance
    .groupby(['recipient_condensed', 'year'])['aid_amount']
    .sum()
)

In [181]:
# Converting the Series of yearly totals into a single-column dataframe
chinese_yearly_sums = chinese_yearly_sums.to_frame()

In [182]:
# Saving the yearly totals to a csv; index=True writes the recipient and year index levels as columns
chinese_yearly_sums.to_csv('./aid_data/aid_data_wm/chinese_yearly_sums.csv', index=True)

In [183]:
# Previewing the cleaned World Bank data before building yearly totals
world_bank.head()

Unnamed: 0,project_title,recipients,ad_sector_names,transactions_start_year,total_commitments
0,ML-Strengthening Reproductive Health,Mali,"Health, general|Health",2011,30000000.0
6,Re-engagement and Reform Support Program,Liberia,"Government and civil society, general",2007,458870300.0
10,AFCC2/RI-East Africa Public Health Laboratory ...,Eastern Africa,Communications|Health,2010,65017080.0
14,Increased Access to Modern Energy,Benin,Energy generation and supply,2009,72447530.0
18,Local Government Management and Services Deliv...,Uganda,Other social infrastructure and services|Gover...,2007,58692710.0


In [184]:
# Total commitments per recipient and start year, as a Series with a two-level index
world_bank_yearly_sums = (
    world_bank
    .groupby(['recipients', 'transactions_start_year'])['total_commitments']
    .sum()
)

In [185]:
# Converting the Series of yearly totals into a single-column dataframe
world_bank_yearly_sums = world_bank_yearly_sums.to_frame()

In [186]:
# Saving the yearly totals to a csv; index=True writes the recipient and year index levels as columns
world_bank_yearly_sums.to_csv('./aid_data/aid_data_wm/world_bank_yearly_sums.csv', index=True)