In [81]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# dashboard
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.graph_objs as go
import plotly.offline as pyo
from jupyter_plotly_dash import JupyterDash
from pprint import pprint
import pandas as pd
import numpy as np

In [4]:
# Load the AidData research release from the CSV next to this notebook,
# then preview the first five rows.
aid_data = pd.read_csv(
    "./AidDataCoreThin_ResearchRelease_Level1_v3.1.csv"
)
aid_data.head(5)

Unnamed: 0,aiddata_id,aiddata_2_id,year,donor,recipient,commitment_amount_usd_constant,coalesced_purpose_code,coalesced_purpose_name
0,1,,2003,African Development Bank (AFDB),Togo,29589911,24030,Formal sector financial intermediaries
1,2,,1990,African Development Bank (AFDB),Burundi,9713596,31100,"Agriculture, combinations of purposes in Agric..."
2,3,,1991,African Development Bank (AFDB),Cote d`Ivoire,148139421,31120,Agricultural development
3,4,,1992,African Development Bank (AFDB),Cameroon,24693752,31120,Agricultural development
4,5,,1992,African Development Bank (AFDB),Gabon,82312507,31100,"Agriculture, combinations of purposes in Agric..."


In [20]:
# Quick structural overview: (rows, columns), missing values per column,
# and the dtype pandas inferred for each column.
print(
    aid_data.shape,
    aid_data.isna().sum(),
    aid_data.dtypes,
    sep="\n",
)

(1561039, 8)
aiddata_id                             0
aiddata_2_id                      569672
year                                   0
donor                                  0
recipient                              0
commitment_amount_usd_constant         0
coalesced_purpose_code                 0
coalesced_purpose_name                 0
dtype: int64
aiddata_id                          int64
aiddata_2_id                      float64
year                                int64
donor                              object
recipient                          object
commitment_amount_usd_constant      int64
coalesced_purpose_code              int64
coalesced_purpose_name             object
dtype: object


In [31]:
# Number of distinct (non-null) values in every column.
aid_data.nunique()

aiddata_id                        1561039
aiddata_2_id                       991298
year                                   68
donor                                  96
recipient                             252
commitment_amount_usd_constant     542121
coalesced_purpose_code                268
coalesced_purpose_name                480
dtype: int64

There are a total of 96 donors and 252 recipients. There are only 195 countries in the world, so there must be some smaller nations included. There are also fewer purpose codes than purpose names, so some codes must have multiple names.

In [57]:
# Group by purpose code and select the codes with multiple names.
# `gb` stays a notebook-level name in case other cells reuse it.
gb = aid_data.groupby("coalesced_purpose_code")

# Count the distinct names per code in one vectorised pass instead of
# iterating over every group's sub-frame in Python.
names_per_code = gb["coalesced_purpose_name"].nunique()

# Codes attached to more than one name, in ascending code order
# (groupby sorts its keys by default).
diff_names = names_per_code[names_per_code > 1].index.tolist()

print(diff_names)

[11110, 11120, 11130, 11182, 11220, 11230, 11240, 11320, 11330, 11420, 11430, 12110, 12181, 12182, 12191, 12220, 12230, 12240, 12250, 12261, 12281, 13010, 13020, 13030, 13040, 13081, 14010, 14020, 14030, 14040, 14050, 14081, 15110, 15120, 15130, 15140, 15150, 15161, 15162, 15163, 15164, 15230, 15240, 15250, 15261, 16010, 16020, 16030, 16050, 16061, 16062, 16063, 16064, 21010, 21020, 21030, 21040, 21050, 21061, 21081, 22010, 22020, 22030, 22040, 23010, 23020, 23030, 23040, 23050, 23062, 23063, 23064, 23065, 23067, 23068, 23070, 23081, 23082, 24010, 24020, 24030, 24040, 24081, 25010, 25020, 31110, 31120, 31150, 31161, 31163, 31164, 31181, 31182, 31191, 31192, 31195, 31210, 31220, 31281, 31310, 31320, 32110, 32120, 32130, 32140, 32161, 32162, 32163, 32164, 32165, 32166, 32167, 32169, 32171, 32172, 32182, 32210, 32220, 32261, 32262, 32310, 33110, 33120, 33130, 33140, 33181, 33210, 41010, 41020, 41030, 41040, 41050, 41081, 41082, 43010, 43050, 43081, 51010, 52010, 53030, 53040, 60010, 60020

In [93]:
# Most frequent purpose name recorded for purpose code 33120.
aid_data.loc[
    aid_data["coalesced_purpose_code"] == 33120, "coalesced_purpose_name"
].value_counts().idxmax()

'Trade facilitation'

It looks like the purpose names are dirty, which is a problem if we want to use the name in a bar chart. Even if we use the purpose code as the nominal variable for categorical graphs, there will be some inconsistencies when we map the code back to its name.

In [97]:
# Map every ambiguous purpose code to its most frequently used name.
# One groupby over the affected rows replaces re-filtering the whole frame
# once per code.
corrected_names_dict = (
    aid_data[aid_data["coalesced_purpose_code"].isin(diff_names)]
    .groupby("coalesced_purpose_code")["coalesced_purpose_name"]
    # value_counts().idxmax() picks the name with the highest count per code
    .agg(lambda names: names.value_counts().idxmax())
    .to_dict()
)

In [139]:
# Boolean mask of the rows whose purpose code carries more than one name.
dirty_idx = aid_data.coalesced_purpose_code.isin(diff_names)

# Overwrite the names on those rows with the canonical name for their code.
# Series.map does the lookup in one vectorised step instead of a Python loop
# over itertuples(); `.values` makes the assignment positional, matching the
# original list-based assignment.
# NOTE: assumes corrected_names_dict has an entry for every code in
# diff_names (true when it is built from diff_names).
aid_data.loc[dirty_idx, "coalesced_purpose_name"] = (
    aid_data.loc[dirty_idx, "coalesced_purpose_code"]
    .map(corrected_names_dict)
    .values
)

In [140]:
# Country names with their codes, fetched from the plotly datasets repository.
# The `df` alias is kept because other cells may refer to it.
country_code = df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')

# The original last line was `aid-data`, which Python parses as `aid - data`
# and which raises NameError on a fresh kernel.
# NOTE(review): replaced with the check that matches this cell's recorded
# output (a single name left for code 33120 after cleaning) - confirm this was
# the intended expression.
aid_data.loc[
    aid_data["coalesced_purpose_code"] == 33120, "coalesced_purpose_name"
].value_counts()

Trade facilitation    3441
Name: coalesced_purpose_name, dtype: int64

In [144]:
# Number of records per donor, most frequent donor first.
aid_data.donor.value_counts()

United States                                                                             208224
Spain                                                                                     122550
Japan                                                                                     111188
Germany                                                                                   104900
France                                                                                     89029
United Nations Children`s Fund (UNICEF)                                                    71585
Norway                                                                                     68980
Canada                                                                                     61661
Belgium                                                                                    55598
Sweden                                                                                     55339
United Kingdom                