In [195]:
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

In [287]:
# Load the raw World Bank project records for the initial inspection.
# The encoding is passed explicitly because JSON text is UTF-8, while a bare
# open() falls back to the platform's default encoding and can fail (or
# mis-decode) on non-ASCII characters.
with open('world_bank_projects.json', encoding='utf-8') as projects_file:
    data = json.load(projects_file)
# Flatten the loaded records into a DataFrame.
df = json_normalize(data)

# 1. Find the 10 countries with the most projects

In [288]:
# Count the projects per country and rank the countries by that count.
# The frame is indexed by 'countryname' so the grouping runs on the index.
# The guard keeps this cell re-runnable: once 'countryname' has become the
# index it is no longer a column, and an unconditional set_index would
# raise a KeyError on the second run.
if 'countryname' in df.columns:
    df = df.set_index('countryname')

# count() tallies the non-null 'project_name' entries of each country; the
# result is sorted in descending order and given presentable labels.
sorted_by_projects = (
    df[['project_name']]
    .groupby('countryname')
    .count()
    .sort_values('project_name', ascending=False)
    .rename(columns={'project_name': 'Project Count'})
    .rename_axis('Country')
)

# It's worth noting that 'Africa' is a continent, not a country, yet it is
# present in the table. It is not removed, in order to keep the initial
# integrity of the data. Eleven rows are shown instead of ten so that the
# actual tenth country stays visible (Burkina Faso, per the original run).
sorted_by_projects.head(11)

Unnamed: 0_level_0,Project Count
Country,Unnamed: 1_level_1
People's Republic of China,19
Republic of Indonesia,19
Socialist Republic of Vietnam,17
Republic of India,16
Republic of Yemen,13
People's Republic of Bangladesh,12
Nepal,12
Kingdom of Morocco,12
Republic of Mozambique,11
Africa,11


# 2. Find the top 10 major project themes (using column 'mjtheme_namecode')

## In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.


In [285]:
# Flatten the 'mjtheme_namecode' records of all projects into one frame:
# 'code' holds the theme code and 'name' the matching project theme
# (the name is blank in some rows, as the preview shows).
df2 = json_normalize(data, record_path='mjtheme_namecode')
df2.head()

Unnamed: 0,code,name
0,8,Human development
1,11,
2,1,Economic management
3,6,Social protection and risk management
4,5,Trade and integration


In [283]:
# Build the code -> theme-name lookup from the rows whose name is present.
# Zipping the raw columns lets a later blank name overwrite an earlier valid
# one for the same code, which made the result depend on row order and
# required patching individual codes by hand. Dropping the blank names first
# yields the complete mapping directly from the data.
name_code_dict = (
    df2[df2['name'].fillna('') != '']
    .drop_duplicates('code')
    .set_index('code')['name']
    .to_dict()
)

# Every code that occurs in the data must have a name; otherwise the
# fill-in step that uses this lookup would leave gaps.
missing_codes = set(df2['code']) - set(name_code_dict)
assert not missing_codes, 'theme codes without a name: %s' % sorted(missing_codes)

name_code_dict

{'8': 'Human development',
 '11': 'Environment and natural resources management',
 '1': 'Economic management',
 '6': 'Social protection and risk management',
 '5': 'Trade and integration',
 '2': 'Public sector governance',
 '7': 'Social dev/gender/inclusion',
 '4': 'Financial and private sector development',
 '10': 'Rural development',
 '9': 'Urban development',
 '3': 'Rule of law'}

In [284]:
# Overwrite the 'name' column of df2 with the theme looked up from each
# row's code, so that the blank names are filled in. Then count the rows
# per theme and order the themes from most to least frequent.
df2['name'] = df2['code'].map(name_code_dict)
top_project_themes = (
    df2.groupby('name')
    .count()
    .sort_values(by='code', ascending=False)
)
top_project_themes.head(10)

Unnamed: 0_level_0,code
name,Unnamed: 1_level_1
Environment and natural resources management,250
Rural development,216
Human development,210
Public sector governance,199
Social protection and risk management,168
Financial and private sector development,146
Social dev/gender/inclusion,130
Trade and integration,77
Urban development,50
Economic management,38


In [289]:
# It is worth noting that there are 11 project themes in total, so
# printing only the first 10 leaves exactly one out; here the entire
# dataframe is printed.
top_project_themes

Unnamed: 0_level_0,code
name,Unnamed: 1_level_1
Environment and natural resources management,250
Rural development,216
Human development,210
Public sector governance,199
Social protection and risk management,168
Financial and private sector development,146
Social dev/gender/inclusion,130
Trade and integration,77
Urban development,50
Economic management,38
