# EDA and Cleaning of World Bank Data

In [37]:
# Importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
# Load the per-capita GDP table from its CSV file
gdp_pc = pd.read_csv(filepath_or_buffer='./aid_data/gdp/per_cap_gdp.csv')

In [39]:
# Preview the first five rows of the GDP-per-capita frame
gdp_pc.head(n=5)

Unnamed: 0,Country Name,2014
0,Angola,3843.198241
1,United Arab Emirates,38495.04635
2,Burundi,245.3267387
3,Benin,834.4435964
4,Burkina Faso,639.7080956


In [40]:
# Check dtypes and non-null counts per column.
# NOTE(review): if '2014' is reported as object rather than a numeric dtype,
# the file holds at least one entry pandas could not parse as a number.
gdp_pc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
Country Name    68 non-null object
2014            68 non-null object
dtypes: object(2)
memory usage: 1.2+ KB


In [41]:
# Convert the 2014 column to a numeric dtype; any entry that cannot be parsed
# as a number becomes NaN (errors='coerce') instead of raising
gdp_pc = gdp_pc.assign(**{'2014': pd.to_numeric(gdp_pc['2014'], errors='coerce')})

In [42]:
# Re-check dtypes and non-null counts after the numeric conversion; a drop in
# the '2014' non-null count means a value was coerced to NaN
gdp_pc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
Country Name    68 non-null object
2014            67 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [43]:
# Show every row that has a missing value in at least one column
rows_with_nulls = gdp_pc.isna().any(axis='columns')
gdp_pc.loc[rows_with_nulls]

Unnamed: 0,Country Name,2014
14,Djibouti,


In [44]:
# Fill the missing 2014 GDP-per-capita figure by hand, matched on country name.
# A frame-wide fillna would write this number into every NaN in every column
# (including 'Country Name'); mapping by country touches only the rows listed
# here and leaves any other missing value visible for the next null check.
# The figure is the one found through the original author's own research;
# NOTE(review): the source was not recorded — add a citation.
manual_gdp_fill = {'Djibouti': 1595.15}
gdp_pc['2014'] = gdp_pc['2014'].fillna(gdp_pc['Country Name'].map(manual_gdp_fill))

In [45]:
# Confirm the fill worked: every column's non-null count should now equal the
# number of rows
gdp_pc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
Country Name    68 non-null object
2014            68 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [46]:
# Replace the year label with a descriptive column name
gdp_pc = gdp_pc.rename({'2014': 'gdp_per_cap'}, axis='columns')

In [47]:
# Load the table of resource rents as a proportion of GDP
resources = pd.read_csv(filepath_or_buffer='./aid_data/gdp/resource_rents.csv')

In [48]:
# Preview the first five rows of the resource-rents frame
resources.head(n=5)

Unnamed: 0,Country Name,2014
0,Angola,23.38193
1,United Arab Emirates,24.249507
2,Burundi,17.033314
3,Benin,4.872945
4,Burkina Faso,16.981603


In [49]:
# Check dtypes and non-null counts per column of the resource-rents frame
resources.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
Country Name    68 non-null object
2014            68 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [50]:
# Replace the year label with a descriptive column name
resources = resources.rename({'2014': 'resource_rents'}, axis='columns')

In [51]:
# Confirm the rename took effect
resources.head(n=5)

Unnamed: 0,Country Name,resource_rents
0,Angola,23.38193
1,United Arab Emirates,24.249507
2,Burundi,17.033314
3,Benin,4.872945
4,Burkina Faso,16.981603


In [52]:
# Load the population table from its CSV file
population = pd.read_csv(filepath_or_buffer='./aid_data/gdp/population.csv')

In [53]:
# Preview the first five rows of the population frame
population.head(n=5)

Unnamed: 0,Country Name,2014
0,Angola,26941779
1,United Arab Emirates,9214175
2,Burundi,9844297
3,Benin,10286842
4,Burkina Faso,17586017


In [54]:
# Check dtypes and non-null counts per column of the population frame
population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
Country Name    68 non-null object
2014            68 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.2+ KB


In [55]:
# Replace the year label with a descriptive column name
population = population.rename({'2014': 'population'}, axis='columns')

In [56]:
# Load the debt-to-GDP table; this file is decoded as latin-1 rather than
# with pandas' default encoding
debt_to_gdp = pd.read_csv(
    filepath_or_buffer='./aid_data/gdp/debt_to_gdp.csv',
    encoding='latin-1',
)

In [57]:
# Preview the first five rows of the debt-to-GDP frame
debt_to_gdp.head(n=5)

Unnamed: 0,Country,debt_to_gdp
0,Algeria,7.673
1,Angola,40.676
2,Bahrain,44.397
3,Benin,30.452
4,Botswana,17.346


In [58]:
# Check dtypes and non-null counts per column of the debt-to-GDP frame
debt_to_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 2 columns):
Country        68 non-null object
debt_to_gdp    68 non-null float64
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [59]:
# Name the country column 'Country Name' so it matches the key used by the
# other tables
debt_to_gdp = debt_to_gdp.rename({'Country': 'Country Name'}, axis='columns')

In [60]:
# List the distinct country names to spot spellings that need harmonising
pd.unique(debt_to_gdp['Country Name'])

array(['Algeria', 'Angola', 'Bahrain', 'Benin', 'Botswana',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon',
       'Central African Republic', 'Chad', 'Comoros',
       'Democratic Republic of the Congo', 'Republic of Congo',
       "Côte d'Ivoire", 'Djibouti', 'Egypt', 'Equatorial Guinea',
       'Eritrea', 'Ethiopia', 'Gabon', 'The Gambia', 'Ghana', 'Guinea',
       'Guinea-Bissau', 'Islamic Republic of Iran', 'Iraq', 'Israel',
       'Jordan', 'Kenya', 'Kuwait', 'Lebanon', 'Lesotho', 'Liberia',
       'Libya', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius',
       'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Oman',
       'Qatar', 'Rwanda', 'São Tomé and Príncipe', 'Saudi Arabia',
       'Senegal', 'Sierra Leone', 'Somalia', 'South Africa',
       'South Sudan', 'Sudan', 'Swaziland', 'Syria', 'Tanzania', 'Togo',
       'Tunisia', 'Turkey', 'Uganda', 'United Arab Emirates', 'Yemen',
       'Zambia', 'Zimbabwe', 'Palestine'], dtype=object)

In [61]:
# Map this table's country spellings onto the canonical names used for merging.
# Keys that do not occur in the column are simply ignored by replace().
debt_name_fixes = {
    "Côte d'Ivoire": "Cote D'Ivoire",
    'Democratic Republic of the Congo': 'Democratic Republic of Congo',
    'Eswatini': 'Swaziland',
    'Republic of Congo': 'Congo',
    'The Gambia': 'Gambia',
    'Islamic Republic of Iran': 'Iran',
    'São Tomé and Príncipe': 'Sao Tome and Principe',
}
debt_to_gdp = debt_to_gdp.replace({'Country Name': debt_name_fixes})

In [62]:
# Re-list the distinct country names to confirm the replacements applied
pd.unique(debt_to_gdp['Country Name'])

array(['Algeria', 'Angola', 'Bahrain', 'Benin', 'Botswana',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon',
       'Central African Republic', 'Chad', 'Comoros',
       'Democratic Republic of Congo', 'Congo', "Cote D'Ivoire",
       'Djibouti', 'Egypt', 'Equatorial Guinea', 'Eritrea', 'Ethiopia',
       'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Iran',
       'Iraq', 'Israel', 'Jordan', 'Kenya', 'Kuwait', 'Lebanon',
       'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi', 'Mali',
       'Mauritania', 'Mauritius', 'Morocco', 'Mozambique', 'Namibia',
       'Niger', 'Nigeria', 'Oman', 'Qatar', 'Rwanda',
       'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Sierra Leone',
       'Somalia', 'South Africa', 'South Sudan', 'Sudan', 'Swaziland',
       'Syria', 'Tanzania', 'Togo', 'Tunisia', 'Turkey', 'Uganda',
       'United Arab Emirates', 'Yemen', 'Zambia', 'Zimbabwe', 'Palestine'],
      dtype=object)

In [63]:
# Join population and GDP per capita on country name (inner join, the pandas
# default, so only countries present in both tables are kept)
wb_info = population.merge(gdp_pc, on='Country Name')

In [64]:
# Add the resource-rents column to the combined table (inner join on country name)
wb_info = wb_info.merge(resources, on='Country Name')

In [65]:
# Check the (rows, columns) of the combined frame; a row count lower than the
# input tables' would mean the inner joins dropped countries
wb_info.shape

(68, 4)

In [66]:
# List the distinct country names to compare against the debt-to-GDP spellings
pd.unique(wb_info['Country Name'])

array(['Angola', 'United Arab Emirates', 'Burundi', 'Benin',
       'Burkina Faso', 'Bahrain', 'Botswana', 'Central African Republic',
       "Cote d'Ivoire", 'Cameroon', 'Congo, Dem. Rep.', 'Congo, Rep.',
       'Comoros', 'Cabo Verde', 'Djibouti', 'Algeria', 'Egypt, Arab Rep.',
       'Eritrea', 'Ethiopia', 'Gabon', 'Ghana', 'Guinea', 'Gambia, The',
       'Guinea-Bissau', 'Equatorial Guinea', 'Iran, Islamic Rep.', 'Iraq',
       'Israel', 'Jordan', 'Kenya', 'Kuwait', 'Lebanon', 'Liberia',
       'Libya', 'Lesotho', 'Morocco', 'Madagascar', 'Mali', 'Mozambique',
       'Mauritania', 'Mauritius', 'Malawi', 'Namibia', 'Niger', 'Nigeria',
       'Oman', 'West Bank and Gaza', 'Qatar', 'Rwanda', 'Saudi Arabia',
       'Sudan', 'Senegal', 'Sierra Leone', 'Somalia', 'South Sudan',
       'Sao Tome and Principe', 'Eswatini', 'Syrian Arab Republic',
       'Chad', 'Togo', 'Tunisia', 'Turkey', 'Tanzania', 'Uganda',
       'Yemen, Rep.', 'South Africa', 'Zambia', 'Zimbabwe'], dtype=object)

In [67]:
# Map this table's country spellings onto the canonical names used for merging
# with the debt-to-GDP table
wb_name_fixes = {
    "Cote d'Ivoire": "Cote D'Ivoire",
    'Congo, Dem. Rep.': 'Democratic Republic of Congo',
    'Eswatini': 'Swaziland',
    'Congo, Rep.': 'Congo',
    'Gambia, The': 'Gambia',
    'Iran, Islamic Rep.': 'Iran',
    'Syrian Arab Republic': 'Syria',
    'Egypt, Arab Rep.': 'Egypt',
    'West Bank and Gaza': 'Palestine',
    'Yemen, Rep.': 'Yemen',
}
wb_info = wb_info.replace({'Country Name': wb_name_fixes})

In [68]:
# Outer-join the debt-to-GDP figures onto the combined table, so a country
# whose name fails to match stays visible as a row with missing values
# instead of being dropped.
# validate='one_to_one' makes pandas raise a MergeError if either side repeats
# a country name (e.g. two spellings renamed to the same canonical name),
# which would otherwise silently multiply rows.
wb_info = pd.merge(
    wb_info,
    debt_to_gdp,
    on=['Country Name'],
    how='outer',
    validate='one_to_one',
)

In [69]:
# Preview the first five rows of the fully merged frame
wb_info.head(n=5)

Unnamed: 0,Country Name,population,gdp_per_cap,resource_rents,debt_to_gdp
0,Angola,26941779,3843.198241,23.38193,40.676
1,United Arab Emirates,9214175,38495.04635,24.249507,15.537
2,Burundi,9844297,245.326739,17.033314,35.772
3,Benin,10286842,834.443596,4.872945,30.452
4,Burkina Faso,17586017,639.708096,16.981603,30.387


In [70]:
# Check dtypes and non-null counts after the outer merge; a column with fewer
# non-null entries than rows points to a country name that did not match
wb_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68 entries, 0 to 67
Data columns (total 5 columns):
Country Name      68 non-null object
population        68 non-null int64
gdp_per_cap       68 non-null float64
resource_rents    68 non-null float64
debt_to_gdp       68 non-null float64
dtypes: float64(3), int64(1), object(1)
memory usage: 3.2+ KB


In [71]:
# Shorten the key column's name to 'country'
wb_info = wb_info.rename({'Country Name': 'country'}, axis='columns')

In [72]:
# Confirm the column rename took effect
wb_info.head(n=5)

Unnamed: 0,country,population,gdp_per_cap,resource_rents,debt_to_gdp
0,Angola,26941779,3843.198241,23.38193,40.676
1,United Arab Emirates,9214175,38495.04635,24.249507,15.537
2,Burundi,9844297,245.326739,17.033314,35.772
3,Benin,10286842,834.443596,4.872945,30.452
4,Burkina Faso,17586017,639.708096,16.981603,30.387


In [73]:
# Write the combined table to disk, leaving out the index column
wb_info_path = './aid_data/gdp/wb_info.csv'
wb_info.to_csv(wb_info_path, index=False)