# EDA and Cleaning: Transparency International Corruption Perceptions Index (CPI)

In [1]:
# Importing libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the CPI time-series sheet (2012 - 2019) from the Transparency International workbook
transparency_int = pd.read_excel(
    './aid_data/transparency_int/CPI2019.xlsx',
    sheet_name='CPI Timeseries 2012 - 2019',
)

About the CPI score: "The Corruption Perceptions Index ranks 180 countries and territories by their perceived levels of public sector corruption, according to experts and business people. This year’s analysis shows corruption is more pervasive in countries where big money can flow freely into electoral campaigns and where governments listen only to the voices of wealthy or well-connected individuals." A score close to 100 indicates a very clean government, and a score close to 0 indicates a very corrupt one. (Source: https://www.transparency.org/cpi2019)

In [3]:
# Preview the first five rows of the raw CPI table
transparency_int.head(5)

Unnamed: 0,Country,ISO3,Region,CPI score 2019,Rank 2019,Sources 2019,Standard error 2019,CPI score 2018,Rank 2018,Sources 2018,...,Standard error 2015,CPI score 2014,Sources 2014,Standard error 2014,CPI Score 2013,Sources 2013,Standard error 2013,CPI Score 2012,Sources 2012,Standard error 2012
0,New Zealand,NZL,AP,87,1,8,2.28649,87,2,8,...,2.32,91.0,7.0,2.28,91.0,7.0,2.3,90.0,7.0,2.2
1,Denmark,DNK,WE/EU,87,1,8,2.542474,88,1,8,...,2.16,92.0,7.0,2.04,91.0,7.0,2.2,90.0,7.0,2.0
2,Finland,FIN,WE/EU,86,3,8,2.924511,85,3,8,...,1.77,89.0,7.0,2.05,89.0,7.0,1.7,90.0,7.0,3.0
3,Switzerland,CHE,WE/EU,85,4,7,1.580087,85,3,7,...,2.55,86.0,6.0,2.61,85.0,6.0,2.5,86.0,6.0,2.6
4,Singapore,SGP,AP,85,4,9,2.0484,85,3,9,...,2.02,84.0,8.0,1.75,86.0,9.0,2.3,87.0,9.0,2.1


In [4]:
# Summarize each column's dtype and non-null count to see which years have gaps
transparency_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 30 columns):
Country                180 non-null object
ISO3                   180 non-null object
Region                 180 non-null object
CPI score 2019         180 non-null int64
Rank 2019              180 non-null int64
Sources 2019           180 non-null int64
Standard error 2019    180 non-null float64
CPI score 2018         180 non-null int64
Rank 2018              180 non-null int64
Sources 2018           180 non-null int64
Standard error 2018    180 non-null float64
CPI score 2017         180 non-null int64
Rank 2017              180 non-null int64
Sources 2017           180 non-null int64
Standard error 2017    180 non-null float64
CPI score 2016         176 non-null float64
Sources 2016           176 non-null float64
Standard error 2016    176 non-null float64
CPI score 2015         168 non-null float64
Sources 2015           168 non-null float64
Standard error 2015    168 non-null 

In [5]:
# Narrow the table to the country name, its region, and the 2014 CPI score
# (2014 is the year chosen for this analysis).
transparency_int = transparency_int.loc[:, ['Country', 'Region', 'CPI score 2014']]

In [6]:
# Confirm that only the three selected columns remain
transparency_int.head(5)

Unnamed: 0,Country,Region,CPI score 2014
0,New Zealand,AP,91.0
1,Denmark,WE/EU,92.0
2,Finland,WE/EU,89.0
3,Switzerland,WE/EU,86.0
4,Singapore,AP,84.0


In [7]:
# Count how many countries fall under each region code
transparency_int.loc[:, 'Region'].value_counts()

SSA      49
AME      32
WE/EU    31
AP       31
ECA      19
MENA     18
Name: Region, dtype: int64

In [8]:
# Subset: rows whose region code is SSA (Sub-Saharan Africa)
SSA = transparency_int.loc[transparency_int['Region'].eq('SSA')]

In [9]:
# Subset: rows whose region code is MENA (Middle East / North Africa)
MENA = transparency_int.loc[transparency_int['Region'].eq('MENA')]

In [10]:
# Subset: Turkey only. It is selected by name because this dataset groups it
# with Europe rather than under the SSA or MENA region codes.
Turkey = transparency_int.loc[transparency_int['Country'].eq('Turkey')]

In [11]:
# Stack the Sub-Saharan Africa rows on top of the MENA rows.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the original row index is kept.
cpi = pd.concat([SSA, MENA])

In [12]:
# Add the Turkey row to the end of the combined frame.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the original row index is kept.
# Note: this cell rebinds `cpi` from itself, so running it twice adds Turkey twice.
cpi = pd.concat([cpi, Turkey])

In [13]:
# Preview the first five rows of the combined SSA + MENA + Turkey frame
cpi.head(5)

Unnamed: 0,Country,Region,CPI score 2014
26,Seychelles,SSA,55.0
33,Botswana,SSA,63.0
42,Cabo Verde,SSA,57.0
50,Rwanda,SSA,49.0
55,Namibia,SSA,49.0


In [14]:
# Check dtypes and non-null counts of the combined frame to spot missing scores
cpi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68 entries, 26 to 91
Data columns (total 3 columns):
Country           68 non-null object
Region            68 non-null object
CPI score 2014    67 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.1+ KB


In [15]:
# Show every row that has a missing value in at least one column
cpi.loc[cpi.isna().any(axis='columns')]

Unnamed: 0,Country,Region,CPI score 2014
173,Equatorial Guinea,SSA,


In [16]:
# Impute the missing 2014 score. The null check above shows the only gap is
# Equatorial Guinea; the value 18 was chosen by the author from that country's
# scores in years that do have data (NOTE(review): confirm which year(s)).
# The fill is restricted to the score column so that a missing Country or
# Region would never be silently overwritten with the number 18.
cpi = cpi.fillna({'CPI score 2014': 18})

In [17]:
# List the distinct country names to check their spelling
cpi.loc[:, 'Country'].unique()

array(['Seychelles', 'Botswana', 'Cabo Verde', 'Rwanda', 'Namibia',
       'Mauritius', 'Sao Tome and Principe', 'Senegal', 'South Africa',
       'Ghana', 'Benin', 'Burkina Faso', 'Lesotho', 'Tanzania',
       'Ethiopia', 'Gambia', "Cote d'Ivoire", 'Eswatini', 'Zambia',
       'Sierra Leone', 'Niger', 'Gabon', 'Malawi', 'Djibouti', 'Togo',
       'Mali', 'Guinea', 'Liberia', 'Kenya', 'Mauritania', 'Uganda',
       'Nigeria', 'Mozambique', 'Angola', 'Comoros', 'Cameroon',
       'Central African Republic', 'Madagascar', 'Zimbabwe', 'Eritrea',
       'Chad', 'Burundi', 'Congo', 'Democratic Republic of the Congo',
       'Guinea Bissau', 'Equatorial Guinea', 'Sudan', 'South Sudan',
       'Somalia', 'United Arab Emirates', 'Qatar', 'Israel',
       'Saudi Arabia', 'Oman', 'Jordan', 'Tunisia', 'Bahrain', 'Morocco',
       'Kuwait', 'Algeria', 'Egypt', 'Lebanon', 'Iran', 'Iraq', 'Libya',
       'Yemen', 'Syria', 'Turkey'], dtype=object)

In [18]:
# Normalize four country spellings (presumably to match the names used by the
# other datasets in this project -- verify against those files).
country_name_fixes = {
    "Cote d'Ivoire": "Cote D'Ivoire",
    'Eswatini': 'Swaziland',
    'Democratic Republic of the Congo': 'Democratic Republic of Congo',
    'Guinea Bissau': 'Guinea-Bissau',
}
cpi = cpi.replace({'Country': country_name_fixes})

In [19]:
# Switch the country and score columns to short snake_case names
snake_case_names = {'Country': 'country', 'CPI score 2014': 'cpi_2014'}
cpi = cpi.rename(columns=snake_case_names)

In [20]:
# Confirm the renamed columns
cpi.head(5)

Unnamed: 0,country,Region,cpi_2014
26,Seychelles,SSA,55.0
33,Botswana,SSA,63.0
42,Cabo Verde,SSA,57.0
50,Rwanda,SSA,49.0
55,Namibia,SSA,49.0


In [21]:
# Remove the Region column, leaving only the country name and its score
cpi = cpi.drop('Region', axis=1)

In [22]:
# Final check of the cleaned frame before it is written out
cpi.head(5)

Unnamed: 0,country,cpi_2014
26,Seychelles,55.0
33,Botswana,63.0
42,Cabo Verde,57.0
50,Rwanda,49.0
55,Namibia,49.0


In [23]:
# Write the cleaned table to CSV without the row index
cpi.to_csv(
    './aid_data/transparency_int/cpi_clean.csv',
    index=False,
)