# Data Cleaning

## Government Spending

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw government-spending CSV, then show its size and first rows.
raw_spend_path = '../data/uncleaned/government_spend.csv'
spend = pd.read_csv(raw_spend_path)
print(spend.shape)
spend.head()

(10466, 8)


Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,GGEXP,TOT,THND_USD_CAP,A,2007,13737.93,
1,AUS,GGEXP,TOT,THND_USD_CAP,A,2008,14835.57,
2,AUS,GGEXP,TOT,THND_USD_CAP,A,2009,15963.85,
3,AUS,GGEXP,TOT,THND_USD_CAP,A,2010,15802.56,
4,AUS,GGEXP,TOT,THND_USD_CAP,A,2011,16535.76,


In [3]:
# List the distinct reporting frequencies present in the data.
spend['FREQUENCY'].unique()

array(['A'], dtype=object)

In [4]:
# List the distinct measures present in the data.
spend['MEASURE'].unique()

array(['THND_USD_CAP', 'PC_GDP'], dtype=object)

In [5]:
# List the distinct subjects present in the data.
spend['SUBJECT'].unique()

array(['TOT', 'RECULTREL', 'HOUCOMM', 'PUBORD', 'EDU', 'ENVPROT',
       'GRALPUBSER', 'SOCPROT', 'ECOAFF', 'DEF', 'HEALTH'], dtype=object)

In [6]:
# List the distinct indicators present in the data.
spend['INDICATOR'].unique()

array(['GGEXP'], dtype=object)

This data is for the general government expenditure for all the countries in OECD. The column `Flag Codes` is not needed for our purpose, and since this data is all annual, we can also drop the column `FREQUENCY`. The column `INDICATOR` also has only 1 value, so we drop that too.

In [7]:
# Remove the columns that carry no information for this analysis.
unused_columns = ['Flag Codes', 'FREQUENCY', 'INDICATOR']
spend = spend.drop(columns=unused_columns)

The dataframe now looks like this:

In [8]:
# Preview the first rows of the trimmed dataframe.
spend.head()

Unnamed: 0,LOCATION,SUBJECT,MEASURE,TIME,Value
0,AUS,TOT,THND_USD_CAP,2007,13737.93
1,AUS,TOT,THND_USD_CAP,2008,14835.57
2,AUS,TOT,THND_USD_CAP,2009,15963.85
3,AUS,TOT,THND_USD_CAP,2010,15802.56
4,AUS,TOT,THND_USD_CAP,2011,16535.76


In [9]:
# Distinct measures, used to decide which rows to keep.
spend['MEASURE'].unique()

array(['THND_USD_CAP', 'PC_GDP'], dtype=object)

In [10]:
# Distinct subjects, used to decide which rows to keep.
spend['SUBJECT'].unique()

array(['TOT', 'RECULTREL', 'HOUCOMM', 'PUBORD', 'EDU', 'ENVPROT',
       'GRALPUBSER', 'SOCPROT', 'ECOAFF', 'DEF', 'HEALTH'], dtype=object)

We are only concerned with `THND_USD_CAP`, so we can drop all rows with `MEASURE == PC_GDP`.
We can also see that there are a lot of different elements under `SUBJECT`. For our case, we are only concerned with `TOT`.

In [11]:
# Keep only rows measured in THND_USD_CAP for the TOT subject.
is_target_measure = spend['MEASURE'] == 'THND_USD_CAP'
is_total_subject = spend['SUBJECT'] == 'TOT'
spend = spend.loc[is_target_measure & is_total_subject]

Now, we can remove the columns `SUBJECT` and `MEASURE` from the dataframe, since each of those columns now holds only one value.

In [12]:
# SUBJECT and MEASURE are constant after filtering, so remove them.
constant_columns = ['SUBJECT', 'MEASURE']
spend = spend.drop(columns=constant_columns)

Checking the unique values in the columns:

In [13]:
# Count the distinct values in each remaining column.
spend.nunique()

LOCATION     45
TIME         15
Value       654
dtype: int64

And now we check the shape:

In [14]:
# (rows, columns) of the dataframe at this point.
spend.shape

(654, 3)

We check the datatypes of the columns below:

In [15]:
# Inspect the dtype of each column.
spend.dtypes

LOCATION     object
TIME          int64
Value       float64
dtype: object

We need `LOCATION` and `TIME` to be strings, so we convert them here. 

In [16]:
# Cast both identifier columns to pandas' string dtype in a single call.
spend = spend.astype({'LOCATION': 'string', 'TIME': 'string'})

Checking the datatypes again:

In [17]:
# Inspect the dtype of each column again.
spend.dtypes

LOCATION    string[python]
TIME        string[python]
Value              float64
dtype: object

We reset the index because we have removed a number of rows from the dataframe.

In [18]:
# Renumber rows from 0 and discard the old index left over from filtering.
spend = spend.reset_index(drop=True)

We check if there are any null values in the dataframe:

In [19]:
# Count missing values per column.
spend.isna().sum()

LOCATION    0
TIME        0
Value       0
dtype: int64

And the dataset now looks like this:

In [20]:
# Preview the first rows of the dataframe.
spend.head()

Unnamed: 0,LOCATION,TIME,Value
0,AUS,2007,13737.93
1,AUS,2008,14835.57
2,AUS,2009,15963.85
3,AUS,2010,15802.56
4,AUS,2011,16535.76


We now need to check if we are missing any data. For each unique value of `LOCATION` in our dataframe, we find the first year for which data is available, the last year for which data is available, and any missing years within this range.

In [21]:
# Per-country coverage report: first year, last year, and any gap years.
# Also track the latest first-year and the earliest last-year seen across
# all countries, together with the country responsible for each.
highestmin = 1
highestmincountry = ''
lowestmax = 9999
lowestmaxcountry = ''

# groupby(sort=False) yields each country once, in order of first appearance
# (the order unique() would give), without re-filtering the frame per country.
for country, tempdf in spend.groupby('LOCATION', sort=False):
    # Convert the years to int so min/max compare numerically rather than
    # lexicographically; a set also gives fast membership tests below.
    years = {int(year) for year in tempdf['TIME']}
    minyear = min(years)
    maxyear = max(years)

    # Strict comparisons keep the first country that reaches each extreme.
    if highestmin < minyear:
        highestmin = minyear
        highestmincountry = country

    if lowestmax > maxyear:
        lowestmax = maxyear
        lowestmaxcountry = country

    # Build the gap list in one pass instead of concatenating a Series per
    # missing year. Only interior years can be missing: minyear and maxyear
    # are present by construction.
    missingyear = pd.Series(
        [str(year) for year in range(minyear + 1, maxyear) if year not in years],
        dtype='string',
    )

    print('Country = ', country, 'Minimum Year = ',minyear, 'Maximum Year = ', maxyear, 'Missing Years = ', missingyear)

print('Highest Min = ', highestmin, ' Country = ', highestmincountry)
print('Lowest Max = ', lowestmax, ' Country = ', lowestmaxcountry)

Country =  AUS Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  AUT Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  BEL Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  CAN Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  CZE Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  DNK Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  FIN Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  FRA Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  DEU Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  GRC Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)


Finally, we export the cleaned dataset into a new csv file that is placed in the `data/cleaned` directory.

In [22]:
from pathlib import Path

# Export the cleaned dataframe to CSV without the index column.
filepath = Path('../data/cleaned/government_spend_cleaned.csv')
# to_csv does not create missing directories, so make sure the target exists.
filepath.parent.mkdir(parents=True, exist_ok=True)
# Use the default text write mode: a binary mode is only relevant when
# handing pandas an already-open binary file object, not a path.
spend.to_csv(filepath, index=False)