# Data Cleaning

## Government Spending

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw government-spending CSV, report its dimensions, then preview the first rows.
spend = pd.read_csv(filepath_or_buffer='../data/uncleaned/government_spend.csv')
print(spend.shape)
spend.head(n=5)

(10466, 8)


Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,GGEXP,TOT,THND_USD_CAP,A,2007,13737.93,
1,AUS,GGEXP,TOT,THND_USD_CAP,A,2008,14835.57,
2,AUS,GGEXP,TOT,THND_USD_CAP,A,2009,15963.85,
3,AUS,GGEXP,TOT,THND_USD_CAP,A,2010,15802.56,
4,AUS,GGEXP,TOT,THND_USD_CAP,A,2011,16535.76,


In [3]:
# List the distinct values found in the FREQUENCY column.
spend['FREQUENCY'].unique()

array(['A'], dtype=object)

This dataset contains general government expenditure for all the countries in the OECD. The `Flag Codes` column is not needed for our purposes, and since all of the data is annual (`FREQUENCY` is always `A`), we can drop `FREQUENCY` as well. We also drop `INDICATOR`, which is not used in any of the steps that follow.

In [4]:
# Remove the three columns that are not used further on.
spend = spend.drop(columns=['Flag Codes', 'FREQUENCY', 'INDICATOR'])

In [5]:
# Preview the first five rows after the column drop.
spend.head(n=5)

Unnamed: 0,LOCATION,SUBJECT,MEASURE,TIME,Value
0,AUS,TOT,THND_USD_CAP,2007,13737.93
1,AUS,TOT,THND_USD_CAP,2008,14835.57
2,AUS,TOT,THND_USD_CAP,2009,15963.85
3,AUS,TOT,THND_USD_CAP,2010,15802.56
4,AUS,TOT,THND_USD_CAP,2011,16535.76


In [6]:
# List the distinct values found in the MEASURE column.
spend['MEASURE'].unique()

array(['THND_USD_CAP', 'PC_GDP'], dtype=object)

In [7]:
# List the distinct values found in the SUBJECT column.
spend['SUBJECT'].unique()

array(['TOT', 'RECULTREL', 'HOUCOMM', 'PUBORD', 'EDU', 'ENVPROT',
       'GRALPUBSER', 'SOCPROT', 'ECOAFF', 'DEF', 'HEALTH'], dtype=object)

We are only concerned with `THND_USD_CAP`, so we can drop all rows with `MEASURE == PC_GDP`.
We can also see that there are a lot of different elements under `SUBJECT`. For our case, we are only concerned with `TOT`.

In [8]:
# Keep only the rows whose MEASURE is 'THND_USD_CAP' and whose SUBJECT is 'TOT'.
spend = spend.loc[spend['MEASURE'].eq('THND_USD_CAP') & spend['SUBJECT'].eq('TOT')]

Now, we can remove the column `SUBJECT` from the dataframe, since we have only one value in that column.

In [9]:
# SUBJECT was filtered down to a single value above, so the column is removed.
spend = spend.drop(columns=['SUBJECT'])

In [10]:
# Count the distinct values in each column.
spend.nunique(axis=0)

LOCATION     45
MEASURE       1
TIME         15
Value       654
dtype: int64

In [11]:
# Show the (rows, columns) dimensions of the filtered frame.
spend.shape

(654, 4)

In [12]:
# Show the dtype of each column before any conversion.
spend.dtypes

LOCATION     object
MEASURE      object
TIME          int64
Value       float64
dtype: object

In [13]:
# Convert the three label-like columns to pandas' dedicated 'string' dtype, one at a time.
for column in ('LOCATION', 'MEASURE', 'TIME'):
    spend[column] = spend[column].astype('string')

In [14]:
# Show the dtype of each column again to confirm the conversion.
spend.dtypes

LOCATION     string
MEASURE      string
TIME         string
Value       float64
dtype: object

In [15]:
# Renumber the rows from 0 and discard the old index left over from filtering.
spend = spend.reset_index(drop=True)

In [16]:
# Count the missing values in each column.
spend.isnull().sum()

LOCATION    0
MEASURE     0
TIME        0
Value       0
dtype: int64

In [17]:
# Preview the first five rows of the cleaned frame.
spend.head(n=5)

Unnamed: 0,LOCATION,MEASURE,TIME,Value
0,AUS,THND_USD_CAP,2007,13737.93
1,AUS,THND_USD_CAP,2008,14835.57
2,AUS,THND_USD_CAP,2009,15963.85
3,AUS,THND_USD_CAP,2010,15802.56
4,AUS,THND_USD_CAP,2011,16535.76


In [18]:
# Per-country coverage check: for every LOCATION, report the first and last
# year present and any years missing in between. Across all countries, also
# track the latest starting year and the earliest ending year.
highestmin = 1            # sentinel lower than any real year
highestmincountry = ''
lowestmax = 9999          # sentinel higher than any real year
lowestmaxcountry = ''

for country in spend['LOCATION'].unique():
    tempdf = spend[spend['LOCATION'] == country]

    # Compare years as integers: min()/max() on string values would be
    # lexicographic, which is only correct when every year has the same width.
    years = {int(year) for year in tempdf['TIME']}
    minyear = min(years)
    maxyear = max(years)

    # Strict comparisons keep the first country that reaches each extreme.
    if minyear > highestmin:
        highestmin = minyear
        highestmincountry = country

    if maxyear < lowestmax:
        lowestmax = maxyear
        lowestmaxcountry = country

    # Build the missing-years Series once instead of concatenating inside the
    # loop. maxyear is present by construction, so the half-open range suffices.
    missingyear = pd.Series(
        [str(year) for year in range(minyear, maxyear) if year not in years],
        dtype='string',
    )

    print('Country = ', country, 'Minimum Year = ',minyear, 'Maximum Year = ', maxyear, 'Missing Years = ', missingyear)

print('Highest Min = ', highestmin, ' Country = ', highestmincountry)
print('Lowest Max = ', lowestmax, ' Country = ', lowestmaxcountry)

Country =  AUS Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  AUT Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  BEL Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  CAN Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  CZE Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  DNK Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  FIN Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  FRA Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  DEU Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  GRC Minimum Year =  2007 Maximum Year =  2021 Missing Years =  Series([], dtype: string)


In [19]:
from pathlib import Path

# Write the cleaned frame to the temp data folder, leaving out the index column.
filepath = Path('../data/temp/government_spend_cleaned.csv')
# Create the output folder if it is missing so the write also works on a fresh checkout.
filepath.parent.mkdir(parents=True, exist_ok=True)
# The target is a path, so pandas opens it itself; the default text write mode is used.
spend.to_csv(filepath, index=False)