## Tax Revenue

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tax-revenue CSV, report its dimensions on stdout,
# then preview the first five rows as the cell's rich output.
tax_revenue = pd.read_csv('../data/tax_revenue.csv')
print(tax_revenue.shape)
tax_revenue.head(5)

(3549, 8)


Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,TAXREV,TOT,PC_GDP,A,1965,20.609,
1,AUS,TAXREV,TOT,PC_GDP,A,1966,19.829,
2,AUS,TAXREV,TOT,PC_GDP,A,1967,20.409,
3,AUS,TAXREV,TOT,PC_GDP,A,1968,20.34,
4,AUS,TAXREV,TOT,PC_GDP,A,1969,20.681,


In [3]:
# Distinct reporting frequencies present in the data.
tax_revenue['FREQUENCY'].unique()

array(['A'], dtype=object)

In [4]:
# Distinct values of the SUBJECT column.
tax_revenue['SUBJECT'].unique()

array(['TOT'], dtype=object)

This dataset contains tax revenue for all OECD countries. We do not need the `Flag Codes` column, and because every row is annual (`FREQUENCY` is always `A`), we can drop `FREQUENCY` as well. Since the whole dataset is about tax revenue, the `INDICATOR` column is redundant, and `SUBJECT` has only one value (`TOT`), so we drop both of those columns too.

In [5]:
# Remove the four columns that carry no information for this analysis.
tax_revenue = tax_revenue.drop(
    columns=['Flag Codes', 'FREQUENCY', 'INDICATOR', 'SUBJECT']
)

In [6]:
# Distinct measures the values are reported in.
tax_revenue['MEASURE'].unique()

array(['PC_GDP', 'MLN_USD'], dtype=object)

Here, we are only concerned with the measure `MLN_USD`, so we keep only those rows.

In [7]:
# Keep only the rows whose MEASURE is MLN_USD.
is_mln_usd = tax_revenue['MEASURE'].eq('MLN_USD')
tax_revenue = tax_revenue.loc[is_mln_usd]

Filtering removed many rows and left gaps in the index, so we reset it to a contiguous 0-based range.

In [8]:
# Renumber rows from 0 and discard the old (now gappy) index.
tax_revenue = tax_revenue.reset_index(drop=True)

In [9]:
# Preview the first five rows of the filtered frame.
tax_revenue.head(5)

Unnamed: 0,LOCATION,MEASURE,TIME,Value
0,AUS,MLN_USD,1965,5.608
1,AUS,MLN_USD,1966,5.996
2,AUS,MLN_USD,1967,6.631
3,AUS,MLN_USD,1968,7.405
4,AUS,MLN_USD,1969,8.488


In [10]:
# Number of distinct values in each column.
tax_revenue.nunique(axis=0)

LOCATION      39
MEASURE        1
TIME          57
Value       1684
dtype: int64

In [11]:
# Show the dtype pandas assigned to each column.
tax_revenue.dtypes

LOCATION     object
MEASURE      object
TIME          int64
Value       float64
dtype: object

In [12]:
# Cast the identifier columns (including the year) to pandas' dedicated
# string dtype in a single call; Value is left as float64.
tax_revenue = tax_revenue.astype(
    {'LOCATION': 'string', 'MEASURE': 'string', 'TIME': 'string'}
)

In [13]:
# Show each column's dtype again to verify the string conversion.
tax_revenue.dtypes

LOCATION     string
MEASURE      string
TIME         string
Value       float64
dtype: object

In [14]:
# (rows, columns) of the cleaned frame.
tax_revenue.shape

(1697, 4)

In [15]:
# Coverage check. For every country, print the first and last year that has
# data and any years missing in between. Along the way, track:
#   * highestmin / highestmincountry - the latest "first year" and its country
#   * lowestmax  / lowestmaxcountry  - the earliest "last year" and its country
highestmin = 1        # sentinel: lower than any real year
highestmincountry = ''
lowestmax = 9999      # sentinel: higher than any real year
lowestmaxcountry = ''

for country in tax_revenue.LOCATION.unique():
    tempdf = tax_revenue[tax_revenue['LOCATION']==country]

    # TIME may be stored as text; compare years numerically so min/max do not
    # depend on lexicographic string ordering.
    years = tempdf['TIME'].astype(int)

    minyear = int(years.min())
    if highestmin < minyear:
        highestmin = minyear
        highestmincountry = country

    maxyear = int(years.max())
    if lowestmax > maxyear:
        lowestmax = maxyear
        lowestmaxcountry = country

    # Collect absent years with one set lookup per candidate year and build the
    # Series once, instead of scanning the column and concatenating a
    # one-element Series for every gap. The end points are present by
    # construction, so the inclusive range yields the same gaps.
    present = set(years.tolist())
    missingyear = pd.Series(
        [str(year) for year in range(minyear, maxyear + 1) if year not in present],
        dtype='string',
    )

    print('Country = ', country, 'Minimum Year = ',minyear, 'Maximum Year = ', maxyear, 'Missing Years = ', missingyear)

print('Highest Min = ', highestmin, ' Country = ', highestmincountry)
print('Lowest Max = ', lowestmax, ' Country = ', lowestmaxcountry)

Country =  AUS Minimum Year =  1965 Maximum Year =  2020 Missing Years =  Series([], dtype: string)
Country =  AUT Minimum Year =  1979 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  BEL Minimum Year =  1979 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  CAN Minimum Year =  1965 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  CZE Minimum Year =  1993 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  DNK Minimum Year =  1965 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  FIN Minimum Year =  1979 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  FRA Minimum Year =  1979 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  DEU Minimum Year =  1979 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
Country =  GRC Minimum Year =  1979 Maximum Year =  2021 Missing Years =  Series([], dtype: string)
