In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set('fivethirtyeight')
plt.style.use('fivethirtyeight') 
%matplotlib inline

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [95]:
path = 'Data/'

In [96]:
countries = (['Australia', 'Canada', 'Denmark', 'France', 'Germany',
              'Ireland', 'Italy', 'Japan', 'New Zealand', 'Norway', 
              'Spain', 'Sweden','Switzerland', 'United States', 
              'USA','United Kingdom'])

In [97]:
data_df = pd.read_csv(path+'WID_Data_national_income_top1p.csv')
data_df = data_df[data_df['Country'].isin(countries)]
print(data_df.shape)
data_df.head()

(774, 3)


Unnamed: 0,Country,Year,National_Income_Top_1
0,Australia,1965,0.0634
1,Australia,1966,0.0616
2,Australia,1967,0.0625
3,Australia,1968,0.0605
4,Australia,1969,0.0592


In [98]:
years = list(data_df.Year.unique())

In [99]:
tax_src_df = pd.read_csv(path + 'tax_revenue_sources.csv')
sorted(tax_src_df['Tax revenue'].unique())

  interactivity=interactivity, compiler=compiler, result=result)


['1000 Taxes on income, profits and capital gains',
 '1100 Taxes on income, profits and capital gains of individuals',
 '1200 Taxes on income, profits and capital gains of corporates',
 '1300 Unallocable between 1100 and 1200',
 '2000 Social security contributions (SSC)',
 '2100 Employees SSC',
 '2200 Employers SSC',
 '2300 Self-employed or non-employed SSC',
 '2400 Unallocable between 2100, 2200 and 2300 SSC',
 '3000 Taxes on payroll and workforce',
 '4000 Taxes on property',
 '4100 Recurrent taxes on immovable property',
 '4200 Recurrent taxes on net wealth',
 '4300 Estate, inheritance and gift taxes',
 '4400 Taxes on financial and capital transactions',
 '4500 Non-recurrent taxes on property',
 '4600 Other recurrent taxes on property except 4100 and 4200',
 '5000 Taxes on goods and services',
 '5100 Taxes on production, sale, transfer, etc',
 '5110 General taxes on goods and services',
 '5111 Value added taxes',
 '5120 Taxes on specific goods and services',
 '5121 Excises',
 '5200 T

In [100]:
tax_src_df = tax_src_df[tax_src_df['Country'].isin(countries)]
tax_src_df = tax_src_df[['TAX', 'Indicator', 'Country', 'Year', 'Value']]
tax_src_df['TAX'] = tax_src_df['TAX'].astype(str)
print(tax_src_df.shape)
tax_src_df.head()


(41956, 5)


Unnamed: 0,TAX,Indicator,Country,Year,Value
0,TOTALTAX,Tax revenue in national currency,Australia,1965,5.024
1,TOTALTAX,Tax revenue in national currency,Australia,1966,5.389
2,TOTALTAX,Tax revenue in national currency,Australia,1967,5.96
3,TOTALTAX,Tax revenue in national currency,Australia,1968,6.661
4,TOTALTAX,Tax revenue in national currency,Australia,1969,7.631


In [101]:
tax_src_df.Country.unique()

array(['Australia', 'Canada', 'Denmark', 'France', 'Germany', 'Ireland',
       'Italy', 'Japan', 'New Zealand', 'Norway', 'Spain', 'Sweden',
       'Switzerland', 'United Kingdom', 'United States'], dtype=object)

### Wrangle Sources of Tax Revenue

This file contains the amount of tax revenue from different sources per year.  We will change the absolute numbers to relative percentages.

In [102]:
def get_percentages(df):
    #Need to check for the case when TOTALTAX is not in the DF
    #Value of the total tax revenue as % of GDP
    try: gdp = float(df.loc[((df.Indicator == 'Tax revenue as % of GDP') & 
                 (df.TAX == 'TOTALTAX'), 'Value')])
    except:
        return pd.DataFrame()
    #Value of total tax revenue
    total = float(df.loc[((df.Indicator == 'Tax revenue in national currency') & 
                 (df.TAX == 'TOTALTAX'), 'Value')])

    df = df[df['Indicator'] != 'Tax revenue as % of GDP']

    df = df[((df['TAX'] != '1000') &
            (df['TAX'] != '2000') &
            (df['TAX'] != '3000') &
            (df['TAX'] != '4000') &
            (df['TAX'] != '5000') &
            (df['TAX'] != 'TOTALTAX'))]
    
    df.sort_values('TAX', inplace=True)
    df.reset_index(drop=True, inplace=True)    

    #Create an empy dataframe to store results
    result = pd.DataFrame()

    #For each tax, calculate it's percentage relative to the total
    for i in range(len(df)):
        tax = df.loc[i, 'TAX']
        value = df.loc[i, 'Value']
        percent = value / total * 100
        result[tax] = pd.Series(percent)    
    result['Tax_Rev_to_GDP'] = pd.Series(gdp)
    result['Year'] = df.loc[0,'Year']
    return result


In [103]:
percentages_df = pd.DataFrame([])

for country in tax_src_df.Country.unique():
    curr_df = tax_src_df[tax_src_df['Country'] == country]
    temp_percentages = pd.DataFrame()
    for year in years:
        if year in curr_df.Year.unique():
            temp_df = pd.DataFrame([])
            temp_df = curr_df[curr_df['Year'] == year]
            temp_df = get_percentages(temp_df)
            # If percentages could not be calculated empty DF is returned
            if len(temp_df) != 0:
                temp_percentages = temp_percentages.append(temp_df)
            else:
                temp_percentages = temp_percentages.append(
                                                pd.DataFrame([year], 
                                                   columns=['Year']))
        else:
            temp_percentages = temp_percentages.append(
                                                pd.DataFrame([year], 
                                                   columns=['Year']))
    temp_percentages['Country'] = pd.Series(
                            [country]*len(temp_percentages))
    percentages_df = percentages_df.append(temp_percentages)
percentages_df.fillna(0, inplace=True)     
    
print(percentages_df.shape)
percentages_df.reset_index(drop=True, inplace=True)
percentages_df.head()

(780, 24)


Unnamed: 0,1100,1200,1300,2100,2200,2300,2400,4100,4200,4300,...,5110,5111,5120,5121,5200,5300,6000,Tax_Rev_to_GDP,Year,Country
0,34.414809,16.281847,0.0,0.0,0.0,0.0,0.0,6.78742,0.0,2.726911,...,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0,20.62,1965,Australia
1,35.646688,14.974949,0.0,0.0,0.0,0.0,0.0,6.921507,0.0,2.894786,...,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0,19.84,1966,Australia
2,36.493289,14.42953,0.0,0.0,0.0,0.0,0.0,6.778523,0.0,3.053691,...,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0,20.419,1967,Australia
3,35.685333,15.613271,0.0,0.0,0.0,0.0,0.0,6.455487,0.0,3.062603,...,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0,20.349,1968,Australia
4,37.413183,15.686018,0.0,0.0,0.0,0.0,0.0,6.041148,0.0,2.843664,...,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0,20.691,1969,Australia


In [104]:
top1_df = pd.read_csv(path+'WID_Data_national_income_top1p.csv')
top1_df = top1_df[top1_df['Country'].isin(countries)]
top1_df['Country'] = top1_df['Country'].str.replace('USA', 'United States')
print(top1_df.shape)
top1_df.head()

(774, 3)


Unnamed: 0,Country,Year,National_Income_Top_1
0,Australia,1965,0.0634
1,Australia,1966,0.0616
2,Australia,1967,0.0625
3,Australia,1968,0.0605
4,Australia,1969,0.0592


In [105]:
percentages_df[percentages_df['Country'] == 'Switzerland'].tail()

Unnamed: 0,1100,1200,1300,2100,2200,2300,2400,4100,4200,4300,...,5110,5111,5120,5121,5200,5300,6000,Tax_Rev_to_GDP,Year,Country
671,31.601709,10.379119,3.412236,11.589744,11.607678,1.823194,0.0,0.614761,4.19219,0.53002,...,13.296365,13.049745,6.621687,4.92804,3.026632,0.0,0.362718,26.79,2012,Switzerland
672,31.176,10.36452,4.040966,11.527238,11.546216,1.840531,0.0,0.616231,4.33765,0.558048,...,13.311366,13.095329,6.407952,4.745634,2.968127,0.0,0.388368,26.885,2013,Switzerland
673,30.963866,10.370094,4.319554,11.515541,11.534237,1.807071,0.0,0.601149,4.476414,0.669837,...,13.092249,12.888442,6.314921,4.664013,2.989684,0.0,0.446842,26.966,2014,Switzerland
674,31.126456,10.829495,4.516204,11.467182,11.376797,1.741586,0.0,0.641606,4.510993,0.591205,...,12.664949,12.444752,6.101763,4.509046,2.981809,0.0,0.441203,27.677,2015,Switzerland
675,31.224823,10.832018,4.505074,11.45312,11.362845,1.73945,0.0,0.643268,4.534758,0.595738,...,12.601942,12.38284,6.075855,4.486614,2.984368,0.0,0.442296,27.826,2016,Switzerland


In [106]:
data_df = top1_df.merge(percentages_df, on=['Country', 'Year'])
data_df.drop_duplicates(inplace=True)
data_df.reset_index(drop=True, inplace=True)
print(data_df.shape)
data_df.head()

(774, 25)


Unnamed: 0,Country,Year,National_Income_Top_1,1100,1200,1300,2100,2200,2300,2400,...,4600,5100,5110,5111,5120,5121,5200,5300,6000,Tax_Rev_to_GDP
0,Australia,1965,0.0634,34.414809,16.281847,0.0,0.0,0.0,0.0,0.0,...,0.0,30.015924,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0,20.62
1,Australia,1966,0.0616,35.646688,14.974949,0.0,0.0,0.0,0.0,0.0,...,0.0,29.708666,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0,19.84
2,Australia,1967,0.0625,36.493289,14.42953,0.0,0.0,0.0,0.0,0.0,...,0.0,29.312081,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0,20.419
3,Australia,1968,0.0605,35.685333,15.613271,0.0,0.0,0.0,0.0,0.0,...,0.0,28.869539,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0,20.349
4,Australia,1969,0.0592,37.413183,15.686018,0.0,0.0,0.0,0.0,0.0,...,0.0,27.781418,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0,20.691


In [107]:
cols = list(data_df.columns)
cols = cols[:3] + [cols[-1]] + cols[3:-1]
print(cols)

['Country', 'Year', 'National_Income_Top_1', 'Tax_Rev_to_GDP', '1100', '1200', '1300', '2100', '2200', '2300', '2400', '4100', '4200', '4300', '4400', '4500', '4600', '5100', '5110', '5111', '5120', '5121', '5200', '5300', '6000']


In [108]:
data_df = data_df[cols]
data_df.rename(columns = {'National_Income_Top_1' : 'Top_1'}, inplace=True)
data_df.head()

Unnamed: 0,Country,Year,Top_1,Tax_Rev_to_GDP,1100,1200,1300,2100,2200,2300,...,4500,4600,5100,5110,5111,5120,5121,5200,5300,6000
0,Australia,1965,0.0634,20.62,34.414809,16.281847,0.0,0.0,0.0,0.0,...,0.0,0.0,30.015924,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0
1,Australia,1966,0.0616,19.84,35.646688,14.974949,0.0,0.0,0.0,0.0,...,0.0,0.0,29.708666,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0
2,Australia,1967,0.0625,20.419,36.493289,14.42953,0.0,0.0,0.0,0.0,...,0.0,0.0,29.312081,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0
3,Australia,1968,0.0605,20.349,35.685333,15.613271,0.0,0.0,0.0,0.0,...,0.0,0.0,28.869539,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0
4,Australia,1969,0.0592,20.691,37.413183,15.686018,0.0,0.0,0.0,0.0,...,0.0,0.0,27.781418,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0


### Add Tax Breakdown by Society/Individual

In [109]:
indiv_df = pd.read_csv(path + 'Individual_Tax.csv')
print(indiv_df.shape)
indiv_df.head()

(1403, 8)


Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,CHL,GGEXPDEST,INDIVIDUAL,PC_GDP,A,1996,4.728957,E
1,CHL,GGEXPDEST,INDIVIDUAL,PC_GDP,A,1997,4.772787,E
2,CHL,GGEXPDEST,INDIVIDUAL,PC_GDP,A,1998,4.946377,E
3,CHL,GGEXPDEST,INDIVIDUAL,PC_GDP,A,1999,5.340628,E
4,CHL,GGEXPDEST,INDIVIDUAL,PC_GDP,A,2000,5.468381,E


In [110]:
indiv_df.MEASURE.unique()

array(['PC_GDP'], dtype=object)

In [111]:
indiv_df.LOCATION.unique()

array(['CHL', 'IRL', 'NOR', 'TUR', 'KOR', 'FIN', 'BEL', 'SWE', 'ESP',
       'ISR', 'IND', 'LUX', 'FRA', 'NZL', 'HUN', 'EST', 'DNK', 'AUT',
       'RUS', 'JPN', 'ISL', 'PRT', 'SVN', 'NLD', 'DEU', 'GRC', 'CZE',
       'EA', 'USA', 'EU', 'ITA', 'POL', 'MEX', 'SVK', 'ZAF', 'GBR', 'CHE',
       'AUS', 'CAN', 'BRA', 'IDN', 'LVA', 'OAVG', 'COL', 'CRI', 'LTU'],
      dtype=object)

In [112]:
print(countries)

['Australia', 'Canada', 'Denmark', 'France', 'Germany', 'Ireland', 'Italy', 'Japan', 'New Zealand', 'Norway', 'Spain', 'Sweden', 'Switzerland', 'United States', 'USA', 'United Kingdom']


In [113]:
country_dict = ({'AUS': 'Australia', 'CAN': 'Canada', 
                'CHE': 'Switzerland', 'DEU': 'Germany',
                'DNK': 'Denmark', 'ESP': 'Spain', 'FRA': 'France',
                'GBR': 'United Kingdom', 'IRL': 'Ireland',
                'ITA': 'Italy', 'JPN': 'Japan', 'NOR': 'Norway',
                'NZL': 'New Zealand', 'SWE': 'Sweden', 'USA': 'United States'})
    

In [114]:
indiv_df_smooth = pd.DataFrame()
for k,v in country_dict.items():
    temp = pd.DataFrame()
    temp['Individual_Spending_Rate'] = (indiv_df[indiv_df['LOCATION'] == k]
                                  .Value)
    temp['Year'] = indiv_df[indiv_df['LOCATION'] == k].TIME

    temp['Country'] = pd.DataFrame([v]*len(temp), index=temp.index)
    indiv_df_smooth = indiv_df_smooth.append(temp)
indiv_df_smooth.reset_index(drop=True, inplace=True)

In [115]:
print(indiv_df_smooth.shape)
indiv_df_smooth.head()

(558, 3)


Unnamed: 0,Individual_Spending_Rate,Year,Country
0,6.292179,1970,Australia
1,6.763654,1971,Australia
2,7.059415,1972,Australia
3,7.162653,1973,Australia
4,8.308851,1974,Australia


In [116]:
data_df.shape

(774, 25)

In [117]:
data_df = data_df.merge(indiv_df_smooth, on=['Country', 'Year'], how='outer')
data_df.drop_duplicates(inplace=True)
data_df.reset_index(drop=True, inplace=True)
print(data_df.shape)
data_df.head()

(790, 26)


Unnamed: 0,Country,Year,Top_1,Tax_Rev_to_GDP,1100,1200,1300,2100,2200,2300,...,4600,5100,5110,5111,5120,5121,5200,5300,6000,Individual_Spending_Rate
0,Australia,1965,0.0634,20.62,34.414809,16.281847,0.0,0.0,0.0,0.0,...,0.0,30.015924,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0,
1,Australia,1966,0.0616,19.84,35.646688,14.974949,0.0,0.0,0.0,0.0,...,0.0,29.708666,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0,
2,Australia,1967,0.0625,20.419,36.493289,14.42953,0.0,0.0,0.0,0.0,...,0.0,29.312081,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0,
3,Australia,1968,0.0605,20.349,35.685333,15.613271,0.0,0.0,0.0,0.0,...,0.0,28.869539,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0,
4,Australia,1969,0.0592,20.691,37.413183,15.686018,0.0,0.0,0.0,0.0,...,0.0,27.781418,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0,


In [118]:
soc_df = pd.read_csv(path + 'Society_Tax.csv')
print(soc_df.shape)
soc_df.head()

(1403, 8)


Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,HUN,GGEXPDEST,SOCIETY,PC_GDP,A,1991,10.423786,E
1,HUN,GGEXPDEST,SOCIETY,PC_GDP,A,1992,11.252675,E
2,HUN,GGEXPDEST,SOCIETY,PC_GDP,A,1993,13.630537,E
3,HUN,GGEXPDEST,SOCIETY,PC_GDP,A,1994,11.88517,E
4,HUN,GGEXPDEST,SOCIETY,PC_GDP,A,1995,10.82856,


In [119]:
soc_df_smooth = pd.DataFrame()
for k,v in country_dict.items():
    temp = pd.DataFrame()
    temp['Society_Spending_Rate'] = (soc_df[indiv_df['LOCATION'] == k]
                                  .Value)

    t_years = soc_df[soc_df['LOCATION'] == k].TIME
    temp['Year'] = pd.DataFrame(list(t_years), index=temp.index)

    temp['Country'] = pd.DataFrame([v]*len(temp), index=temp.index)
    soc_df_smooth = soc_df_smooth.append(temp)
soc_df_smooth.reset_index(drop=True, inplace=True)

In [120]:
print(soc_df_smooth.shape)
soc_df_smooth.head()

(558, 3)


Unnamed: 0,Society_Spending_Rate,Year,Country
0,10.908933,1970,Australia
1,10.941755,1971,Australia
2,10.786707,1972,Australia
3,10.74925,1973,Australia
4,10.624467,1974,Australia


In [121]:
data_df = data_df.merge(soc_df_smooth, on=['Country', 'Year'], how='outer')
data_df.drop_duplicates(inplace=True)
data_df.reset_index(drop=True, inplace=True)
print(data_df.shape)
data_df.head(10)

(790, 27)


Unnamed: 0,Country,Year,Top_1,Tax_Rev_to_GDP,1100,1200,1300,2100,2200,2300,...,5100,5110,5111,5120,5121,5200,5300,6000,Individual_Spending_Rate,Society_Spending_Rate
0,Australia,1965,0.0634,20.62,34.414809,16.281847,0.0,0.0,0.0,0.0,...,30.015924,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0,,
1,Australia,1966,0.0616,19.84,35.646688,14.974949,0.0,0.0,0.0,0.0,...,29.708666,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0,,
2,Australia,1967,0.0625,20.419,36.493289,14.42953,0.0,0.0,0.0,0.0,...,29.312081,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0,,
3,Australia,1968,0.0605,20.349,35.685333,15.613271,0.0,0.0,0.0,0.0,...,28.869539,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0,,
4,Australia,1969,0.0592,20.691,37.413183,15.686018,0.0,0.0,0.0,0.0,...,27.781418,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0,,
5,Australia,1970,0.0558,21.083,37.326593,16.976252,0.0,0.0,0.0,0.0,...,27.792147,7.441806,0.0,20.350341,12.802727,4.173525,0.0,0.0,6.292179,10.908933
6,Australia,1971,0.0559,21.835,38.734568,15.792181,0.0,0.0,0.0,0.0,...,26.882716,7.006173,0.0,19.876543,12.921811,4.269547,0.0,0.0,6.763654,10.941755
7,Australia,1972,0.0574,21.394,38.347418,15.342723,0.0,0.0,0.0,0.0,...,26.619718,7.183099,0.0,19.43662,12.384977,4.431925,0.0,0.0,7.059415,10.786707
8,Australia,1973,0.0541,22.479,40.449853,14.992625,0.0,0.0,0.0,0.0,...,25.811209,7.146018,0.0,18.665192,11.998525,3.768437,0.0,0.0,7.162653,10.74925
9,Australia,1974,0.0498,24.623,43.973533,13.958131,0.0,0.0,0.0,0.0,...,24.351149,6.582625,0.0,17.768524,10.484285,3.325538,0.0,0.0,8.308851,10.624467


### Add Income Tax Rates

In [122]:
top_rate_df = pd.read_csv(path + 'oecd_toprate.csv')
print(top_rate_df.shape)
top_rate_df.head()

(35, 18)


Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Australia,48.5%,48.5%,48.5%,48.5%,48.5%,48.5%,48.5%,46.5%,46.5%,46.5%,46.5%,46.5%,47.5%,46.5%,46.5%,49.0%,49.0%
1,Austria,42.6%,42.7%,42.9%,42.9%,43.9%,43.7%,43.7%,43.7%,43.7%,43.7%,43.7%,43.7%,43.7%,50.0%,50.0%,50.0%,55.0%
2,Belgium,52.6%,52.1%,47.4%,45.1%,45.1%,45.1%,45.1%,45.3%,45.3%,45.3%,45.3%,45.3%,45.3%,45.3%,45.3%,45.4%,46.3%
3,Canada,47.9%,46.4%,46.4%,46.4%,46.4%,46.4%,46.4%,46.4%,46.4%,46.4%,46.4%,46.4%,48.0%,49.5%,49.5%,49.5%,53.5%
4,Chile,45.0%,45.0%,43.0%,40.0%,40.0%,40.0%,40.0%,40.0%,40.0%,40.0%,40.0%,39.7%,39.5%,39.5%,39.5%,40.0%,40.0%


In [123]:
for col in top_rate_df.columns[1:]:
    top_rate_df.loc[:, col] = (top_rate_df.loc[:,col].str.strip('%')
                               .astype(float))

In [124]:
top_rate_df.head()

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,Australia,48.5,48.5,48.5,48.5,48.5,48.5,48.5,46.5,46.5,46.5,46.5,46.5,47.5,46.5,46.5,49.0,49.0
1,Austria,42.6,42.7,42.9,42.9,43.9,43.7,43.7,43.7,43.7,43.7,43.7,43.7,43.7,50.0,50.0,50.0,55.0
2,Belgium,52.6,52.1,47.4,45.1,45.1,45.1,45.1,45.3,45.3,45.3,45.3,45.3,45.3,45.3,45.3,45.4,46.3
3,Canada,47.9,46.4,46.4,46.4,46.4,46.4,46.4,46.4,46.4,46.4,46.4,46.4,48.0,49.5,49.5,49.5,53.5
4,Chile,45.0,45.0,43.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,39.7,39.5,39.5,39.5,40.0,40.0


In [125]:
top_rate_df.set_index('Country', inplace=True)
top_rate_df.head()

Unnamed: 0_level_0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Australia,48.5,48.5,48.5,48.5,48.5,48.5,48.5,46.5,46.5,46.5,46.5,46.5,47.5,46.5,46.5,49.0,49.0
Austria,42.6,42.7,42.9,42.9,43.9,43.7,43.7,43.7,43.7,43.7,43.7,43.7,43.7,50.0,50.0,50.0,55.0
Belgium,52.6,52.1,47.4,45.1,45.1,45.1,45.1,45.3,45.3,45.3,45.3,45.3,45.3,45.3,45.3,45.4,46.3
Canada,47.9,46.4,46.4,46.4,46.4,46.4,46.4,46.4,46.4,46.4,46.4,46.4,48.0,49.5,49.5,49.5,53.5
Chile,45.0,45.0,43.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,39.7,39.5,39.5,39.5,40.0,40.0


In [126]:
rate_years = top_rate_df.columns.astype(int)
print(rate_years)

Int64Index([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016],
           dtype='int64')


In [131]:
rate_years

Int64Index([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
            2011, 2012, 2013, 2014, 2015, 2016],
           dtype='int64')

In [130]:
top_rate_df.loc['Australia']

2000    48.5
2001    48.5
2002    48.5
2003    48.5
2004    48.5
2005    48.5
2006    48.5
2007    46.5
2008    46.5
2009    46.5
2010    46.5
2011    46.5
2012    47.5
2013    46.5
2014    46.5
2015    49.0
2016    49.0
Name: Australia, dtype: float64

In [142]:
top_rate_temp = pd.DataFrame()
for c in top_rate_df.index:
    if c in countries:
        temp_df = pd.DataFrame()

        temp_df['Year'] = pd.Series(rate_years)
        temp_df['Country'] = pd.Series([c]*len(temp_df), index=temp_df.index)
        temp_df['Top_Rate'] = pd.Series(list(top_rate_df.loc[c]), index=temp_df.index)
        
        top_rate_temp = top_rate_temp.append(temp_df)
top_rate_temp.reset_index(drop=True, inplace=True)

In [143]:
print(top_rate_temp.shape)
top_rate_temp.head()

(255, 3)


Unnamed: 0,Year,Country,Top_Rate
0,2000,Australia,48.5
1,2001,Australia,48.5
2,2002,Australia,48.5
3,2003,Australia,48.5
4,2004,Australia,48.5


In [144]:
top_rate_temp.Country.unique()

array(['Australia', 'Canada', 'Denmark', 'France', 'Germany', 'Ireland',
       'Italy', 'Japan', 'New Zealand', 'Norway', 'Spain', 'Sweden',
       'Switzerland', 'United Kingdom', 'United States'], dtype=object)

In [145]:
data_df = data_df.merge(top_rate_temp, on=['Country', 'Year'], how='outer')
data_df.drop_duplicates(inplace=True)
data_df.reset_index(drop=True, inplace=True)
print(data_df.shape)
data_df.head()

(790, 28)


Unnamed: 0,Country,Year,Top_1,Tax_Rev_to_GDP,1100,1200,1300,2100,2200,2300,...,5110,5111,5120,5121,5200,5300,6000,Individual_Spending_Rate,Society_Spending_Rate,Top_Rate
0,Australia,1965,0.0634,20.62,34.414809,16.281847,0.0,0.0,0.0,0.0,...,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0,,,
1,Australia,1966,0.0616,19.84,35.646688,14.974949,0.0,0.0,0.0,0.0,...,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0,,,
2,Australia,1967,0.0625,20.419,36.493289,14.42953,0.0,0.0,0.0,0.0,...,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0,,,
3,Australia,1968,0.0605,20.349,35.685333,15.613271,0.0,0.0,0.0,0.0,...,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0,,,
4,Australia,1969,0.0592,20.691,37.413183,15.686018,0.0,0.0,0.0,0.0,...,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0,,,


### Add GDP Growth

In [194]:
gdp_df = pd.read_csv(path + 'GDP_growth.csv')

In [195]:
print(gdp_df.shape)
gdp_df.head()

(264, 61)


Unnamed: 0,Country Name,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,Aruba,ABW,GDP growth (annual %),,,,,,,,...,-6.881302,-5.653502,,,,,,,,
1,Afghanistan,AFG,GDP growth (annual %),,,,,,,,...,3.611368,21.020649,8.43329,6.113685,14.434741,3.900575,2.690522,1.31004,2.366712,2.595542
2,Angola,AGO,GDP growth (annual %),,,,,,,,...,13.817098,2.41291,3.452875,3.873331,5.177594,6.842717,4.703971,2.999747,-0.813494,0.721699
3,Albania,ALB,GDP growth (annual %),,,,,,,,...,3.760854,3.35261,3.710058,2.550161,1.419968,1.000755,1.774369,2.218752,3.352159,3.841364
4,Andorra,AND,GDP growth (annual %),,,,,,,,...,-8.590004,-3.690654,-5.358826,-4.646543,-1.615218,0.351645,2.277683,0.842204,1.889124,1.873197


In [196]:
years_str = [str(x) for x in years]
print(years_str)

['1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016']


In [197]:
gdp_df[years_str].head()

Unnamed: 0,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,,,,,,,,,,,...,-3.654626,-6.881302,-5.653502,,,,,,,
1,,,,,,,,,,,...,13.740205,3.611368,21.020649,8.43329,6.113685,14.434741,3.900575,2.690522,1.31004,2.366712
2,,,,,,,,,,,...,23.189597,13.817098,2.41291,3.452875,3.873331,5.177594,6.842717,4.703971,2.999747,-0.813494
3,,,,,,,,,,,...,5.9,3.760854,3.35261,3.710058,2.550161,1.419968,1.000755,1.774369,2.218752,3.352159
4,,,,,,,4.649465,8.149743,7.788467,5.61879,...,0.040011,-8.590004,-3.690654,-5.358826,-4.646543,-1.615218,0.351645,2.277683,0.842204,1.889124


In [198]:
gdp_df.set_index('Country Name', inplace=True)
gdp_df.drop(['Country Code', 'Indicator Name'], axis=1, inplace=True)
gdp_df = gdp_df[years_str]
gdp_df.head(2)

Unnamed: 0_level_0,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aruba,,,,,,,,,,,...,-3.654626,-6.881302,-5.653502,,,,,,,
Afghanistan,,,,,,,,,,,...,13.740205,3.611368,21.020649,8.43329,6.113685,14.434741,3.900575,2.690522,1.31004,2.366712


In [199]:
gdp_smooth_df = pd.DataFrame()
for c in gdp_df.index:
    if c in countries:
        temp_df = pd.DataFrame()

        temp_df['Year'] = pd.Series(years)
        temp_df['Country'] = pd.Series([c]*len(temp_df), index=temp_df.index)
        temp_df['GDP_growth'] = pd.Series(list(gdp_df.loc[c]), index=temp_df.index)
        
        gdp_smooth_df = gdp_smooth_df.append(temp_df)
gdp_smooth_df.reset_index(drop=True, inplace=True)

Australia
Canada
Switzerland
Germany
Denmark
Spain
France
United Kingdom
Ireland
Italy
Japan
Norway
New Zealand
Sweden
United States


In [200]:
gdp_smooth_df.head()

Unnamed: 0,Year,Country,GDP_growth
0,1965,Australia,5.98345
1,1966,Australia,2.382491
2,1967,Australia,6.302386
3,1968,Australia,5.095868
4,1969,Australia,7.044159


In [201]:
data_df = data_df.merge(gdp_smooth_df, on=['Country', 'Year'], how='outer')
data_df.drop_duplicates(inplace=True)
data_df.reset_index(drop=True, inplace=True)
print(data_df.shape)
data_df.head()

(780, 29)


Unnamed: 0,Country,Year,Top_1,Tax_Rev_to_GDP,Individual_Spending_Rate,Society_Spending_Rate,Top_Rate,1100,1200,1300,...,4600,5100,5110,5111,5120,5121,5200,5300,6000,GDP_growth
0,Australia,1965,0.0634,20.62,,,,34.414809,16.281847,0.0,...,0.0,30.015924,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0,5.98345
1,Australia,1966,0.0616,19.84,,,,35.646688,14.974949,0.0,...,0.0,29.708666,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0,2.382491
2,Australia,1967,0.0625,20.419,,,,36.493289,14.42953,0.0,...,0.0,29.312081,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0,6.302386
3,Australia,1968,0.0605,20.349,,,,35.685333,15.613271,0.0,...,0.0,28.869539,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0,5.095868
4,Australia,1969,0.0592,20.691,,,,37.413183,15.686018,0.0,...,0.0,27.781418,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0,7.044159


### Reorganize Columns

In [146]:
data_df.columns

Index(['Country', 'Year', 'Top_1', 'Tax_Rev_to_GDP', '1100', '1200', '1300',
       '2100', '2200', '2300', '2400', '4100', '4200', '4300', '4400', '4500',
       '4600', '5100', '5110', '5111', '5120', '5121', '5200', '5300', '6000',
       'Individual_Spending_Rate', 'Society_Spending_Rate', 'Top_Rate'],
      dtype='object')

In [147]:
'''Not going to work after adding GDP!'''
cols = list(data_df.columns)
print(len(cols))
cols = cols[:4] + cols[-3:] + cols[4:-3]
print(len(cols))
print(cols)

28
28
['Country', 'Year', 'Top_1', 'Tax_Rev_to_GDP', 'Individual_Spending_Rate', 'Society_Spending_Rate', 'Top_Rate', '1100', '1200', '1300', '2100', '2200', '2300', '2400', '4100', '4200', '4300', '4400', '4500', '4600', '5100', '5110', '5111', '5120', '5121', '5200', '5300', '6000']


In [148]:
data_df = data_df[cols]
data_df.head()

Unnamed: 0,Country,Year,Top_1,Tax_Rev_to_GDP,Individual_Spending_Rate,Society_Spending_Rate,Top_Rate,1100,1200,1300,...,4500,4600,5100,5110,5111,5120,5121,5200,5300,6000
0,Australia,1965,0.0634,20.62,,,,34.414809,16.281847,0.0,...,0.0,0.0,30.015924,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0
1,Australia,1966,0.0616,19.84,,,,35.646688,14.974949,0.0,...,0.0,0.0,29.708666,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0
2,Australia,1967,0.0625,20.419,,,,36.493289,14.42953,0.0,...,0.0,0.0,29.312081,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0
3,Australia,1968,0.0605,20.349,,,,35.685333,15.613271,0.0,...,0.0,0.0,28.869539,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0
4,Australia,1969,0.0592,20.691,,,,37.413183,15.686018,0.0,...,0.0,0.0,27.781418,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0


In [149]:
#Need to drop year 2017, does not have target data
data_df = data_df[data_df['Year'] != 2017]
print(data_df.shape)
data_df.head()

(780, 28)


Unnamed: 0,Country,Year,Top_1,Tax_Rev_to_GDP,Individual_Spending_Rate,Society_Spending_Rate,Top_Rate,1100,1200,1300,...,4500,4600,5100,5110,5111,5120,5121,5200,5300,6000
0,Australia,1965,0.0634,20.62,,,,34.414809,16.281847,0.0,...,0.0,0.0,30.015924,7.36465,0.0,22.651274,15.545382,4.697452,0.0,0.0
1,Australia,1966,0.0616,19.84,,,,35.646688,14.974949,0.0,...,0.0,0.0,29.708666,7.069957,0.0,22.638708,15.661533,4.768974,0.0,0.0
2,Australia,1967,0.0625,20.419,,,,36.493289,14.42953,0.0,...,0.0,0.0,29.312081,6.996644,0.0,22.315436,14.983221,4.731544,0.0,0.0
3,Australia,1968,0.0605,20.349,,,,35.685333,15.613271,0.0,...,0.0,0.0,28.869539,7.416304,0.0,21.453235,14.157033,4.698994,0.0,0.0
4,Australia,1969,0.0592,20.691,,,,37.413183,15.686018,0.0,...,0.0,0.0,27.781418,7.456428,0.0,20.32499,12.829249,4.403093,0.0,0.0


### Removing Negative Values  ???

In [153]:
data_df[data_df['1300'] < 0]

Unnamed: 0,Country,Year,Top_1,Tax_Rev_to_GDP,Individual_Spending_Rate,Society_Spending_Rate,Top_Rate,1100,1200,1300,...,4500,4600,5100,5110,5111,5120,5121,5200,5300,6000
325,Italy,1984,0.0654,32.999,,,,26.18548,9.826471,-0.072715,...,0.0,0.0,24.628815,15.087278,15.087278,9.547464,6.206109,0.777345,0.811331,0.0
327,Italy,1986,0.0713,34.018,,,,27.767514,10.590576,-2.177124,...,0.0,0.0,24.89465,14.65318,14.65318,10.271438,6.851126,1.355334,-0.170537,0.0
328,Italy,1987,0.0745,34.109,,,,26.254682,10.548404,-0.672619,...,0.0,0.0,25.020984,14.630595,14.630595,10.349548,7.14841,1.164127,0.252373,0.0
329,Italy,1988,0.076,34.649,10.819965,8.093221,,26.800775,9.366673,-0.499785,...,0.0,0.0,25.744064,15.174214,15.174214,10.53392,7.223059,1.648319,0.643259,0.0
340,Italy,1999,0.0882,40.906,10.044019,7.273702,,26.373989,7.697504,-0.102886,...,0.000108,0.0,24.97172,13.690544,13.690544,11.243899,6.636322,1.17732,1.381153,5.126857
343,Italy,2002,0.0928,39.751,10.822383,6.796448,41.4,25.450882,7.539998,-0.530878,...,0.426198,0.210295,24.016614,15.025712,15.025712,8.990902,5.701142,1.81134,1.129611,5.819468
344,Italy,2003,0.0936,40.068,10.898034,6.477163,41.4,25.171735,6.640451,-0.858721,...,3.155821,0.252143,23.144365,14.195184,14.195184,8.944156,5.865682,1.530264,1.03872,5.797846
345,Italy,2004,0.0928,39.332,10.992412,6.6189,41.4,25.454526,6.881387,-0.882792,...,1.36307,0.217143,23.84398,14.309161,14.309161,9.534818,5.595169,1.521407,1.046044,5.56287
722,United Kingdom,1965,0.0855,30.097,,,,33.05619,4.422111,-0.475103,...,0.0,0.0,31.119233,5.911375,0.0,25.207857,21.772499,1.936958,0.009137,0.0
723,United Kingdom,1966,0.0792,30.808,,,,37.276076,1.180311,-0.962665,...,0.0,-0.150678,30.077013,5.742508,0.0,24.334505,21.337686,1.623975,-0.058597,0.0


### Add Income Tax Rate to USA

In [None]:
tax_usa_df = pd.read_csv(path + 'top_tax_rates_USA.csv')
print(tax_usa_df.shape)
tax_usa_df.head()

In [None]:
tax_usa_df['Country'] = tax_usa_df.loc[:,'Country'].str.replace(
                                                  'USA', 'United States')
tax_usa_df.head()

In [None]:
tax_usa_df['Top_Marginal_Rate'] = (tax_usa_df.loc[:,'Top_Marginal_Rate']
                                   .str.replace('%', '')
                                   .astype(float))
tax_usa_df.head(2)

In [None]:
usa_df = data_df[data_df['Country'] == 'United States']
usa_df.head()

In [None]:
usa_df = usa_df.merge(tax_usa_df.drop('Country', axis=1), on='Year')
print(usa_df.shape)
usa_df.head()

### Add Total GDP

In [None]:
gdp_df = pd.read_csv('Data/GDP.csv')
print(gdp_df.shape)
gdp_df.head()

In [None]:
gdp_df['Year'] = pd.DataFrame([year[0] for year in gdp_df.DATE.str.split('-')]).astype(int)
gdp_df.drop('DATE', axis=1, inplace=True)
gdp_df.head()

In [None]:
usa_df = usa_df.merge(gdp_df, on='Year')
usa_df.head()

### Add Annual Average of NYSE Composite

In [None]:
nya_df = pd.read_csv('Data/NYA.csv')
print(nya_df.shape)
nya_df.head()

In [None]:
nya_df['Year'] = pd.DataFrame([year[0] for year in nya_df.Date.str.split('-')]).astype(int)
nya_df.head()

In [None]:
nya_df[nya_df['Year'] == 2016]

In [None]:
avg_year = []
for year in nya_df.Year.unique():
    avg_year.append(np.mean([close for close in nya_df[nya_df['Year'] == year]['Adj Close']]))
    
# Verify we have 52 years calculated    
print(len(avg_year))

usa_df['NYA'] = pd.DataFrame(avg_year)
usa_df.head()

In [None]:
#Visualize to verify calculations
plt.plot(years, usa_df.NYA)

### Check if NYSE is a good substitution for Wilshire Cap Index (Measure of market capitalization)

In [None]:
wilsh_df = pd.read_csv('Data/WILSH.csv')
print(wilsh_df.shape)
wilsh_df.head()

In [None]:
wilsh_df['Year'] = pd.DataFrame([year[0] for year in wilsh_df.DATE.str.split('-')]).astype(int)
wilsh_df.drop('DATE', axis=1, inplace=True)
wilsh_df.head()

In [None]:
#Visualize normalized indicies
plt.figure(figsize=(6, 12))

plt.subplot(3, 1, 1)
plt.plot(years, usa_df.NYA )
plt.plot(wilsh_df.Year, wilsh_df.WILL5000PRFC)

#Visualize normalized indicies
plt.subplot(3, 1, 2)
plt.plot(years, usa_df.NYA / max(usa_df.NYA) * 100)
plt.plot(wilsh_df.Year, wilsh_df.WILL5000PRFC / max(wilsh_df.WILL5000PRFC) * 100)

max_year = 1980
nya_temp = usa_df[usa_df['Year'] < max_year].NYA
nya_temp_years = usa_df[usa_df['Year'] < max_year].Year
wilsh_temp = wilsh_df[wilsh_df['Year'] < max_year].WILL5000PRFC
wilsh_temp_years = wilsh_df[wilsh_df['Year'] < max_year].Year

#scale nya to match wilsh and check for correlation
nya_scaled = max(wilsh_temp) / max(nya_temp) * nya_temp


plt.subplot(3, 1, 3)
plt.plot(nya_temp_years, nya_temp)
plt.plot(wilsh_temp_years, wilsh_temp)
plt.plot(nya_temp_years, nya_scaled)

In [None]:
int(wilsh_72) 

In [None]:
#Scale NYA to match Wilsh, then create new Column with Wilsh
nya_72 = int(usa_df[usa_df['Year'] == 1972].NYA)
wilsh_72 = int(wilsh_df[wilsh_df['Year'] == 1972].WILL5000PRFC)

nya_72_scaled = usa_df[usa_df['Year'] < 1972].NYA * (wilsh_72 / nya_72)
nya_72_scaled

In [None]:
usa_df[usa_df['Year'] < 1972].NYA

In [None]:
# Create column, total market cap of stock exchanges
usa_df['TMC_SE'] = pd.DataFrame(list(nya_72_scaled) + list(wilsh_df.WILL5000PRFC))

In [None]:
# Verify new column
plt.plot(years, usa_df.TMC_SE )
plt.plot(wilsh_df.Year, wilsh_df.WILL5000PRFC)

In [None]:
usa_df.drop('NYA', axis=1, inplace=True)
usa_df['TMC_to_GDP'] = usa_df['TMC_SE'] / usa_df['GDP']
usa_df.head()

### Create Time Series Rows

In [None]:
usa_df['Top_1_AR1'] = usa_df.Top_1.shift()
usa_df['Top_1_AR2'] = usa_df.Top_1.shift(2)
usa_df.head()

In [None]:
usa_df['Tax_Rev_Change'] = (usa_df['Tax_Rev_to_GDP'] - 
                             usa_df['Tax_Rev_to_GDP'].shift()) / usa_df['Tax_Rev_to_GDP'].shift() *100
usa_df.head()

In [None]:
usa_df['TMC_SE_Change'] = (usa_df['TMC_SE'] - 
                             usa_df['TMC_SE'].shift()) / usa_df['TMC_SE'].shift() *100
usa_df.head()

In [None]:
usa_df['TMC_to_GDP_Change'] = (usa_df['TMC_to_GDP'] - 
                             usa_df['TMC_to_GDP'].shift()) / usa_df['TMC_to_GDP'].shift() *100
usa_df.head()

### Reorganize Columns

In [None]:
usa_df.columns

In [None]:

cols = list(usa_df.columns)
'''Not going to work anymore'''
# print(len(cols))
# cols = cols[:4] + cols[-9:] + cols[4:-9]
# print(len(cols))
# print(cols)

In [None]:
usa_df = usa_df[cols]

### Drop Columns That Are 0

In [None]:
for col in usa_df.columns[1:]:
    if usa_df[col].mean() == 0:
        usa_df.drop(col, axis=1, inplace=True)
        

### Trim Columns

In [210]:
cols_taxes = ['1100', '1200', '1300', '2100', '2200',
              '2300', '2400', '4100', '4200', '4300', 
              '4400', '4500', '4600', '5100', '5110', 
              '5111', '5120', '5121', '5200', '5300', 
              '6000']


In [212]:
cols = data_df.columns
cols

Index(['Country', 'Year', 'Top_1', 'Tax_Rev_to_GDP',
       'Individual_Spending_Rate', 'Society_Spending_Rate', 'Top_Rate', '1100',
       '1200', '1300', '2100', '2200', '2300', '2400', '4100', '4200', '4300',
       '4400', '4500', '4600', '5100', '5110', '5111', '5120', '5121', '5200',
       '5300', '6000', 'GDP_growth'],
      dtype='object')

In [None]:
cols_trimmed = cols[:7] + cols_taxes + 

### Finished, store the data

In [203]:
data_df = data_df[data_df['Year'] != 2017]

In [204]:
data_df.to_csv('cleaned_data.csv', index=False)


In [None]:
usa_df.to_csv('usa_data.csv', index=False)


In [None]:
usa_df = pd.read_csv('usa_data.csv')


In [None]:
usa_df.head()