In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

In [9]:
%matplotlib inline

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier

# OrdinalEncoder is used below to integer-encode the categorical columns
from sklearn.preprocessing import OrdinalEncoder

In [11]:
# Read the tax data set from 'TaxInfo.csv' (path is resolved relative to the working directory).
tax_df = pd.read_csv(filepath_or_buffer='TaxInfo.csv')

In [12]:
# Preview the first rows to sanity-check the column names and the kind of values they hold.
tax_df.head()

Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed_2017,Filed_2016,Filed_2015,PoliticalParty
0,49685,227187,0,0,105,0,1,1,1,Democrat
1,64756,-507342,2,3,68,3,1,0,0,Independent
2,115435,521290,1,3,81,2,0,1,0,Republican
3,99454,251829,2,1,52,4,1,0,0,Republican
4,157274,-472337,0,1,28,1,1,0,1,Independent


In [13]:

# Marital status and the yearly filing flags are stored as small integer codes.
# Cast them to the pandas category dtype so the dtype-based column selection
# further down picks them up as categorical features instead of numeric ones.
for flag_col in ('Married', 'Filed_2017', 'Filed_2016', 'Filed_2015'):
    tax_df[flag_col] = tax_df[flag_col].astype('category')


In [14]:
# Target for this model is the PoliticalParty column; every other column is a predictor.
outcome = 'PoliticalParty'
predictors = tax_df.columns.drop(outcome).tolist()

X = tax_df.loc[:, predictors]
Y = tax_df.loc[:, outcome]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, Y, test_size=0.2, random_state=21
)

# Predictor names grouped by dtype, used to route columns through the preprocessor.
categorical_cols = list(train_X.select_dtypes(include='category').columns)
numeric_cols = list(train_X.select_dtypes(include='number').columns)

In [15]:
# Preprocessing for the histogram gradient booster:
#   * numeric columns are standardised,
#   * categorical columns are ordinal-encoded; a category that was not seen
#     while fitting is encoded as NaN.
# Only the predictors are transformed here; the target is passed to the
# classifier as-is.
numeric_transformer_hgbc = StandardScaler()
categorical_transformer_hgbc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

preprocessor_hgbc = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer_hgbc, categorical_cols),
        ('num', numeric_transformer_hgbc, numeric_cols)],
    remainder='passthrough')

# The preprocessor reorders the columns into blocks, in the order the
# transformers are listed, so the categorical columns are now the first
# columns. The classifier needs their positional indices to know which columns
# to treat as categoricals (even though they have been transformed into
# arbitrary integer codes).
categorical_cols_idx = list(range(len(categorical_cols)))

# Append classifier to preprocessing pipeline.
clf_hgbc = Pipeline(steps=[('preprocessor', preprocessor_hgbc),
                      ('classifier', HistGradientBoostingClassifier(categorical_features=categorical_cols_idx))])

# Fit model
clf_hgbc.fit(train_X, train_y)

# Score each data set once and reuse the values; scoring runs a full predict
# pass, so there is no reason to repeat it for the second training-score line.
train_score = clf_hgbc.score(train_X, train_y)
valid_score = clf_hgbc.score(valid_X, valid_y)

print(f"hgbc training score: {train_score:.3f}")

print(f"Training score: {train_score:.3f}")
# The "Test score" label reports accuracy on the held-out validation split.
print(f"Test score: {valid_score:.3f}")

hgbc training score: 1.000
Training score: 1.000
Test score: 0.418


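The gradient booster does improve our model's performance against the validation data; however, the model is still not a reliable predictor: it scores 1.000 on the training data but only 0.418 on the validation data, which points to overfitting.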

In [16]:
# PoliticalParty is used as a predictor in the models below, so give it the
# category dtype that the categorical column selection looks for.
tax_df['PoliticalParty'] = tax_df['PoliticalParty'].astype('category')

In [17]:
# Check the resulting dtypes: the columns cast so far should now report 'category'.
tax_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004 entries, 0 to 1003
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   HHI             1004 non-null   int64   
 1   HHDL            1004 non-null   int64   
 2   Married         1004 non-null   category
 3   CollegGrads     1004 non-null   int64   
 4   AHHAge          1004 non-null   int64   
 5   Cars            1004 non-null   int64   
 6   Filed_2017      1004 non-null   category
 7   Filed_2016      1004 non-null   category
 8   Filed_2015      1004 non-null   category
 9   PoliticalParty  1004 non-null   category
dtypes: category(5), int64(5)
memory usage: 44.9 KB


#### Gradient Booster - Filed 2015

In [18]:
# Target for this model is the Filed_2015 flag; every other column is a predictor.
outcome = 'Filed_2015'
predictors = tax_df.columns.drop(outcome).tolist()

X = tax_df.loc[:, predictors]
Y = tax_df.loc[:, outcome]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, Y, test_size=0.2, random_state=21
)

# Predictor names grouped by dtype, used to route columns through the preprocessor.
categorical_cols = list(train_X.select_dtypes(include='category').columns)
numeric_cols = list(train_X.select_dtypes(include='number').columns)

In [19]:
# Feature preprocessing: standardise the numeric columns and ordinal-encode the
# categorical ones.
numeric_transformer_hgbc = StandardScaler()
categorical_transformer_hgbc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)
# NOTE(review): this second encoder is created but not referenced again in this cell.
target_transformer_hgbc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)

column_steps = [
    ('cat', categorical_transformer_hgbc, categorical_cols),
    ('num', numeric_transformer_hgbc, numeric_cols),
]
preprocessor_hgbc = ColumnTransformer(transformers=column_steps, remainder='passthrough')

# The preprocessor emits its output in blocks, in the order the transformers
# are listed, so the encoded categorical columns come first. Their positional
# indices tell the classifier which columns to treat as categoricals.
categorical_cols_idx = list(range(len(categorical_cols)))

# Chain preprocessing and the gradient booster into a single estimator.
booster = HistGradientBoostingClassifier(categorical_features=categorical_cols_idx)
clf_hgbc = Pipeline(steps=[('preprocessor', preprocessor_hgbc), ('classifier', booster)])

# Fit on the training split only.
clf_hgbc.fit(train_X, train_y)

train_accuracy = clf_hgbc.score(train_X, train_y)
print(f"Training score: {train_accuracy:.3f}")
# The "Test score" label reports accuracy on the held-out validation split.
valid_accuracy = clf_hgbc.score(valid_X, valid_y)
print(f"Test score: {valid_accuracy:.3f}")

Training score: 0.998
Test score: 0.463


#### Gradient Booster - Filed 2016

In [20]:
# Target for this model is the Filed_2016 flag; every other column is a predictor.
outcome = 'Filed_2016'
predictors = tax_df.columns.drop(outcome).tolist()

X = tax_df.loc[:, predictors]
Y = tax_df.loc[:, outcome]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, Y, test_size=0.2, random_state=21
)

# Predictor names grouped by dtype, used to route columns through the preprocessor.
categorical_cols = list(train_X.select_dtypes(include='category').columns)
numeric_cols = list(train_X.select_dtypes(include='number').columns)

In [21]:
# Feature preprocessing: standardise the numeric columns and ordinal-encode the
# categorical ones.
numeric_transformer_hgbc = StandardScaler()
categorical_transformer_hgbc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)
# NOTE(review): this second encoder is created but not referenced again in this cell.
target_transformer_hgbc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)

column_steps = [
    ('cat', categorical_transformer_hgbc, categorical_cols),
    ('num', numeric_transformer_hgbc, numeric_cols),
]
preprocessor_hgbc = ColumnTransformer(transformers=column_steps, remainder='passthrough')

# The preprocessor emits its output in blocks, in the order the transformers
# are listed, so the encoded categorical columns come first. Their positional
# indices tell the classifier which columns to treat as categoricals.
categorical_cols_idx = list(range(len(categorical_cols)))

# Chain preprocessing and the gradient booster into a single estimator.
booster = HistGradientBoostingClassifier(categorical_features=categorical_cols_idx)
clf_hgbc1 = Pipeline(steps=[('preprocessor', preprocessor_hgbc), ('classifier', booster)])

# Fit on the training split only.
clf_hgbc1.fit(train_X, train_y)

train_accuracy = clf_hgbc1.score(train_X, train_y)
print(f"Training score: {train_accuracy:.3f}")
# The "Test score" label reports accuracy on the held-out validation split.
valid_accuracy = clf_hgbc1.score(valid_X, valid_y)
print(f"Test score: {valid_accuracy:.3f}")

Training score: 0.998
Test score: 0.502


#### Gradient Booster - Filed 2017

In [22]:
# Target for this model is the Filed_2017 flag; every other column is a predictor.
outcome = 'Filed_2017'
predictors = tax_df.columns.drop(outcome).tolist()

X = tax_df.loc[:, predictors]
Y = tax_df.loc[:, outcome]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, Y, test_size=0.2, random_state=21
)

# Predictor names grouped by dtype, used to route columns through the preprocessor.
categorical_cols = list(train_X.select_dtypes(include='category').columns)
numeric_cols = list(train_X.select_dtypes(include='number').columns)

In [23]:
# Feature preprocessing: standardise the numeric columns and ordinal-encode the
# categorical ones.
numeric_transformer_hgbc = StandardScaler()
categorical_transformer_hgbc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)
# NOTE(review): this second encoder is created but not referenced again in this cell.
target_transformer_hgbc = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)

column_steps = [
    ('cat', categorical_transformer_hgbc, categorical_cols),
    ('num', numeric_transformer_hgbc, numeric_cols),
]
preprocessor_hgbc = ColumnTransformer(transformers=column_steps, remainder='passthrough')

# The preprocessor emits its output in blocks, in the order the transformers
# are listed, so the encoded categorical columns come first. Their positional
# indices tell the classifier which columns to treat as categoricals.
categorical_cols_idx = list(range(len(categorical_cols)))

# Chain preprocessing and the gradient booster into a single estimator.
booster = HistGradientBoostingClassifier(categorical_features=categorical_cols_idx)
clf_hgbc2 = Pipeline(steps=[('preprocessor', preprocessor_hgbc), ('classifier', booster)])

# Fit on the training split only.
clf_hgbc2.fit(train_X, train_y)

train_accuracy = clf_hgbc2.score(train_X, train_y)
print(f"Training score: {train_accuracy:.3f}")
# The "Test score" label reports accuracy on the held-out validation split.
valid_accuracy = clf_hgbc2.score(valid_X, valid_y)
print(f"Test score: {valid_accuracy:.3f}")

Training score: 0.995
Test score: 0.522


Using gradient boosting classifiers to predict whether an individual filed taxes in a given year gives us better validation accuracy (0.463–0.522) than we achieved when predicting political party (0.418). This could be because the outcome variable has only two levels here, as opposed to the three levels of political party, so the two sets of scores are not directly comparable.