In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

In [2]:
%matplotlib inline

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier

# OrdinalEncoder integer-encodes the categorical feature columns in the pipelines below
from sklearn.preprocessing import OrdinalEncoder

In [51]:
# Load the raw tax data; the path is resolved relative to the kernel's working directory.
tax_df = pd.read_csv(filepath_or_buffer='TaxInfo.csv')

In [10]:
# Preview the first five rows of the loaded frame.
tax_df.head(n=5)

Unnamed: 0,HHI,HHDL,Married,CollegGrads,AHHAge,Cars,Filed_2017,Filed_2016,Filed_2015,PoliticalParty
0,49685,227187,0,0,105,0,1,1,1,Democrat
1,64756,-507342,2,3,68,3,1,0,0,Independent
2,115435,521290,1,3,81,2,0,1,0,Republican
3,99454,251829,2,1,52,4,1,0,0,Republican
4,157274,-472337,0,1,28,1,1,0,1,Independent


In [12]:
# Expand 'Married' into one indicator column per observed value
# (Married_0, Married_1, Married_2 for the codes shown in the preview above).
tax_df = pd.get_dummies(data=tax_df, columns=['Married'])

In [13]:
# Cast the indicator / flag columns to pandas 'category' dtype in a single pass so that
# select_dtypes(include=['category']) later identifies them as categorical predictors.
categorical_flags = ['Married_0', 'Married_1', 'Filed_2017', 'Filed_2016', 'Filed_2015']

tax_df = (
    tax_df
    .astype({col: 'category' for col in categorical_flags})
    # Married_2 is dropped so only two of the three Married indicators remain.
    # errors='ignore' keeps this cell re-runnable: a second execution no longer
    # raises KeyError because the column is already gone.
    .drop(columns=['Married_2'], errors='ignore')
)

In [52]:
# Target is the party label; every other column of tax_df is a predictor.
outcome = 'PoliticalParty'
predictors = [name for name in tax_df.columns if name != outcome]

X, Y = tax_df[predictors], tax_df[outcome]

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, Y, test_size=0.2, random_state=21
)

# Partition the predictor names by dtype so each group gets its own transformer.
categorical_cols = list(train_X.select_dtypes(include=['category']).columns)
numeric_cols = list(train_X.select_dtypes(include=['number']).columns)

In [53]:
# Per-dtype preprocessing: standardise numeric columns; integer-encode categorical
# columns, mapping categories unseen during fit to NaN.
numeric_transformer_hgbc = StandardScaler()
categorical_transformer_hgbc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
# NOTE(review): target_transformer_hgbc is not used anywhere in this cell; it is kept only
# in case another cell reads it -- confirm and delete if nothing does.
target_transformer_hgbc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

preprocessor_hgbc = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer_hgbc, categorical_cols),
        ('num', numeric_transformer_hgbc, numeric_cols),
    ],
    remainder='passthrough',
)

# The preprocessor reorders the columns into blocks in the order the transformers are listed,
# so the categorical columns come first in its output. Their positions are therefore
# 0 .. len(categorical_cols) - 1, which is what HistGradientBoostingClassifier needs in
# `categorical_features` to treat them as categoricals (even though they have been
# transformed into meaningless integers).
categorical_cols_idx = list(range(len(categorical_cols)))

# Append classifier to preprocessing pipeline.
clf_hgbc = Pipeline(steps=[
    ('preprocessor', preprocessor_hgbc),
    ('classifier', HistGradientBoostingClassifier(categorical_features=categorical_cols_idx)),
])

# Fit model
clf_hgbc.fit(train_X, train_y)

# Score each split exactly once; the training score is reused for both report lines
# instead of re-running prediction over the whole training set a second time.
train_score = clf_hgbc.score(train_X, train_y)
valid_score = clf_hgbc.score(valid_X, valid_y)
print(f"hgbc training score: {train_score:.3f}")

print(f"Training score: {train_score:.3f}")
# "Test score" is measured on the held-out validation split (valid_X / valid_y).
# NOTE(review): the recorded run shows 1.000 on training vs 0.363 here -- the gap suggests
# the model is overfitting; consider regularisation or early stopping.
print(f"Test score: {valid_score:.3f}")

hgbc training score: 1.000
Training score: 1.000
Test score: 0.363


In [42]:
# One-hot encode the party label into one indicator column per PoliticalParty value;
# tax_df itself is left untouched.
tax2_df = pd.get_dummies(data=tax_df, columns=['PoliticalParty'])

In [54]:
# Store the party label as a pandas categorical so dtype-based column selection
# (select_dtypes(include=['category'])) recognises it.
tax_df['PoliticalParty'] = tax_df['PoliticalParty'].astype('category')

In [55]:
# Inspect column dtypes and non-null counts to confirm the PoliticalParty cast took effect.
tax_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004 entries, 0 to 1003
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   HHI             1004 non-null   int64   
 1   HHDL            1004 non-null   int64   
 2   Married         1004 non-null   int64   
 3   CollegGrads     1004 non-null   int64   
 4   AHHAge          1004 non-null   int64   
 5   Cars            1004 non-null   int64   
 6   Filed_2017      1004 non-null   int64   
 7   Filed_2016      1004 non-null   int64   
 8   Filed_2015      1004 non-null   int64   
 9   PoliticalParty  1004 non-null   category
dtypes: category(1), int64(9)
memory usage: 71.8 KB


In [44]:
# Remove the 'Independent' indicator from the one-hot encoded frame, leaving the other
# party indicator columns in place.
tax_df2 = tax2_df.drop(labels=['PoliticalParty_Independent'], axis='columns')

In [56]:
# Target is the 2015 filing flag; every other column of tax_df (including
# PoliticalParty) is a predictor.
outcome = 'Filed_2015'
predictors = [name for name in tax_df.columns if name != outcome]

X, Y = tax_df[predictors], tax_df[outcome]

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, Y, test_size=0.2, random_state=21
)

# Partition the predictor names by dtype so each group gets its own transformer.
categorical_cols = list(train_X.select_dtypes(include=['category']).columns)
numeric_cols = list(train_X.select_dtypes(include=['number']).columns)

In [57]:
# Per-dtype preprocessing: standardise numeric columns; integer-encode categorical
# columns, mapping categories unseen during fit to NaN.
numeric_transformer_hgbc = StandardScaler()
categorical_transformer_hgbc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
# NOTE(review): target_transformer_hgbc is not used anywhere in this cell; it is kept only
# in case another cell reads it -- confirm and delete if nothing does.
target_transformer_hgbc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

preprocessor_hgbc = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer_hgbc, categorical_cols),
        ('num', numeric_transformer_hgbc, numeric_cols),
    ],
    remainder='passthrough',
)

# The preprocessor reorders the columns into blocks in the order the transformers are listed,
# so the categorical columns come first in its output. Their positions are therefore
# 0 .. len(categorical_cols) - 1, which is what HistGradientBoostingClassifier needs in
# `categorical_features` to treat them as categoricals (even though they have been
# transformed into meaningless integers).
categorical_cols_idx = list(range(len(categorical_cols)))

# Append classifier to preprocessing pipeline.
clf_hgbc = Pipeline(steps=[
    ('preprocessor', preprocessor_hgbc),
    ('classifier', HistGradientBoostingClassifier(categorical_features=categorical_cols_idx)),
])

# Fit model
clf_hgbc.fit(train_X, train_y)

# Score each split exactly once; the training score is reused for both report lines
# instead of re-running prediction over the whole training set a second time.
train_score = clf_hgbc.score(train_X, train_y)
valid_score = clf_hgbc.score(valid_X, valid_y)
print(f"hgbc training score: {train_score:.3f}")

print(f"Training score: {train_score:.3f}")
# "Test score" is measured on the held-out validation split (valid_X / valid_y).
# NOTE(review): the recorded run shows 0.996 on training vs 0.458 here -- the gap suggests
# the model is overfitting; consider regularisation or early stopping.
print(f"Test score: {valid_score:.3f}")

hgbc training score: 0.996
Training score: 0.996
Test score: 0.458
