## Homework 1 Final Models (Income Prediction)
Adam Kiehl  
4/10/2023

### Setup

In [32]:
# analysis packages
import keras
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout
from keras import models
from keras.regularizers import l2
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
from scipy import stats
import sklearn
from sklearn import compose
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
import warnings
import tensorflow as tf

### Data Preparation

In [33]:
# read data from .csvs
trainDF = pd.read_csv('./adult.csv')
testDF = pd.read_csv('./adult_test.csv')

# drop id column (wont be used for modeling)
trainDF.drop('id', axis = 1, inplace = True)
testIds = testDF['id']
testDF.drop('id', axis = 1, inplace = True)

In [45]:
# scale numeric predictors and encode categorical predictors
findNumPredictors = make_column_selector(dtype_include = int)
findCatPredictors = make_column_selector(dtype_include = object)
transform = make_column_transformer((MinMaxScaler(), findNumPredictors),
                                    (OneHotEncoder(drop = 'first'), findCatPredictors))

# get new column names
colNames = transform.fit(trainDF).get_feature_names_out()

# transform data
modelDF = pd.DataFrame.sparse.from_spmatrix(transform.fit_transform(trainDF), columns = colNames)

# get new column names
colNames = transform.fit(testDF).get_feature_names_out()

# transform data
predDF = pd.DataFrame.sparse.from_spmatrix(transform.fit_transform(testDF), columns = colNames)

# set random seed
np.random.seed(432023)

# split data into predictors and response
respTrain = modelDF['onehotencoder__income_>50K'].rename('income')
modelDF.drop('onehotencoder__income_>50K', axis = 1, inplace = True)

# split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(modelDF, respTrain, test_size = 0.2)

### Random Forest

In [35]:
# tuned max features hyperparamter
M = 22

# fit random forest model
model1 = RandomForestClassifier(max_features = M,
                                n_estimators = 1000,
                                random_state = 482023)
model1.fit(X_train, y_train) 

# predict on validation set
pred1 = model1.predict(X_valid)

# calculate validation accuracy
print(f"Validation accuracy: {(np.mean(pred1 == np.array(y_valid)) * 100).round(2)}%")

Validation accuracy: 85.94%


### Neural Networks

In [36]:
# set random seeds
np.random.seed(462023)
tf.random.set_seed(482023)

# number of epochs
EPOCHS = 50

# early stopping criteria
earlyStop = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)

# tuned penalty hyperparameter
PENALTY = 0.001

# define model architecture
model2 = models.Sequential([
    Dense(512, activation = 'relu', kernel_regularizer = l2(PENALTY), input_shape = (X_train.shape[1], )),
    Dense(256, activation = 'relu', kernel_regularizer = l2(PENALTY)),
    Dense(128, activation = 'relu', kernel_regularizer = l2(PENALTY)),
    Dense(2, activation = 'sigmoid')
])

# compile model
model2.compile(optimizer = 'rmsprop',
               loss = 'categorical_crossentropy',
               metrics = ['accuracy'])

# train model
trained2 = model2.fit(X_train, 
                      to_categorical(y_train), 
                      epochs = EPOCHS, 
                      batch_size = 128, 
                      validation_split = 0.2,
                      callbacks = earlyStop,
                      verbose = 0)

# predict on validation set
pred2 = model2.predict(X_valid)

# calculate validation accuracy
print(f"Validation accuracy: {((np.mean(pred2[:,1].round() == np.array(y_valid))) * 100).round(2)}%")

Epoch 27: early stopping
Validation accuracy: 85.77%


In [37]:
# set random seed
np.random.seed(462023)
tf.random.set_seed(482023)

# dropout rate
RATE = 0.25

# define model architecture
model3 = models.Sequential([
    Dense(512, activation = 'relu', input_shape = (X_train.shape[1], )),
    Dropout(rate = RATE),
    Dense(256, activation = 'relu'),
    Dropout(rate = RATE),
    Dense(128, activation = 'relu'),
    Dropout(rate = RATE),
    Dense(2, activation = 'sigmoid')
])

# compile model
model3.compile(optimizer = 'rmsprop',
               loss = 'categorical_crossentropy',
               metrics = ['accuracy'])

# train model
trained3 = model3.fit(X_train, 
                      to_categorical(y_train), 
                      epochs = EPOCHS, 
                      batch_size = 128, 
                      validation_split = 0.2,
                      callbacks = earlyStop,
                      verbose = 0)

# predict on validation set
pred3 = model3.predict(X_valid)

# calculate validation accuracy
print(f"Validation accuracy: {((np.mean(pred3[:,1].round() == np.array(y_valid))) * 100).round(2)}%")

Epoch 8: early stopping
Validation accuracy: 85.51%


### Final Model

In [57]:
# set random seeds
np.random.seed(462023)
tf.random.set_seed(482023)

# number of epochs
EPOCHS = 50

# early stopping criteria
earlyStop = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)

# tuned penalty hyperparameter
PENALTY = 0.001

# drop predictor not in test set
modelDF.drop('onehotencoder__native_country_Holand-Netherlands', axis = 1, inplace = True)

# define model architecture
model = models.Sequential([
    Dense(512, activation = 'relu', kernel_regularizer = l2(PENALTY), input_shape = (modelDF.shape[1], )),
    Dense(256, activation = 'relu', kernel_regularizer = l2(PENALTY)),
    Dense(128, activation = 'relu', kernel_regularizer = l2(PENALTY)),
    Dense(2, activation = 'sigmoid')
])

# compile model
model.compile(optimizer = 'rmsprop',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

# train model
trained = model.fit(modelDF, 
                    to_categorical(respTrain), 
                    epochs = EPOCHS, 
                    batch_size = 128, 
                    validation_split = 0.2,
                    callbacks = earlyStop,
                    verbose = 0)

# predict on validation set
pred = model.predict(predDF)
pred = pred[:,1].round()

Epoch 14: early stopping


In [58]:
# create submission data frame
submission = pd.DataFrame({'id': testIds, 'income': np.where(pred, '>50K', '<=50K')})

# export submission
submission.to_csv('./submission.csv', index = False)