In [2]:
# import packages
import numpy as np
import pandas as pd
import sklearn
from sklearn import compose
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

## Data Preparation

In [None]:
# load the training and test sets from disk
trainDF = pd.read_csv('./adult.csv')
testDF = pd.read_csv('./adult_test.csv')

# the id column is not used for modeling: discard it from the training frame,
# and move it out of the test frame into testIds (pop removes the column
# from testDF and returns it)
trainDF = trainDF.drop(columns = ['id'])
testIds = testDF.pop('id')

In [None]:
# Scale numeric predictors and encode categorical predictors.
#
# The response is separated from the predictors *before* the transformer is
# fitted, so the transformer only ever sees predictor columns. It is fitted
# once on the training predictors and then reused (transform only) on the
# test predictors; re-fitting on the test frame would give the test features
# a different scaling and possibly a different set/order of one-hot columns
# than the features the model is trained on.

# split data into predictors and response
# (response is 1 where income is '>50K' and 0 otherwise)
respTrain = (trainDF['income'] == '>50K').astype('int64').rename('income')
trainPredictors = trainDF.drop(columns = 'income')
# the test frame may not carry the response column, so only drop it if present
testPredictors = testDF.drop(columns = 'income', errors = 'ignore')

# integer columns are min-max scaled, object (string) columns are one-hot encoded
findNumPredictors = make_column_selector(dtype_include = int)
findCatPredictors = make_column_selector(dtype_include = object)
# handle_unknown = 'ignore': a category that appears only in the test data is
# encoded as all zeros (with a warning) instead of raising at transform time
transform = make_column_transformer((MinMaxScaler(), findNumPredictors),
                                    (OneHotEncoder(drop = 'first', handle_unknown = 'ignore'),
                                     findCatPredictors))

# fit on the training predictors only, then get the new column names
trainMatrix = transform.fit_transform(trainPredictors)
colNames = transform.get_feature_names_out()

# transform data
modelDF = pd.DataFrame.sparse.from_spmatrix(trainMatrix, columns = colNames)

# apply the already-fitted transformer to the test predictors; scaled test
# values can fall outside [0, 1] when they exceed the range seen in training
predDF = pd.DataFrame.sparse.from_spmatrix(transform.transform(testPredictors), columns = colNames)

# predictors and response must stay row-aligned for the split below
assert len(modelDF) == len(respTrain), "predictor/response row count mismatch"

# set random seed (train_test_split draws from numpy's global RNG here)
np.random.seed(432023)

# split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(modelDF, respTrain, test_size = 0.2)

In [None]:
# convert data to pytorch tensors
# torch.tensor cannot ingest pandas objects directly, so go through NumPy
# first; to_numpy() also turns sparse-backed columns into a dense array
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
X_valid_tensor = torch.tensor(X_valid.to_numpy(), dtype=torch.float32)
# validation labels, so the held-out split can actually be scored
y_valid_tensor = torch.tensor(y_valid.to_numpy(), dtype=torch.long)

# Create dataset and dataloader (mini-batches of 128, reshuffled every epoch)
dataset = data.TensorDataset(X_train_tensor, y_train_tensor)
dataloader = data.DataLoader(dataset, batch_size=128, shuffle=True)

## Define Model

In [None]:
# define model architecture
class MLP(nn.Module):
    """Fully connected classifier: input -> 512 -> 256 -> 128 -> 2.

    The three hidden layers use ReLU. The output layer returns raw,
    unnormalized class scores (logits) for the two classes, which is what
    ``nn.CrossEntropyLoss`` expects (it applies log-softmax internally).
    Use ``torch.softmax(logits, dim=1)`` to obtain class probabilities and
    ``logits.argmax(dim=1)`` to obtain class predictions.

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 2)
        self.relu = nn.ReLU()
        # kept so existing attribute access still works; intentionally not
        # applied in forward() (see below)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return logits of shape (batch, 2) for a (batch, input_size) input."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        # No sigmoid here: squashing the scores into (0, 1) before
        # CrossEntropyLoss caps how confident the softmax can become and
        # weakens the gradients.
        return self.fc4(x)

## Train Model

In [None]:
# --- reproducibility: seed numpy's and torch's global RNGs ---
np.random.seed(462023)
torch.manual_seed(462023)

# --- training length ---
EPOCHS = 50

# --- model, loss and optimizer ---
# the input width is the number of predictor columns in X_train
model = MLP(X_train.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters())

# --- training loop ---
for epoch in range(EPOCHS):
    # accumulates the loss value reported for each mini-batch in this epoch
    running_loss = 0.0

    for batch in dataloader:
        inputs, labels = batch

        # gradients accumulate by default, so clear them before each step
        optimizer.zero_grad()

        # forward pass: score the batch and measure the loss
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # backward pass, then one optimizer update
        loss.backward()
        optimizer.step()

        running_loss = running_loss + loss.item()

    # report the accumulated loss divided by the number of batches
    print(f"Epoch {epoch+1} - Loss: {running_loss / len(dataloader)}")