In [341]:
# import packages
import numpy as np
import pandas as pd
import sklearn
from sklearn import compose
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import torch
import torch.autograd as auto
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader

## Data Preprocessing

In [237]:
# read data from .csvs
trainDF = pd.read_csv('./adult.csv')
testDF = pd.read_csv('./adult_test.csv')

# drop id column (won't be used for modeling)
trainDF.drop('id', axis=1, inplace=True)
testIds = testDF['id']
testDF.drop('id', axis=1, inplace=True)

In [284]:
# define custom transform
class CustomTransform:
    def __init__(self, num_scaler, cat_encoder, out_encoder):
        self.num_scaler = num_scaler
        self.cat_encoder = cat_encoder
        self.out_encoder = out_encoder

    def __call__(self, predictors, labels):        
        num_features = predictors.select_dtypes(include=[np.number])
        cat_features = predictors.select_dtypes(include=[object])
        
        num_scaled = self.num_scaler.transform(num_features)
        cat_encoded = self.cat_encoder.transform(cat_features).toarray()
        out_encoded = self.out_encoder.transform(labels).toarray()

        transformed = np.concatenate([num_scaled, cat_encoded], axis=1)

        return transformed, out_encoded

In [290]:
# define custom dataset
class CustomDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data.iloc[[idx]].copy()

        predictors = sample.drop('income', axis = 1)
        labels = pd.DataFrame(sample['income'])

        if self.transform:
            predictors, labels = self.transform(predictors, labels)
        
        return predictors, labels

In [293]:
# scale numeric predictors and encode categorical predictors
num_scaler = MinMaxScaler()
cat_encoder = OneHotEncoder(drop='first')
out_encoder = OneHotEncoder(drop='first')

num_features_train = trainDF.drop('income', axis = 1).select_dtypes(include=[np.number])
cat_features_train = trainDF.drop('income', axis = 1).select_dtypes(include=[object])
out_features_train = pd.DataFrame(trainDF['income'])

num_scaler.fit(num_features_train)
cat_encoder.fit(cat_features_train)
out_encoder.fit(out_features_train)
# out_encoder.fit(np.array(out_features_train).reshape(-1, 1))

# define transformation function
transform = CustomTransform(num_scaler, cat_encoder, out_encoder)

In [294]:
# create custom dataset
train_dataset = CustomDataset(trainDF, transform=transform)
test_dataset = CustomDataset(testDF, transform=transform)

# set random seed
np.random.seed(432023)

# split data into training and validation sets
train_size = int(0.8 * len(train_dataset))
valid_size = len(train_dataset) - train_size
train_data, valid_data = data.random_split(train_dataset, [train_size, valid_size])

# create DataLoader objects
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=128)

# convert data to PyTorch tensors
X_train_tensor, y_train_tensor = zip(*train_loader)
X_valid_tensor, y_valid_tensor = zip(*valid_loader)

X_train_tensor = torch.cat(list(X_train_tensor), dim=0)
y_train_tensor = torch.cat(list(y_train_tensor), dim=0)
X_valid_tensor = torch.cat(list(X_valid_tensor), dim=0)
y_valid_tensor = torch.cat(list(y_valid_tensor), dim=0)

## Define Model

In [319]:
# define model architecture
class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = x.to(torch.float32)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

## Train Model

In [343]:
# set random seeds
torch.manual_seed(462023)
np.random.seed(462023)

# number of epochs
EPOCHS = 10

# initialize the model
model = MLP(X_train_tensor.shape[2])

# define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters())

# train the model
for epoch in range(EPOCHS):
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(inputs)
        outputs = torch.Tensor(torch.argmax(outputs[:, 0], dim=1).numpy().astype(int))
        outputs = Variable(outputs, requires_grad = True)
        labels = torch.Tensor(labels[:, 0, 0].numpy().astype(int))
        loss = criterion(outputs, labels)
    
        # backward pass and optimization
        loss.backward()
        optimizer.step()
    
        running_loss += loss.item()

    # Print the average loss for the epoch
    print(f"Epoch {epoch+1} - Loss: {running_loss / len(train_loader)}")

Epoch 1 - Loss: 149.18241687849456
Epoch 2 - Loss: 149.17216450560326
Epoch 3 - Loss: 149.19169470843147
Epoch 4 - Loss: 149.1845801671346
Epoch 5 - Loss: 149.18455449272605
Epoch 6 - Loss: 149.17412185668945
Epoch 7 - Loss: 149.1708838518928
Epoch 8 - Loss: 149.17612535813277
Epoch 9 - Loss: 149.16206797431497
Epoch 10 - Loss: 149.16241238163965


## Model Evaluation

In [346]:
# set the model to evaluation mode
model.eval()

# predict on validation set
with torch.no_grad():
    pred = model(X_valid_tensor)
    pred = torch.argmax(pred[:, 0], dim=1).numpy()

# calculate validation accuracy
accuracy = (pred == y_valid).mean() * 100
print(f"Validation accuracy: {accuracy.round(2)}%")

Validation accuracy: 76.03%
