In [14]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from tensorflow.keras.callbacks import EarlyStopping

from tabtransformertf.models.tabtransformer import TabTransformer, TabTransformerRTD
from tabtransformertf.utils.preprocessing import df_to_dataset, build_categorical_prep, df_to_pretrain_dataset

In [28]:
from tabtransformertf.utils.helper import corrupt_dataset

def df_to_pretrain_dataset(
    x: pd.DataFrame,
    shuffle: bool = True,
    batch_size: int = 512,
    p_replace: float = 0.3,
):
    """Build a batched ``tf.data.Dataset`` from a corrupted copy of ``x``.

    NOTE(review): this name is defined a second time further down the
    notebook with two extra positional parameters (``numeric_columns``,
    ``categorical_columns``). Whichever cell ran last wins, and both
    definitions shadow the function of the same name imported from
    ``tabtransformertf.utils.preprocessing``.

    Args:
        x: Feature frame handed to ``corrupt_dataset``.
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per emitted batch.
        p_replace: Forwarded to ``corrupt_dataset`` -- presumably the
            per-value replacement probability (confirm against the helper).

    Returns:
        A dataset of ``(features, y)`` pairs. ``features`` is a dict keyed by
        column name whose values have shape ``(batch, 1)``; ``y`` is the
        second value returned by ``corrupt_dataset``.
    """
    x, y = corrupt_dataset(x, p_replace)

    # Go through NumPy before adding the trailing axis: indexing a Series with
    # ``[:, None]`` is rejected by pandas 2.x.
    dataset = {key: value.to_numpy().reshape(-1, 1) for key, value in x.items()}
    dataset = tf.data.Dataset.from_tensor_slices((dataset, y))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(x))
    dataset = dataset.batch(batch_size)
    # prefetch() counts elements of the *batched* dataset, so the previous
    # ``prefetch(batch_size)`` asked for 512 batches in flight.
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

## Download Data

In [1]:
# Column names supplied via ``names=`` because both files are read with
# ``header=None``.
CSV_HEADER = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "gender",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "income_bracket",
]

# Training split.
train_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
train_data = pd.read_csv(train_data_url, names=CSV_HEADER, header=None)

# Held-out test split (a later cell drops its invalid first row).
test_data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
test_data = pd.read_csv(test_data_url, names=CSV_HEADER, header=None)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

In [3]:
# Preview the first rows of the training frame; as the cell's last expression
# it is rendered as the cell output.
train_data.head()

## Preprocess

In [None]:
# Column information
LABEL = 'income_bracket'

# Remove the label by name instead of by position (``columns[:-1]``).
# On a fresh run the label is still a string column and the result is the same
# as before; if this cell is re-run after the label has been encoded to int,
# the label no longer leaks into NUMERIC_FEATURES and the last categorical
# column is no longer dropped by mistake.
NUMERIC_FEATURES = train_data.select_dtypes(include=np.number).columns.drop(LABEL, errors="ignore")
CATEGORICAL_FEATURES = train_data.select_dtypes(exclude=np.number).columns.drop(LABEL, errors="ignore")

# Plain list of all model inputs: numeric columns first, then categorical.
FEATURES = list(NUMERIC_FEATURES) + list(CATEGORICAL_FEATURES)

In [4]:
# Binary target: 1 where the raw label equals the ">50K" token, else 0.
# The two files spell the positive class differently (the test file adds a
# trailing "."), and both keep the leading space from the CSV.
train_data[LABEL] = train_data[LABEL].eq(' >50K').astype("int64")
test_data[LABEL] = test_data[LABEL].eq(' >50K.').astype("int64")

# Positive-class rate of each split, shown as the cell output.
train_data[LABEL].mean(), test_data[LABEL].mean()

In [None]:
# Drop the invalid first row of the test file. ``.copy()`` detaches the result
# from the original frame so the column assignments below do not write into a
# slice (SettingWithCopyWarning).
# NOTE: this cell is not idempotent -- each re-run removes one more row, so
# re-run the download cell first.
test_data = test_data.iloc[1:, :].copy()

# Set data types
train_data[CATEGORICAL_FEATURES] = train_data[CATEGORICAL_FEATURES].astype(str)
test_data[CATEGORICAL_FEATURES] = test_data[CATEGORICAL_FEATURES].astype(str)

train_data[NUMERIC_FEATURES] = train_data[NUMERIC_FEATURES].astype(float)
test_data[NUMERIC_FEATURES] = test_data[NUMERIC_FEATURES].astype(float)

# Train/validation split (80/20). The seed is fixed so the partition -- and
# therefore every downstream metric -- is reproducible across kernel restarts.
X_train, X_val = train_test_split(train_data, test_size=0.2, random_state=42)

## Modelling Prep


In [5]:
# Category preprocessing layers
# Built from the training split only (X_train), for the columns listed in
# CATEGORICAL_FEATURES. The result is later passed to TabTransformerRTD as
# ``categorical_lookup``.
category_prep_layers = build_categorical_prep(X_train, CATEGORICAL_FEATURES)

In [None]:
from tabtransformertf.utils.helper import corrupt_dataset

def df_to_pretrain_dataset(
    x: pd.DataFrame,
    numeric_columns,
    categorical_columns,
    shuffle: bool = True,
    batch_size: int = 512,
    p_replace: float = 0.3,
):
    """Build a batched ``tf.data.Dataset`` from a corrupted copy of ``x``.

    Args:
        x: Feature frame handed to ``corrupt_dataset``.
        numeric_columns: Columns cast to ``float`` after corruption.
        categorical_columns: Columns cast to ``str`` after corruption.
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per emitted batch.
        p_replace: Forwarded to ``corrupt_dataset`` -- presumably the
            per-value replacement probability (confirm against the helper).

    Returns:
        A dataset of ``(features, y)`` pairs. ``features`` is a dict keyed by
        column name whose values have shape ``(batch, 1)``; ``y`` is the
        second value returned by ``corrupt_dataset``.
    """
    x, y = corrupt_dataset(x, p_replace)

    # Re-assert column dtypes on the corrupted frame.
    # NOTE(review): presumably corrupt_dataset does not preserve them -- confirm.
    x[numeric_columns] = x[numeric_columns].astype(float)
    x[categorical_columns] = x[categorical_columns].astype(str)

    # Go through NumPy before adding the trailing axis: indexing a Series with
    # ``[:, None]`` is rejected by pandas 2.x.
    dataset = {key: value.to_numpy().reshape(-1, 1) for key, value in x.items()}
    dataset = tf.data.Dataset.from_tensor_slices((dataset, y))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(x))
    dataset = dataset.batch(batch_size)
    # prefetch() counts elements of the *batched* dataset, so the previous
    # ``prefetch(batch_size)`` asked for 512 batches in flight.
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [6]:
# Prepare pretraining dataset
# Pre-training only sees feature columns (no label), so the train, validation
# and test features are pooled and then re-split 80/20.
pretrain_data = pd.concat([X_train[FEATURES], X_val[FEATURES], test_data[FEATURES]])
# Fixed seed so the pre-training train/validation partition is reproducible.
pretrain_train, pretrain_val = train_test_split(pretrain_data, test_size=0.2, random_state=42)

rtd_train_dataset = df_to_pretrain_dataset(pretrain_train, NUMERIC_FEATURES, CATEGORICAL_FEATURES)
# The validation set keeps its row order.
rtd_val_dataset = df_to_pretrain_dataset(pretrain_val, NUMERIC_FEATURES, CATEGORICAL_FEATURES, shuffle=False, p_replace=0.3)

In [7]:
# Convert the supervised splits to TF datasets.
# Training: features plus label, default shuffling.
train_dataset = df_to_dataset(X_train[[*FEATURES, LABEL]], LABEL)
# Validation: features plus label, original row order kept.
val_dataset = df_to_dataset(X_val[[*FEATURES, LABEL]], LABEL, shuffle=False)
# Test: features only (no target passed), original row order kept so the
# predictions line up with test_data.
test_dataset = df_to_dataset(test_data[FEATURES], shuffle=False)

## Pre-Training

In [8]:
# Hyper-parameters for the pre-training run. NUM_EPOCHS is only an upper
# bound; early stopping below ends the run sooner.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 1000

# Pre-training model built from the column lists and categorical lookups.
tabtransformer_pretraining = TabTransformerRTD(
    numerical_features=NUMERIC_FEATURES,
    categorical_features=CATEGORICAL_FEATURES,
    categorical_lookup=category_prep_layers,
    numerical_discretisers=None,  # numeric features are passed through as-is
    embedding_dim=32,
    depth=4,
    heads=8,
    attn_dropout=0.3,
    ff_dropout=0.2,
    use_column_embedding=True,
)

optimizer = tfa.optimizers.AdamW(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

tabtransformer_pretraining.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC(name="AUC", curve="ROC")],
)

# Stop after 10 epochs without a val_loss improvement and roll back to the
# best weights seen.
early = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    restore_best_weights=True,
)
callback_list = [early]

history = tabtransformer_pretraining.fit(
    rtd_train_dataset,
    validation_data=rtd_val_dataset,
    epochs=NUM_EPOCHS,
    callbacks=callback_list,
    verbose=1,
)

## Training

In [None]:
# Classifier with a single sigmoid output that reuses the encoder object from
# the pre-training model.
tabtransformer = TabTransformer(
    out_dim=1,
    out_activation="sigmoid",
    encoder=tabtransformer_pretraining.encoder,
)

# The predictions are discarded.
# NOTE(review): presumably this forward pass exists only to build the model
# before it is frozen and compiled -- confirm.
_ = tabtransformer.predict(val_dataset)

In [9]:
# tabtransformer.save('untrained_tabtransformer/')

In [10]:
# tabtransformer = tf.keras.models.load_model('untrained_tabtransformer/')

In [None]:
# Freeze the encoder taken from the pre-training model so that fitting only
# updates the remaining layers. This is set before the compile() call in the
# next cell.
tabtransformer.encoder.trainable = False

In [11]:
# Hyper-parameters for the supervised run. These rebind the names used by the
# pre-training cell, with a 10x smaller learning rate.
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 1000

optimizer = tfa.optimizers.AdamW(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

tabtransformer.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.AUC(name="AUC", curve="ROC")],
)

# Stop after 10 epochs without a val_loss improvement and roll back to the
# best weights seen.
early = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    restore_best_weights=True,
)
callback_list = [early]

history = tabtransformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS,
    callbacks=callback_list,
    verbose=1,
)


In [None]:
# Score the held-out test set. test_dataset was built with shuffle=False, so
# the predictions are expected to follow the row order of test_data.
test_preds = tabtransformer.predict(test_dataset)

In [12]:
# Report ranking metrics on the test set, rounded to 4 decimals.
# ``ravel()`` flattens the prediction array to 1-D before scoring.
print(
    "Test ROC AUC:",
    np.round(roc_auc_score(y_true=test_data[LABEL], y_score=test_preds.ravel()), 4),
)
print(
    "Test PR AUC:",
    np.round(average_precision_score(y_true=test_data[LABEL], y_score=test_preds.ravel()), 4),
)