In [1]:
!pip install tabtransformertf --quiet

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score

%matplotlib inline

  import cryptography.exceptions


In [3]:
from tabtransformertf.models.tabtransformer import TabTransformer
from tabtransformertf.utils.preprocessing import df_to_dataset, build_categorical_prep

In [4]:
import absl.logging
import warnings
import logging

# Quieten the notebook output:
#   - absl only reports messages at ERROR level,
#   - Python warnings are ignored,
#   - anything issued through the warnings module is redirected to the logging system.
absl.logging.set_verbosity(absl.logging.ERROR)
warnings.filterwarnings('ignore')
logging.captureWarnings(True)

In [5]:
# Read the training and validation CSVs from the working directory.
train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in ('fraud_train_preprocessed.csv', 'fraud_val_preprocessed.csv')
)

## Preprocessing

In [6]:
# Column information
LABEL = 'isFraud'

# Numeric columns minus the last two, which are dropped *by position*
# (per the original note: the label column and DT).
# NOTE(review): this relies on the CSV column order; excluding by name would be safer
# once the DT column name is confirmed.
NUMERIC_FEATURES = train_data.select_dtypes(include=np.number).columns[:-2]
# Every non-numeric column is treated as categorical.
CATEGORICAL_FEATURES = train_data.select_dtypes(exclude=np.number).columns

FEATURES = list(NUMERIC_FEATURES) + list(CATEGORICAL_FEATURES)

# Fail fast if the positional slice above did not actually remove the target:
# training on the label would leak it into the model inputs.
assert LABEL not in FEATURES, (
    f"Target column {LABEL!r} ended up in FEATURES; check the column order of the input CSV."
)

In [7]:
# Set data types: categorical columns become strings, numeric columns become floats.
# The same casts are applied in place to both the training and the validation frame.
for _frame in (train_data, val_data):
    _frame[CATEGORICAL_FEATURES] = _frame[CATEGORICAL_FEATURES].astype(str)
    _frame[NUMERIC_FEATURES] = _frame[NUMERIC_FEATURES].astype(float)

In [8]:
# Build the categorical preprocessing from the training frame only (validation/test data are not used here).
# The result is handed to the model below as `categorical_lookup`.
category_prep_layers = build_categorical_prep(train_data, CATEGORICAL_FEATURES)

100%|██████████| 46/46 [00:02<00:00, 21.54it/s]


In [9]:
# Run an explicit garbage-collection pass before building the TF datasets.
# The value displayed under the cell is the return value of gc.collect().
# NOTE(review): this import would conventionally live with the other imports at the top of the notebook.
import gc
gc.collect()

0

### TF Dataset

In [10]:
# Wrap the model inputs (feature columns + label) as batched datasets.
# The training call leaves `shuffle` at the helper's default; validation is explicitly
# not shuffled so that predictions stay aligned with the rows of `val_data`.
train_dataset = df_to_dataset(
    train_data[[*FEATURES, LABEL]],
    LABEL,
    batch_size=3072,
)
val_dataset = df_to_dataset(
    val_data[[*FEATURES, LABEL]],
    LABEL,
    shuffle=False,
    batch_size=3072,
)

## TabTransformer

In [23]:
# Instantiate the TabTransformer with the column lists and categorical lookups prepared above.
tabtransformer = TabTransformer(
    # --- inputs ---
    numerical_features=NUMERIC_FEATURES,
    categorical_features=CATEGORICAL_FEATURES,
    categorical_lookup=category_prep_layers,
    numerical_discretisers=None,  # simply passing the numeric features
    use_column_embedding=True,
    # --- transformer ---
    embedding_dim=32,
    depth=4,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=0.1,
    # --- head: a single sigmoid output, trained with binary cross-entropy below ---
    mlp_hidden_factors=[2, 4],
    out_dim=1,
    out_activation='sigmoid',
)

In [26]:
# Print the layer-by-layer summary (layer name/type, output shape, parameter count) of the model.
tabtransformer.summary()

Model: "tab_transformer_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_normalization_9 (Layer multiple                  270       
_________________________________________________________________
concatenate_3 (Concatenate)  multiple                  0         
_________________________________________________________________
string_lookup (StringLookup) multiple                  0         
_________________________________________________________________
string_lookup_1 (StringLooku multiple                  0         
_________________________________________________________________
string_lookup_2 (StringLooku multiple                  0         
_________________________________________________________________
string_lookup_3 (StringLooku multiple                  0         
_________________________________________________________________
string_lookup_4 (StringLooku multiple            

In [24]:
# Optimisation hyper-parameters.
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 1000  # upper bound on the number of epochs passed to fit()

# AdamW optimiser from TensorFlow Addons.
optimizer = tfa.optimizers.AdamW(
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE,
)

# Binary cross-entropy loss; the area under the precision-recall curve is tracked as a metric.
tabtransformer.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=optimizer,
    metrics=[
        tf.keras.metrics.AUC(name="PR AUC", curve='PR'),
    ],
)

In [25]:
out_file = './tabTransformerBasic'

# Save the model every time the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath=out_file,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# Stop after 10 epochs without a lower validation loss and roll back to the best weights.
early = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=10,
    restore_best_weights=True,
)

callback_list = [checkpoint, early]

# Train for at most NUM_EPOCHS epochs, evaluating on the validation set after each one.
history = tabtransformer.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS,
    callbacks=callback_list,
)

Epoch 1/1000

Epoch 00001: val_loss improved from inf to 0.15253, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 2/1000

Epoch 00002: val_loss improved from 0.15253 to 0.12931, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 3/1000

Epoch 00003: val_loss improved from 0.12931 to 0.12446, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 4/1000

Epoch 00004: val_loss did not improve from 0.12446
Epoch 5/1000

Epoch 00005: val_loss improved from 0.12446 to 0.11619, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 6/1000

Epoch 00006: val_loss did not improve from 0.11619
Epoch 7/1000

Epoch 00007: val_loss did not improve from 0.11619
Epoch 8/1000

Epoch 00008: val_loss did not improve from 0.11619
Epoch 9/1000

Epoch 00009: val_loss improved from 0.11619 to 0.11560, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 10/1000

Epoch 00010: val_loss did not improve from 0.11560
Epoch 11/1000

Epoch 00011: val_loss improved from 0.11560 to 0.11343, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 12/1000

Epoch 00012: val_loss did not improve from 0.11343
Epoch 13/1000

Epoch 00013: val_loss improved from 0.11343 to 0.10995, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 14/1000

Epoch 00014: val_loss did not improve from 0.10995
Epoch 15/1000

Epoch 00015: val_loss did not improve from 0.10995
Epoch 16/1000

Epoch 00016: val_loss did not improve from 0.10995
Epoch 17/1000

Epoch 00017: val_loss did not improve from 0.10995
Epoch 18/1000

Epoch 00018: val_loss did not improve from 0.10995
Epoch 19/1000

Epoch 00019: val_loss improved from 0.10995 to 0.10888, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 20/1000

Epoch 00020: val_loss did not improve from 0.10888
Epoch 21/1000

Epoch 00021: val_loss did not improve from 0.10888
Epoch 22/1000

Epoch 00022: val_loss did not improve from 0.10888
Epoch 23/1000

Epoch 00023: val_loss improved from 0.10888 to 0.10835, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 24/1000

Epoch 00024: val_loss did not improve from 0.10835
Epoch 25/1000

Epoch 00025: val_loss did not improve from 0.10835
Epoch 26/1000

Epoch 00026: val_loss improved from 0.10835 to 0.10732, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 27/1000

Epoch 00027: val_loss did not improve from 0.10732
Epoch 28/1000

Epoch 00028: val_loss did not improve from 0.10732
Epoch 29/1000

Epoch 00029: val_loss did not improve from 0.10732
Epoch 30/1000

Epoch 00030: val_loss improved from 0.10732 to 0.10615, saving model to ./tabTransformerBasic
INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


INFO:tensorflow:Assets written to: ./tabTransformerBasic/assets


Epoch 31/1000

Epoch 00031: val_loss did not improve from 0.10615
Epoch 32/1000

Epoch 00032: val_loss did not improve from 0.10615
Epoch 33/1000

Epoch 00033: val_loss did not improve from 0.10615
Epoch 34/1000

Epoch 00034: val_loss did not improve from 0.10615
Epoch 35/1000

Epoch 00035: val_loss did not improve from 0.10615
Epoch 36/1000

Epoch 00036: val_loss did not improve from 0.10615
Epoch 37/1000

Epoch 00037: val_loss did not improve from 0.10615
Epoch 38/1000

Epoch 00038: val_loss did not improve from 0.10615
Epoch 39/1000

Epoch 00039: val_loss did not improve from 0.10615
Epoch 40/1000

Epoch 00040: val_loss did not improve from 0.10615


In [13]:
# Reload the model from the directory the ModelCheckpoint callback writes to
# (same path as `out_file` in the training cell), replacing the in-memory model.
# NOTE(review): the path is repeated as a literal, so this cell also runs in a kernel
# where the training cell was skipped.
tabtransformer = tf.keras.models.load_model('./tabTransformerBasic')

In [12]:
# Score the validation set; `val_dataset` was built with shuffle=False, so the
# predictions are in the same row order as `val_data`.
val_preds = tabtransformer.predict(val_dataset)

_val_labels = val_data['isFraud']
_val_scores = val_preds.ravel()  # flatten the predictions to 1-D for the sklearn metrics

_pr_auc = average_precision_score(_val_labels, _val_scores)
_roc_auc = roc_auc_score(_val_labels, _val_scores)

print(f"PR AUC: {_pr_auc}")
print(f"ROC AUC: {_roc_auc}")

PR AUC: 0.4701335294324417
ROC AUC: 0.8794082315520849


## Test set prediction

In [9]:
# Read the test CSV from the working directory.
test_data = pd.read_csv('fraud_test_preprocessed.csv')

In [10]:
# Apply the same casts used for train/val: categorical columns -> str, numeric columns -> float.
for _cols, _dtype in ((CATEGORICAL_FEATURES, str), (NUMERIC_FEATURES, float)):
    test_data[_cols] = test_data[_cols].astype(_dtype)

In [11]:
# Build the unlabeled (target=None), unshuffled test dataset so predictions keep the row order of `test_data`.
# The batch size now matches the 3072 used for the train/validation datasets: the previous value of 56
# only made inference run many more, much smaller batches. It affects throughput, not the predicted values.
test_dataset = df_to_dataset(test_data[FEATURES], None, shuffle=False, batch_size=3072)

In [14]:
# Predict on the test dataset; the result is flattened with .ravel() when the submission is built.
test_preds = tabtransformer.predict(test_dataset)

In [15]:
# Transaction IDs are read from the raw test file one directory up; only that single column is loaded.
# NOTE(review): this assumes the raw file has the same row order as `test_data` — confirm.
transaction_ids = pd.read_csv('../test_transaction.csv', usecols=['TransactionID'])

# Pair each ID with its predicted value and write the submission file without the index.
submission_df = pd.DataFrame(
    {
        "TransactionID": transaction_ids.values.ravel(),
        "isFraud": test_preds.ravel(),
    }
)
submission_df.to_csv("tabtransformer_basic_submission.csv", index=False)

In [23]:
# Spot-check four randomly chosen rows of the submission.
# No random_state is set, so the rows shown differ from run to run.
submission_df.sample(4)

Unnamed: 0,TransactionID,isFraud
145394,3808943,0.025497
378395,4041944,0.007631
48681,3712230,0.028034
279758,3943307,0.025208
