#Transformer as classifier on raw dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.models import Sequential


df = pd.read_csv('creditcard.csv')
X = df.drop('Class', axis=1).values
y = df['Class'].values


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

# Oversample the minority class (fraud) to mitigate imbalance
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train, y_train)


n_features = X_train_bal.shape[1]
d_model    = 64
num_heads  = 4
ff_dim     = 128
num_layers = 2

inputs = tf.keras.Input(shape=(n_features,))
# Reshape each sample to a "sequence" of feature tokens
x = layers.Reshape((n_features, 1))(inputs)
# Project to d_model dimensions
x = layers.Dense(d_model)(x)


positions = tf.range(start=0, limit=n_features, delta=1)
pos_embed = layers.Embedding(input_dim=n_features, output_dim=d_model)
x = x + pos_embed(positions)

# Transformer encoder blocks
for _ in range(num_layers):

    attn_output = layers.MultiHeadAttention(num_heads=num_heads,
                                            key_dim=d_model)(x, x)
    x = layers.Add()([x, attn_output])
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    # Feed-forward network
    ff_output = Sequential([
        layers.Dense(ff_dim, activation='relu'),
        layers.Dense(d_model),
    ])(x)
    x = layers.Add()([x, ff_output])
    x = layers.LayerNormalization(epsilon=1e-6)(x)


x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.3)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

neg, pos = np.bincount(y_train_bal)
class_weight = {0: 1.0, 1: neg / pos}

model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=[
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ],
)


history = model.fit(
    X_train_bal, y_train_bal,
    epochs=20,
    batch_size=256,
    validation_split=0.2,
    class_weight=class_weight,
    verbose=2
)

y_prob = model.predict(X_test).ravel()
y_pred = (y_prob >= 0.5).astype(int)

print("Classification report:\n", classification_report(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_prob)
pr_auc  = average_precision_score(y_test, y_prob)
print(f"Test ROC‑AUC: {roc_auc:.4f}")
print(f"Test PR‑AUC : {pr_auc:.4f}")

Epoch 1/20
1422/1422 - 839s - 590ms/step - auc: 0.9663 - loss: 0.1962 - precision: 0.9499 - recall: 0.8616 - val_auc: 0.0000e+00 - val_loss: 0.2269 - val_precision: 1.0000 - val_recall: 0.8960
Epoch 2/20
1422/1422 - 860s - 605ms/step - auc: 0.9912 - loss: 0.1095 - precision: 0.9633 - recall: 0.9195 - val_auc: 0.0000e+00 - val_loss: 0.1205 - val_precision: 1.0000 - val_recall: 0.9338
Epoch 3/20
1422/1422 - 792s - 557ms/step - auc: 0.9974 - loss: 0.0640 - precision: 0.9716 - recall: 0.9614 - val_auc: 0.0000e+00 - val_loss: 0.0736 - val_precision: 1.0000 - val_recall: 0.9715
Epoch 4/20
1422/1422 - 831s - 585ms/step - auc: 0.9987 - loss: 0.0383 - precision: 0.9805 - recall: 0.9849 - val_auc: 0.0000e+00 - val_loss: 0.0334 - val_precision: 1.0000 - val_recall: 0.9871
Epoch 5/20
1422/1422 - 834s - 587ms/step - auc: 0.9992 - loss: 0.0264 - precision: 0.9856 - recall: 0.9939 - val_auc: 0.0000e+00 - val_loss: 0.0055 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 6/20
1422/1422 - 849s - 597ms

Note-Chatgpt was used to undersatnd results and solve errors in encoder block

The Transformer model, when trained on the original highly imbalanced dataset, achieves very high overall accuracy (≈100%) and a strong ROC-AUC of 0.9578, reflecting excellent discrimination between fraudulent and legitimate transactions.

However, looking closer at fraud detection:

Recall for fraud (class 1) is high at 0.86, meaning the model is able to catch the majority of fraudulent cases.

Precision for fraud is relatively low at 0.39, indicating that a substantial proportion of predicted fraud cases are false positives.

The F1-score for fraud is 0.54, showing only moderate balance between precision and recall.

The PR-AUC of 0.7618 further confirms that while the model is effective at ranking instances, its performance under imbalanced conditions still favors the majority class.

In other words, although the Transformer excels at distinguishing fraud from non-fraud in terms of ranking (ROC-AUC), the imbalance in the dataset limits its ability to maintain high precision for the minority fraud class.

Overall, the Transformer shows strong potential with excellent recall, making it effective at catching fraud, but improvements (such as data augmentation, threshold tuning, or hybrid GAN-based oversampling) are needed to boost precision and practical deployment performance.

[Credit card fraud detection using advanced transformer model](https://arxiv.org/abs/2402.00000), https://www.linkedin.com/pulse/credit-card-fraud-detection-how-ai-powered-algorithms-utpal-dutta-f07sc/