

---

# Intro

**Plan**: Import credit card fraud data. Use an encoder-only transformer network to classify time-series credit card data.

**Purpose/Intro**: The task is to develop a proof-of-concept transformer architecture for a potential application at work: detecting fraud. In a normal data science project it might be considered best practice to begin with more interpretable models first, for research purposes, but this project is solely for the purpose of assessing the viability of a transformer for this task.

**Hypothesis**: The attention mechanism of the transformer, when combined with an appropriate positional embedding method, is able to capture both long-term and short-term dependencies in time series credit-card fraud data.

**Methodology**: Use cross-validation techniques on the test dataset to calculate appropriate accuracy metrics (adjusting for the significant class imbalance in the dataset), with the aim of assessing the viability of transformer networks for fraud classification.





---

# Data Sourcing and Processing



In [2]:

#import packages:

import os

# Library locations for the CUDA NVVM compiler and libdevice, exported before
# any GPU library is imported (presumably consumed by Numba — confirm).
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

# Detect Google Colab by attempting the import. Every `google.colab` import
# lives inside the `try` so the notebook still loads outside Colab; an
# unconditional import would raise before the fallback could ever run.
try:
    import google.colab
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Check if drive is mounted by looking for the mount point in the file system.
    # This is a more robust approach than relying on potentially internal variables.
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')

#basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#table one
!pip install tableone
from tableone import TableOne

#torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

#sklearn
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold

from imblearn.over_sampling import RandomOverSampler

Collecting tableone
  Downloading tableone-0.9.1-py3-none-any.whl.metadata (8.5 kB)
Downloading tableone-0.9.1-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tableone
Successfully installed tableone-0.9.1


In [5]:
data_set_filepath = '/content/drive/MyDrive/Colab_Notebooks/Data/creditcard.feather'

# Load the transactions table from the Feather file on the mounted drive.
df = pd.read_feather(data_set_filepath)

columns = df.columns.tolist()

# Quick sanity report on the size and schema of the loaded frame.
print(f"The dataset lenghth is {len(df)}")
print(f"The number of columns is {len(columns)}")
print(f"The column names are {columns}")
# NOTE: a bare expression is only rendered by the notebook when it is the
# last statement of a cell, so this preview is not displayed from here.
df.head(10)

# Summary statistics per class. Column names are case-sensitive: the label
# column is 'Class', so grouping by 'class' fails with an unknown column.
table1 = TableOne(df, columns=columns, groupby='Class', pval=True)
print(table1)





The dataset lenghth is 284807
The number of columns is 31
The column names are ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0




---
# Transformer Model



**No HP Tuning**: First we implement the model without hyperparameter tuning and deliberately try to overfit the training data. This only demonstrates that the network has enough capacity to fit the data and that the architecture can be set up and run end to end; it does not yet say anything about generalization.

In [None]:

# **Set device for GPU acceleration**
# If CUDA (NVIDIA GPU) is available, computations will use it. Otherwise, it defaults to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#get data
data = df

# Separate features (every column but the last) and labels (the last column).
# An explicit float copy guarantees that the in-place normalisation below can
# never write through to `df`.
X = data.iloc[:, :-1].to_numpy(dtype=np.float64, copy=True)
y = data.iloc[:, -1].to_numpy()

# **Split the data into training, validation, and test sets (70/15/15)**
# Splitting happens BEFORE any statistics are computed so nothing leaks from
# the validation/test rows into training. `stratify` keeps the fraud ratio
# the same in every split, which matters because the positive class is rare.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# **Normalize features**
# Standardize every feature column except the last one (`Time` and `V1`..`V28`)
# to zero mean / unit variance, using statistics from the training split only.
# `X` itself is left in raw units; only the split arrays are normalised.
scaler = StandardScaler()
X_train[:, :-1] = scaler.fit_transform(X_train[:, :-1])
X_val[:, :-1] = scaler.transform(X_val[:, :-1])
X_test[:, :-1] = scaler.transform(X_test[:, :-1])

# Log-transform the last feature column (`Amount`) with log(1 + x) to reduce
# the effect of large values. This is stateless, so it is applied per split.
for _features in (X_train, X_val, X_test):
    _features[:, -1] = np.log1p(_features[:, -1])

# **Convert data to PyTorch tensors**
train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32).to(device),
                           torch.tensor(y_train, dtype=torch.float32).to(device))
val_data = TensorDataset(torch.tensor(X_val, dtype=torch.float32).to(device),
                         torch.tensor(y_val, dtype=torch.float32).to(device))
test_data = TensorDataset(torch.tensor(X_test, dtype=torch.float32).to(device),
                          torch.tensor(y_test, dtype=torch.float32).to(device))

# **DataLoader for batching**
# Only the training loader shuffles; evaluation order does not matter.
batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

# **2. Custom Positional Embedding Layer (from the book)**

class PositionalEmbedding(nn.Module):
    """Embed a sequence of values and add a learned position embedding.

    Two kinds of input are supported, chosen by the dtype of the tensor
    passed to ``forward``:

    * integer tensors are treated as token ids and looked up in
      ``token_embeddings`` (the original behaviour);
    * floating-point tensors are treated as continuous values. Each scalar
      is projected to ``output_dim`` by ``value_projection``, because an
      embedding lookup cannot be indexed with floats and raises on them.

    Args:
        sequence_length: Number of positions that can be embedded.
        input_dim: Vocabulary size of the integer token lookup table.
        output_dim: Size of every embedding vector.
    """

    def __init__(self, sequence_length, input_dim, output_dim):
        super().__init__()
        # Token embedding: lookup table used for integer (token id) input.
        self.token_embeddings = nn.Embedding(input_dim, output_dim)
        # Position embedding: maps each position in the sequence to a vector.
        self.position_embeddings = nn.Embedding(sequence_length, output_dim)
        self.sequence_length = sequence_length
        # Value projection: lifts one continuous scalar to `output_dim`.
        # Created last so the initialisation of the two lookup tables is
        # unchanged for a given random seed.
        self.value_projection = nn.Linear(1, output_dim)

    def forward(self, x):
        """Return value embeddings plus position embeddings.

        Args:
            x: Tensor whose second dimension is the sequence axis
                (assumed ``(batch, seq_len)`` — confirm against callers).

        Returns:
            Tensor with a trailing embedding axis of size ``output_dim``
            appended to the shape of ``x``.
        """
        # Positions 0 .. seq_len - 1, with a leading axis that broadcasts
        # over the batch.
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
        if torch.is_floating_point(x):
            # Continuous features: (batch, seq) -> (batch, seq, 1) -> (batch, seq, dim)
            embedded_tokens = self.value_projection(x.unsqueeze(-1))
        else:
            embedded_tokens = self.token_embeddings(x)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

# **3. Define the Transformer Model with Custom Positional Embedding**

class FraudDetectionTransformer(nn.Module):
    """Encoder-only transformer that outputs one probability per input row.

    Pipeline: positional embedding -> stacked transformer encoder layers ->
    average pooling over the sequence axis -> linear layer -> sigmoid.

    Args:
        input_dim: Forwarded to ``PositionalEmbedding`` as its ``input_dim``.
        embed_dim: Width of the embedding / transformer model dimension.
        num_heads: Number of attention heads per encoder layer.
        ff_dim: Hidden width of each encoder layer's feed-forward sublayer.
        num_layers: Number of stacked encoder layers.
        sequence_length: Forwarded to ``PositionalEmbedding``.
    """

    def __init__(self, input_dim, embed_dim, num_heads, ff_dim, num_layers, sequence_length):
        super().__init__()
        # Embedding of the inputs with position information added.
        self.positional_embedding = PositionalEmbedding(sequence_length, input_dim, embed_dim)

        # Template encoder layer (dropout 0.1) and the stack built from it.
        self.encoder_layer = nn.TransformerEncoderLayer(
            embed_dim,
            num_heads,
            dim_feedforward=ff_dim,
            dropout=0.1,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        # Collapses the sequence axis to length 1 by averaging.
        self.pooling = nn.AdaptiveAvgPool1d(1)

        # Classification head: one value per sample, squashed into (0, 1).
        self.fc = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape ``(batch, 1)`` for input ``x``."""
        embedded = self.positional_embedding(x)

        # The encoder is used sequence-first:
        # (batch, seq_len, feature) -> (seq_len, batch, feature)
        sequence_first = embedded.transpose(0, 1)
        encoded = self.transformer_encoder(sequence_first)

        # (seq_len, batch, feature) -> (batch, feature, seq_len) -> (batch, feature, 1)
        pooled = self.pooling(encoded.permute(1, 2, 0))

        # Drop the pooled axis so the linear head sees (batch, feature).
        flattened = pooled.flatten(start_dim=1)

        return self.sigmoid(self.fc(flattened))

# **4. Instantiate the Model**
# The model treats every feature column as one position of the sequence, so
# the embedding input size and the sequence length are both the column count.
input_dim = sequence_length = X_train.shape[1]
embed_dim, num_heads = 256, 4  # embedding width, attention heads
ff_dim, num_layers = 512, 4  # feed-forward width, encoder layers

model = FraudDetectionTransformer(
    input_dim=input_dim,
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    num_layers=num_layers,
    sequence_length=sequence_length,
)
model = model.to(device)

# **5. Training Setup**
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# **6. Training Loop**
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Module that maps a batch of inputs to one output per sample.
        train_loader: Iterable of ``(inputs, labels)`` training batches;
            must support ``len()``.
        val_loader: Iterable of validation batches, passed to
            ``validate_model`` after every epoch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimiser over the model's parameters.
        epochs: Number of passes over ``train_loader``.
    """
    # Send batches to wherever the model lives rather than relying on a
    # module-level global; fall back to CPU for a parameter-free model.
    model_device = next((p.device for p in model.parameters()), torch.device("cpu"))

    for epoch in range(epochs):
        # validate_model() leaves the model in eval mode, so training mode
        # (dropout on) has to be re-enabled at the start of every epoch.
        model.train()

        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(model_device), labels.to(model_device)

            optimizer.zero_grad()  # Zero the gradients

            # Forward pass
            outputs = model(inputs)

            # reshape(-1) always yields shape (batch,), even for a final
            # batch of one, where a bare squeeze() collapses to a scalar
            # and no longer matches the label shape.
            loss = criterion(outputs.reshape(-1), labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader)}")

        # Report validation loss/accuracy after each epoch.
        validate_model(model, val_loader, criterion)

# **7. Validation Function**
def validate_model(model, val_loader, criterion):
    """Evaluate ``model`` on ``val_loader`` and print loss and accuracy.

    The model is switched to evaluation mode and is left in that mode.

    Args:
        model: Module that maps a batch of inputs to one probability per sample.
        val_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)``.

    Raises:
        ValueError: If ``val_loader`` yields no samples.
    """
    model.eval()  # Set model to evaluation mode

    # Send batches to wherever the model lives rather than relying on a
    # module-level global; fall back to CPU for a parameter-free model.
    model_device = next((p.device for p in model.parameters()), torch.device("cpu"))

    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(model_device), labels.to(model_device)

            # reshape(-1) always yields shape (batch,), even for a final
            # batch of one, where a bare squeeze() collapses to a scalar
            # and no longer matches the label shape.
            outputs = model(inputs).reshape(-1)

            total_loss += criterion(outputs, labels).item()

            # Threshold at 0.5 to get hard predictions.
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            num_batches += 1

    if total == 0:
        raise ValueError("val_loader yielded no samples; cannot compute validation metrics")

    mean_loss = total_loss / num_batches
    accuracy = correct / total * 100
    print(f"Validation Loss: {mean_loss}, Accuracy: {accuracy}%")
    return mean_loss, accuracy

# **8. Start Training**
# Ten epochs, with a validation report printed after each one.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=10,
)

# **9. Testing the Model**
def test_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # Calculate accuracy
            predicted = (outputs.squeeze() > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        print(f"Test Accuracy: {correct/total * 100}%")

# **10. Evaluate on the Test Set**
# Final evaluation on the held-out test split.
test_model(model=model, test_loader=test_loader)
