# Financial Fraud Detection

In [1]:
# Make the "src" directory importable from this notebook
import os
import sys

# "src" is expected next to the notebook's folder, i.e. under the parent of
# the current working directory.
_cwd_parent = os.path.dirname(os.getcwd())
parent_dir = os.path.abspath(os.path.join(_cwd_parent, 'src'))

# Prepend the directory so it is searched first when importing
sys.path.insert(0, parent_dir)

In [2]:
# should be able to import from "src" folder now
from preprocess_TabFormer import proprocess_data

----
# Step 1: Get and Prepare the data

___Unfortunately, the data needs to be downloaded manually___

1. Download the dataset: https://ibm.ent.box.com/v/tabformer-data/folder/130747715605
2. Untar and uncompress the file: tar -xvzf ./transactions.tgz
3. Put card_transaction.v1.csv in the "./data/TabFormer/raw" folder

In [6]:
# Set the path to your dataset root directory (relative to the notebook's
# working directory). Later cells read from and write to sub-folders of this
# path, and it is bind-mounted into the training container as /data.
dataset_root_dir = '../data/TabFormer/'

The goal is to produce the following structure
```sh
data/TabFormer 
├── raw
│   ├── card_transaction.v1.csv
```

In [7]:
# List the dataset directory before preprocessing (requires the `tree` command)
!tree {dataset_root_dir}

[01;34m../data/TabFormer/[0m
└── [01;34mraw[0m
    ├── card_transaction.v1.csv
    ├── [01;34mgnn[0m
    └── [01;34mxgb[0m

3 directories, 1 file


In [8]:
# Preprocess the raw data found under dataset_root_dir.
# NOTE(review): judging by the directory listing shown after this cell, the
# call writes the gnn/ and xgb/ folders — confirm in the preprocess_TabFormer
# module.
proprocess_data(dataset_root_dir)

Correlation (Card, Fraud) =   6.59%
Correlation (Chip, Fraud) =   5.63%
Correlation (Errors, Fraud) =   1.81%
Correlation (State, Fraud) =  35.92%
Correlation (City, Fraud) =  32.47%
Correlation (Zip, Fraud) =  14.99%
Correlation (MCC, Fraud) =  12.70%
Correlation (Merchant, Fraud) =  34.88%
Correlation (User, Fraud) =   3.40%
Correlation (Day, Fraud) =   0.26%
Correlation (Month, Fraud) =   0.23%
Correlation (Year, Fraud) =   2.35%
r_pb (Time) = -0.00 with p_value 0.00
r_pb (Amount) = 0.03 with p_value 0.00
Transaction ID range (0, 281107)
Merchant ID range (281108, 322375)
User ID range (322376, 327149)


In [9]:
# List the dataset directory again to inspect the files present after preprocessing
!tree {dataset_root_dir}

[01;34m../data/TabFormer/[0m
├── [01;34mgnn[0m
│   ├── edges.csv
│   ├── features.csv
│   ├── info.json
│   └── labels.csv
├── [01;34mraw[0m
│   ├── card_transaction.v1.csv
│   ├── [01;34mgnn[0m
│   └── [01;34mxgb[0m
└── [01;34mxgb[0m
    ├── example_transactions.csv
    ├── test.csv
    ├── training.csv
    ├── untransformed_test.csv
    └── validation.csv

5 directories, 10 files


---
# Step 2: Preprocess data and run training using Training NIM
- Call function to preprocess data
- Train models using Training NIM
  - For local testing, deploy Training NIM
  - Train model based on input config file

### Create training configuration file
NOTE: The training configuration file must conform to the training schemas defined in Training NIM.

In [10]:
# Folder where the trained models will be saved; created if it does not exist
trained_models_dir = os.path.join(dataset_root_dir, 'trained_models')
os.makedirs(trained_models_dir, exist_ok=True)

#### Important: Models and configuration files needed to deploy using Triton Inference Server will be saved in trained_models/model_repository

In [11]:
# Hyperparameters for the "gnn" section of the model
gnn_hyperparameters = {
    "hidden_channels": 16,
    "n_hops": 1,
    "dropout_prob": 0.1,
    "batch_size": 1024,
    "fan_out": 16,
    "num_epochs": 16,
}

# Hyperparameters for the "xgb" section of the model
xgb_hyperparameters = {
    "max_depth": 6,
    "learning_rate": 0.2,
    "num_parallel_tree": 3,
    "num_boost_round": 512,
    "gamma": 0.0,
}

# Full training configuration; key order is kept as it is written to JSON later
training_config = {
    "paths": {
        # Mount the dataset root directory under /data in the container
        "data_dir": "/data",
        # Mounted path to save the trained models
        "output_dir": "/data/trained_models",
    },
    "models": [
        {
            "kind": "GraphSAGE_XGBoost",
            "gpu": "single",
            "hyperparameters": {
                "gnn": gnn_hyperparameters,
                "xgb": xgb_hyperparameters,
            },
        },
    ],
}


#### Save the training configuration file as a json file

In [12]:
import os
import json

# The configuration is written to this file in the current working directory
training_config_file_name = 'training_config.json'

# Serialize the configuration as JSON with 4-space indentation
with open(training_config_file_name, 'w') as config_file:
    json.dump(training_config, config_file, indent=4)

### Make sure to pull the training container or build from the source using
```sh
    docker build --no-cache -t training_container /path/to/training_NIM_repo
```

#### Finally, train the models according to the configuration file defined above

In [13]:
!docker run --cap-add SYS_NICE -it --rm  --gpus all  -v {dataset_root_dir}:/data -v ./{training_config_file_name}:/app/config.json training_container --config /app/config.json

docker: permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock: Head "http://%2Fvar%2Frun%2Fdocker.sock/_ping": dial unix /var/run/docker.sock: connect: permission denied.
See 'docker run --help'.


#### Make sure that the `model_repository` has been created with right contents in it
According to the configuration file defined above, the `model_repository`, which is the folder containing the models and configuration files to be deployed on the Triton Inference Server, will be created under
{dataset_root_dir}/trained_models/ and its contents will look like

```sh
├── model
│   ├── 1
│   │   └── graph_sage_node_embedder.onnx
│   └── config.pbtxt
└── xgboost
    ├── 1
    │   └── xgboost_on_embeddings.json
    └── config.pbtxt

```


In [14]:

# List the contents of the model repository folder under trained_models
!tree {dataset_root_dir}/trained_models/model_repository

../data/TabFormer//trained_models/model_repository  [error opening dir]

0 directories, 0 files


----
# Step 3:  Serve your model on Triton Inference Server

Important: Change MODEL_REPO_PATH to point to the `model_repository` folder if you used a different path in your training configuration file.

#### Install tritonclient

In [15]:
!pip install tritonclient[all]

Collecting tritonclient[all]
  Downloading tritonclient-2.54.0-py3-none-manylinux1_x86_64.whl.metadata (2.8 kB)
Collecting python-rapidjson>=0.9.1 (from tritonclient[all])
  Downloading python_rapidjson-1.20-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting aiohttp<4.0.0,>=3.8.1 (from tritonclient[all])
  Downloading aiohttp-3.11.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting geventhttpclient>=2.3.3 (from tritonclient[all])
  Downloading geventhttpclient-2.3.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting grpcio<1.68,>=1.63.0 (from tritonclient[all])
  Downloading grpcio-1.67.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from tritonclient[all])
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting aiohappyeyeballs>=

In [16]:
import os
import time

import tritonclient.grpc as triton_grpc
import tritonclient.http as httpclient
from tritonclient import utils as triton_utils

In [17]:
# Deployment switch. When True, HOST is set to 'localhost' and the cells that
# start and inspect a local Triton container are executed.
# Set to False for remote/cloud deployment.
run_locally = True 

##### Replace HOST with the actual server URL where your Triton Inference Server is hosted.


In [18]:
# Address of the inference server: this machine for a local deployment,
# otherwise the placeholder below, which must be replaced with your server URL
# or IP address.
HOST = 'localhost' if run_locally else '<SERVER_URL>'

# Ports used for the HTTP and gRPC endpoints of the inference server
HTTP_PORT = 8000
GRPC_PORT = 8001

### If you are testing a local deployment
- Pull Triton inference server docker image
- Deploy server with  models and configuration files (produced by the training NIM)
- Double check that your model repository folder has the following structure
```sh
├── model
│   ├── 1
│   │   └── graph_sage_node_embedder.onnx
│   └── config.pbtxt
└── xgboost
    ├── 1
    │   └── xgboost_on_embeddings.json
    └── config.pbtxt
```

In [19]:
if run_locally:
    
    # Triton server image
    TRITON_IMAGE = 'nvcr.io/nvidia/tritonserver:25.01-py3'
    MODEL_REPO_PATH = os.path.join(dataset_root_dir, 'trained_models/model_repository')

    # Pull docker 
    !docker pull {TRITON_IMAGE}
    !docker stop tritonserver
    !docker rm tritonserver

    !docker run --gpus all -d -p {HTTP_PORT}:{HTTP_PORT} -p {GRPC_PORT}:{GRPC_PORT} -v {MODEL_REPO_PATH}:/models --name tritonserver {TRITON_IMAGE} tritonserver --model-repository=/models



permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock: Post "http://%2Fvar%2Frun%2Fdocker.sock/v1.47/images/create?fromImage=nvcr.io%2Fnvidia%2Ftritonserver&tag=25.01-py3": dial unix /var/run/docker.sock: connect: permission denied
permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock: Post "http://%2Fvar%2Frun%2Fdocker.sock/v1.47/containers/tritonserver/stop": dial unix /var/run/docker.sock: connect: permission denied
permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock: Delete "http://%2Fvar%2Frun%2Fdocker.sock/v1.47/containers/tritonserver": dial unix /var/run/docker.sock: connect: permission denied
docker: permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock: Head "http://%2Fvar%2Frun%2Fdocker.sock/_ping": dial unix /var/run/docker.sock: connect: permission denied.
See 'docker run --help'.


### URLs for GRPC and HTTP request to the inference server

In [20]:
# Both endpoints are addressed as "<host>:<port>"
grpc_url = f'{HOST}:{GRPC_PORT}'
http_url = f'{HOST}:{HTTP_PORT}'

# One client per protocol: gRPC and HTTP
client_grpc = triton_grpc.InferenceServerClient(url=grpc_url)
client_http = httpclient.InferenceServerClient(url=http_url)

### Wait for the triton inference server to come online
NOTE: If the following cell keeps running for too long, interrupt the execution and run it again.

In [21]:

# Maximum number of seconds to wait for the server to report readiness
TIMEOUT = 60
client_grpc = triton_grpc.InferenceServerClient(url=f'{HOST}:{GRPC_PORT}')
server_start = time.time()
server_ready = False

# The deadline is evaluated in the loop condition, outside the try block, so
# it is honored even when every readiness probe raises (e.g. while the server
# is still unreachable). Checking it after is_server_ready() inside the try
# would skip the check on each failed probe and the loop would never end.
while time.time() - server_start <= TIMEOUT:
    try:
        if client_grpc.is_server_ready():
            server_ready = True
            break
    except triton_utils.InferenceServerException:
        # The probe failed; keep polling until the deadline
        pass
    time.sleep(1)

# Fail loudly instead of letting later inference cells hit a server that
# never came up.
if not server_ready:
    raise TimeoutError(
        f'Triton inference server at {HOST}:{GRPC_PORT} was not ready after {TIMEOUT} seconds'
    )


KeyboardInterrupt: 

### For local deployment, check if the triton inference server is running properly

In [None]:
if run_locally:
    # Print the logs of the local "tritonserver" container
    !docker logs tritonserver

### Read preprocessed input transactions to make query to the triton inference server

In [None]:
import pandas as pd
import numpy as np

# Load the already preprocessed test transactions
test_path = os.path.join(dataset_root_dir, "xgb/test.csv")
test_df = pd.read_csv(test_path)

# Every column except the last one is sent to the model as float32; the last
# column is kept aside and used later as the reference labels for evaluation.
feature_frame = test_df.iloc[:, :-1]
label_series = test_df.iloc[:, -1]
X = feature_frame.values.astype(np.float32)
y = label_series.values

# Empty edge_index: two empty rows, stored as int64
edge_index = np.array([[], []], dtype=np.int64)

### Setup the HTTP request's inputs and output to retrieve embeddings for the input transactions

In [None]:
# Declare the two request inputs: the feature matrix "x" (FP32) and the
# "edge_index" array (INT64), each shaped like its NumPy source.
input_features = httpclient.InferInput("x", X.shape, datatype="FP32")
input_edge_indices = httpclient.InferInput("edge_index", edge_index.shape, datatype="INT64")

# Attach the actual data to each declared input
input_features.set_data_from_numpy(X)
input_edge_indices.set_data_from_numpy(edge_index)

# Request the tensor named "output" in the response
outputs = httpclient.InferRequestedOutput("output")

### Send a query to retrieve embeddings

In [None]:
# Send both inputs to the served model named "model"
results = client_http.infer(
    model_name="model",
    inputs=[input_features, input_edge_indices],
    outputs=[outputs],
)

# Read the "output" tensor of the response as a NumPy array
node_embeddings = results.as_numpy('output')


### Use the retrieved embeddings as inputs to predict the transactions' fraud scores

In [None]:
# Request the tensor named "output__0" from the xgboost model
xgboost_outputs = httpclient.InferRequestedOutput("output__0")

# The embeddings retrieved above become the FP32 input "input__0"
xgboost_input = httpclient.InferInput("input__0", node_embeddings.shape, datatype="FP32")
xgboost_input.set_data_from_numpy(node_embeddings)

### Send a query to retrieve the fraud scores

In [None]:
# Send the embeddings to the served model named "xgboost"
results = client_http.infer(
    model_name="xgboost",
    inputs=[xgboost_input],
    outputs=[xgboost_outputs],
)

# Read the "output__0" tensor of the response as a NumPy array
predictions = results.as_numpy('output__0')

### Evaluate performance

In [None]:
# Decision threshold to flag a transaction as fraud: predictions strictly
# greater than this value are labelled as fraud in the evaluation cell.
# Change it to trade off precision against recall.
decision_threshold = 0.5

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Turn the predictions into 0/1 decisions: 1 where the value is strictly above
# the decision threshold, 0 otherwise
y_pred = (predictions > decision_threshold).astype(int)

# Evaluation metrics against the reference labels; zero_division=0 makes a
# metric with a zero denominator evaluate to 0
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred, zero_division=0)
recall = recall_score(y, y_pred, zero_division=0)
f1 = f1_score(y, y_pred, zero_division=0)

# Report each metric with four decimal places
print("----Summary---")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")


### Compute confusion matrix 

In [None]:
import pandas as pd

# Human-readable class names for the rows and columns.
# NOTE(review): assumes the labels sort as Non-Fraud before Fraud (e.g. 0/1) — confirm.
classes = ['Non-Fraud', 'Fraud']

# Two-level labels: rows are "Actual" classes, columns are "Predicted" classes
index = pd.MultiIndex.from_product([["Actual"], classes])
columns = pd.MultiIndex.from_product([["Predicted"], classes])

# Wrap the raw confusion matrix in a labeled DataFrame and print it
conf_mat = confusion_matrix(y, y_pred)
cm_df = pd.DataFrame(data=conf_mat, index=index, columns=columns)
print(cm_df)

### Plot confusion matrix

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Build the confusion-matrix plot straight from the labels and predictions,
# using the class names as tick labels
disp = ConfusionMatrixDisplay.from_predictions(y, y_pred, display_labels=classes)

# Title the axes and render the figure
disp.ax_.set_title('Confusion Matrix')
plt.show()