# BentoML Demo - IEEE-CIS Fraud Detection

Accept dataset rules on Kaggle before downloading: https://www.kaggle.com/competitions/ieee-fraud-detection/data

In [1]:
# Set Kaggle Credentials for downloading dataset
%env KAGGLE_USERNAME=s3sheng
%env KAGGLE_KEY=0e3966223300cd8314f8ce78b2d56058

env: KAGGLE_USERNAME=s3sheng
env: KAGGLE_KEY=0e3966223300cd8314f8ce78b2d56058


In [2]:
!kaggle competitions download -c ieee-fraud-detection
!rm -rf ./data/
!unzip -d ./data/ ieee-fraud-detection.zip && rm ieee-fraud-detection.zip

Downloading ieee-fraud-detection.zip to /Users/ssheng/github/BentoML/examples/fraud_detection
100%|███████████████████████████████████████▉| 118M/118M [00:36<00:00, 3.48MB/s]
100%|████████████████████████████████████████| 118M/118M [00:36<00:00, 3.42MB/s]
Archive:  ieee-fraud-detection.zip
  inflating: ./data/sample_submission.csv  
  inflating: ./data/test_identity.csv  
  inflating: ./data/test_transaction.csv  
  inflating: ./data/train_identity.csv  
  inflating: ./data/train_transaction.csv  


In [2]:
import pandas as pd
import numpy as np

df_transactions = pd.read_csv("./data/train_transaction.csv")

X = df_transactions.drop(columns=["isFraud"])
y = df_transactions.isFraud

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    OrdinalEncoder,
)
from sklearn.feature_selection import SelectPercentile, chi2

numeric_features = df_transactions.select_dtypes(include="float64").columns
categorical_features = df_transactions.select_dtypes(include="object").columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numeric_features),
        (
            "cat",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            categorical_features,
        ),
    ],
    verbose_feature_names_out=False,
    remainder="passthrough",
)
# preprocessor.set_output(transform="pandas")

In [6]:
X = preprocessor.fit_transform(X)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
import xgboost as xgb


def train(n_estimators, max_depth):
    return xgb.XGBClassifier(
        tree_method="hist",
        n_estimators=n_estimators,
        max_depth=max_depth,
        eval_metric="aucpr",
        objective="binary:logistic",
        enable_categorical=True,
    ).fit(X_train, y_train, eval_set=[(X_test, y_test)])

In [9]:
# small model with 300 gradient boosted trees and a maximum tree depth of 5
model_sm = train(10, 5)

[0]	validation_0-aucpr:0.35748
[1]	validation_0-aucpr:0.39296
[2]	validation_0-aucpr:0.42478
[3]	validation_0-aucpr:0.43739
[4]	validation_0-aucpr:0.44986
[5]	validation_0-aucpr:0.45934
[6]	validation_0-aucpr:0.47784
[7]	validation_0-aucpr:0.48783
[8]	validation_0-aucpr:0.49259
[9]	validation_0-aucpr:0.49932


In [10]:
import bentoml

bentoml.xgboost.save_model(
    "ieee-fraud-detection-sm",
    model_sm,
    signatures={
        "predict_proba": {"batchable": True},
    },
    custom_objects={"preprocessor": preprocessor},
)

Model(tag="ieee-fraud-detection-sm:ovtzbdgbg6esccvj", path="/Users/ssheng/bentoml/models/ieee-fraud-detection-sm/ovtzbdgbg6esccvj/")

In [11]:
model_ref = bentoml.xgboost.get("ieee-fraud-detection-sm:latest")
model_ref

Model(tag="ieee-fraud-detection-sm:ovtzbdgbg6esccvj", path="/Users/ssheng/bentoml/models/ieee-fraud-detection-sm/ovtzbdgbg6esccvj")

In [12]:
import bentoml
import pandas as pd
import numpy as np

model_ref = bentoml.xgboost.get("ieee-fraud-detection-sm:latest")
model_runner = model_ref.to_runner()
model_runner.init_local()
model_preprocessor = model_ref.custom_objects["preprocessor"]

test_transactions = pd.read_csv("./data/test_transaction.csv")[0:500]
test_transactions = model_preprocessor.transform(test_transactions)
result = model_runner.predict_proba.run(test_transactions)
np.argmax(result, axis=1)

'Runner.init_local' is for debugging and testing only. Make sure to remove it before deploying to production.


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

For the Inference Graph demo, let's train two additional models by tweaking the parameters:

In [None]:
# large model with 3000 gradient boosted trees and a maximum tree depth of 15
model_lg = train(3000, 15)

In [None]:
import bentoml

bentoml.xgboost.save_model(
    "ieee-fraud-detection-lg",
    model_lg,
    signatures={
        "predict_proba": {"batchable": True},
    },
    custom_objects={"preprocessor": preprocessor},
)

In [None]:
# tiny model with 300 gradient boosted trees and a maximum tree depth of 5
model_tiny = train(100, 3)

In [None]:
import bentoml

bentoml.xgboost.save_model(
    "ieee-fraud-detection-tiny",
    model_tiny,
    signatures={
        "predict_proba": {"batchable": True},
    },
    custom_objects={"preprocessor": preprocessor},
)