# BentoML Demo - IEEE-CIS Fraud Detection

Accept the competition rules on Kaggle before downloading the dataset: https://www.kaggle.com/competitions/ieee-fraud-detection/data

In [None]:
# Set Kaggle credentials for downloading the dataset.
# Credentials are never written into the notebook: values already exported as
# KAGGLE_USERNAME / KAGGLE_KEY are reused, and anything missing is prompted for.
import os
from getpass import getpass

if not os.environ.get("KAGGLE_USERNAME"):
    os.environ["KAGGLE_USERNAME"] = input("Kaggle username: ")
if not os.environ.get("KAGGLE_KEY"):
    # getpass does not echo the key, so it stays out of the saved cell output.
    os.environ["KAGGLE_KEY"] = getpass("Kaggle API key: ")

In [None]:
# Download the competition archive (ieee-fraud-detection.zip) with the Kaggle CLI.
!kaggle competitions download -c ieee-fraud-detection
# Extract the archive into ./data/; the zip is removed only if unzip succeeds (&&).
!unzip -d ./data/ ieee-fraud-detection.zip && rm ieee-fraud-detection.zip

In [None]:
import pandas as pd
import numpy as np

# Load the labelled training transactions, then separate the target column
# (isFraud) from the feature columns.
df_transactions = pd.read_csv("./data/train_transaction.csv")

y = df_transactions["isFraud"]
X = df_transactions.drop(columns="isFraud")

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.feature_selection import SelectPercentile, chi2

# Column groups are picked by dtype: float64 columns get imputed,
# object (string) columns get ordinal-encoded.
numeric_features = df_transactions.select_dtypes(include="float64").columns
categorical_features = df_transactions.select_dtypes(include="object").columns

preprocessor = ColumnTransformer(
    [
        # Replace missing numeric values with the column median.
        ("num", SimpleImputer(strategy="median"), numeric_features),
        # Map each category to an integer code; categories unseen at fit time become -1.
        (
            "cat",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            categorical_features,
        ),
    ],
    remainder="passthrough",  # columns in neither group are kept unchanged
    verbose_feature_names_out=False,  # do not prefix output names with "num"/"cat"
)
# Make transform / fit_transform return DataFrames rather than NumPy arrays.
preprocessor.set_output(transform="pandas")

In [None]:
# Fit the preprocessor on the feature frame and replace X with the transformed DataFrame.
# NOTE(review): X is overwritten with its own transform, so re-running this cell on its
# own refits the preprocessor on already-transformed data; re-run the loading cell first.
# NOTE(review): fitting happens before the train/test split, so the imputation medians
# and category codes are learned from rows that later end up in the test split.
X = preprocessor.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test split (scikit-learn's default size of 25%).
# random_state pins the shuffle so Restart & Run All reproduces the same split;
# stratify=y keeps the (rare) fraud rate approximately equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)

In [None]:
import xgboost as xgb

def train(n_estimators, max_depth):
    """Fit an XGBoost binary classifier on the training split.

    Reads ``X_train``/``y_train`` and ``X_test``/``y_test`` from the notebook
    namespace. The test split is passed as ``eval_set`` and is scored with the
    ``aucpr`` metric while fitting.

    Args:
        n_estimators: Number of gradient boosted trees.
        max_depth: Maximum depth of each tree.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="aucpr",
        tree_method="hist",
        enable_categorical=True,
        n_estimators=n_estimators,
        max_depth=max_depth,
    )
    classifier.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    return classifier

In [None]:
# Small model: 300 gradient boosted trees, maximum tree depth 5.
model_sm = train(n_estimators=300, max_depth=5)

In [None]:
import bentoml
# Save the small model with BentoML under the name "ieee-fraud-detection-sm".
# predict_proba is registered as a batchable signature, and the fitted preprocessor
# is attached as a custom object so it can be retrieved from the saved model later.
bentoml.xgboost.save_model(
    "ieee-fraud-detection-sm",
    model_sm,
    signatures={"predict_proba": {"batchable": True}},
    custom_objects={"preprocessor": preprocessor},
)

In [None]:
import bentoml
import pandas as pd
import numpy as np

# Smoke-test the saved small model: load it back from BentoML together with the
# preprocessor that was stored alongside it.
model_ref = bentoml.xgboost.get("ieee-fraud-detection-sm:latest")
model_runner = model_ref.to_runner()
model_runner.init_local()  # initialise the runner inside this kernel process
model_preprocessor = model_ref.custom_objects['preprocessor']

# Only the first 500 rows are scored, so let read_csv stop after 500 rows
# instead of parsing the entire test file and slicing afterwards.
test_transactions = pd.read_csv("./data/test_transaction.csv", nrows=500)
test_transactions = model_preprocessor.transform(test_transactions)
result = model_runner.predict_proba.run(test_transactions)
# Index of the highest-probability column per row, i.e. the predicted class.
np.argmax(result, axis=1)

In [None]:
# Large model: 3000 gradient boosted trees, maximum tree depth 15.
model_lg = train(n_estimators=3000, max_depth=15)

In [None]:
import bentoml
# Save the large model with BentoML under the name "ieee-fraud-detection-lg".
# predict_proba is registered as a batchable signature, and the fitted preprocessor
# is attached as a custom object so it can be retrieved from the saved model later.
bentoml.xgboost.save_model(
    "ieee-fraud-detection-lg",
    model_lg,
    signatures={"predict_proba": {"batchable": True}},
    custom_objects={"preprocessor": preprocessor},
)

In [None]:
import bentoml
import pandas as pd
import numpy as np

# Smoke-test the saved large model: load it back from BentoML together with the
# preprocessor that was stored alongside it.
model_ref = bentoml.xgboost.get("ieee-fraud-detection-lg:latest")
model_runner = model_ref.to_runner()
model_runner.init_local()  # initialise the runner inside this kernel process
model_preprocessor = model_ref.custom_objects['preprocessor']

# Only the first 500 rows are scored, so let read_csv stop after 500 rows
# instead of parsing the entire test file and slicing afterwards.
test_transactions = pd.read_csv("./data/test_transaction.csv", nrows=500)
test_transactions = model_preprocessor.transform(test_transactions)
result = model_runner.predict_proba.run(test_transactions)
# Index of the highest-probability column per row, i.e. the predicted class.
np.argmax(result, axis=1)