In [1]:
import json
import kfp
import requests
import pandas as pd
from tensorflow import keras


%load_ext lab_black

# Fraud Detection Inference with ibmi db

In [2]:
MODEL_NAME = "fraud-detection-3d42b"
with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f:
    NAMESPACE = f.read()

print(MODEL_NAME)
print(NAMESPACE)

fraud-detection-3d42b
user-example-com


In [3]:
HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
MODEL_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/model"

res_svc = requests.get(MODEL_ENDPOINT, headers=HEADERS)
response_svc = json.loads(res_svc.text)
response_svc

{'name': 'model',
 'versions': ['1'],
 'platform': 'onnxruntime_onnx',
 'inputs': [{'name': 'input_1', 'datatype': 'FP32', 'shape': [-1, 4, 103]}],
 'outputs': [{'name': 'dense', 'datatype': 'FP32', 'shape': [-1, 1]}]}

In [94]:
def get_data_table(rows: int):
    import pandas as pd
    from trino.dbapi import Connection

    with Connection(
        host="trino.trino",
        port="8080",
        user="anybody",
        catalog="jtopen",
        schema="demo",
    ) as conn:
        link = conn.cursor()
        link.execute(f"SELECT * FROM fraud LIMIT {rows}")
        return pd.DataFrame(link.fetchall(), columns=[i.name for i in link.description])


rdf = get_data_table(1000000)
print(f"Retrieved {len(rdf)} rows")
rdf.head()

Retrieved 1000000 rows


Unnamed: 0,user,card,year,month,day,time,amount,use chip,merchant name,merchant city,merchant state,zip,mcc,errors?,is fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750,5300,...,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754,5411,...,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754,5411,...,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754,5651,...,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750,5912,...,No


In [78]:
len(rdf["merchant name"].unique())

10453

In [76]:
le = LabelEncoder()
le.fit(rdf["merchant name"].astype(str))
len(le.classes_)

10453

In [91]:
import math
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    FunctionTransformer,
    MinMaxScaler,
    LabelBinarizer,
)
from sklearn_pandas import DataFrameMapper


def save_to_dir(x, y, directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
    np.savez(os.path.join(directory, "data.npz"), x=x, y=y)


def split_dataset(n, df):
    test = df.iloc[:n, :]
    train = df.iloc[n:, :]
    return test, train


def merge_splits(frauds, non_frauds, split):
    print(
        f"{split} ratio fraud ({len(frauds)}) / non-fraud ({len(non_frauds)}):",
        len(frauds) / len(non_frauds),
    )
    df = pd.concat([frauds, non_frauds])
    df.sort_values("year_month_day_time", inplace=True)

    x, y = df.drop(["is fraud?"], axis=1), df["is fraud?"]
    min_ind = math.floor(len(x) / 128)
    x, y = x[-min_ind * 128 :], y[-min_ind * 128 :]
    y = y.astype("int")
    return x, y


def timeEncoder(X):
    X_hm = X["time"].str.split(":", expand=True)
    d = pd.to_datetime(
        dict(
            year=X["year"],
            month=X["month"],
            day=X["day"],
            hour=X_hm[0],
            minute=X_hm[1],
        )
    ).astype(int)
    return pd.DataFrame(d)


def amtEncoder(X):
    amt = (
        X.apply(lambda x: x[1:])
        .astype(float)
        .map(lambda amt: max(1, amt))
        .map(math.log)
    )
    return pd.DataFrame(amt)


def decimalEncoder(X, length=5):
    dnew = pd.DataFrame()
    for i in range(length):
        dnew[i] = np.mod(X, 10)
        X = np.floor_divide(X, 10)
    return dnew


def fraudEncoder(X):
    return np.where(X == "Yes", 1, 0).astype(int)


# df_nf = pd.read_csv(f"{os.getenv('HOME')}/card_transactions_non-frauds.csv")
# df_f = pd.read_csv(f"{os.getenv('HOME')}/card_transactions_frauds.csv")
# tdf = pd.concat([df_nf, df_f])
print("read in raw data")
tdf = rdf.copy()
tdf.columns = map(str.lower, tdf.columns)
tdf["merchant name"] = tdf["merchant name"].astype(str)
tdf.drop(["mcc", "zip", "merchant state"], axis=1, inplace=True)
tdf.sort_values(by=["user", "card"], inplace=True)
tdf.reset_index(inplace=True, drop=True)

test_encoders = {
    "merchant_name": LabelEncoder().fit(tdf["merchant name"]),
    "merchant_city": LabelEncoder().fit(tdf["merchant city"]),
}

test_mapper = DataFrameMapper(
    [
        ("is fraud?", FunctionTransformer(fraudEncoder)),
        (
            "merchant name",
            [
                test_encoders["merchant_name"],
                FunctionTransformer(decimalEncoder),
                OneHotEncoder(handle_unknown="ignore"),
            ],
        ),
        (
            "merchant city",
            [
                test_encoders["merchant_city"],
                FunctionTransformer(decimalEncoder),
                OneHotEncoder(handle_unknown="ignore"),
            ],
        ),
        (["use chip"], [SimpleImputer(strategy="constant"), LabelBinarizer()]),
        (["errors?"], [SimpleImputer(strategy="constant"), LabelBinarizer()]),
        (
            ["year", "month", "day", "time"],
            [FunctionTransformer(timeEncoder), MinMaxScaler()],
        ),
        ("amount", [FunctionTransformer(amtEncoder), MinMaxScaler()]),
    ],
    input_df=True,
    df_out=True,
)
print("fit and transform dataframe")
test_mapper.fit(tdf)b
tdf = test_mapper.transform(tdf)
print("done")

read in raw data
fit and transform dataframe
done


In [101]:
with open("mapper.pkl", "wb") as f:
    pickle.dump(test_mapper, f)

In [62]:
tdf.head()

Unnamed: 0,is fraud?,merchant name_0,merchant name_1,merchant name_2,merchant name_3,merchant name_4,merchant name_5,merchant name_6,merchant name_7,merchant name_8,...,"errors?_Bad Expiration,Technical Glitch,","errors?_Bad PIN,","errors?_Bad PIN,Insufficient Balance,","errors?_Bad PIN,Technical Glitch,","errors?_Bad Zipcode,","errors?_Insufficient Balance,","errors?_Insufficient Balance,Technical Glitch,","errors?_Technical Glitch,",year_month_day_time,amount
0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.268375,0.554906
1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.268377,0.41349
2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.26849,0.54265
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0.268544,0.550478
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.268604,0.52689


In [63]:
list(tdf.columns)

['is fraud?',
 'merchant name_0',
 'merchant name_1',
 'merchant name_2',
 'merchant name_3',
 'merchant name_4',
 'merchant name_5',
 'merchant name_6',
 'merchant name_7',
 'merchant name_8',
 'merchant name_9',
 'merchant name_10',
 'merchant name_11',
 'merchant name_12',
 'merchant name_13',
 'merchant name_14',
 'merchant name_15',
 'merchant name_16',
 'merchant name_17',
 'merchant name_18',
 'merchant name_19',
 'merchant name_20',
 'merchant name_21',
 'merchant name_22',
 'merchant name_23',
 'merchant name_24',
 'merchant name_25',
 'merchant name_26',
 'merchant name_27',
 'merchant name_28',
 'merchant name_29',
 'merchant name_30',
 'merchant name_31',
 'merchant name_32',
 'merchant name_33',
 'merchant name_34',
 'merchant name_35',
 'merchant name_36',
 'merchant name_37',
 'merchant name_38',
 'merchant name_39',
 'merchant name_40',
 'merchant name_41',
 'merchant city_0',
 'merchant city_1',
 'merchant city_2',
 'merchant city_3',
 'merchant city_4',
 'merchant cit

In [81]:
len(test_encoders["merchant_name"].classes_)

10453

In [19]:
import boto3
import botocore
import pickle
import os
from minio import Minio
from minio.error import S3Error

import requests
from requests.auth import HTTPBasicAuth
import urllib3

encoder_artifact = {
    "name": "preprocess-dataset-dataset_encoder_dir",
    "path": "/tmp/outputs/dataset_encoder_dir/data",
    "s3": {
        "key": "artifacts/fraud-detection-7jpgd/2023/11/28/fraud-detection-7jpgd-2708517467/preprocess-dataset-dataset_encoder_dir.tgz"
    },
}

# MinIO configuration
minio_endpoint = "172.30.131.125:9000"  # e.g., 'localhost:9000'
minio_access_key = "minio"
minio_secret_key = "minio123"

# File details
bucket_name = "mlpipeline"
object_name = "fraud-detection-7jpgd/2023/11/28/fraud-detection-7jpgd-2708517467/preprocess-dataset-dataset_encoder_dir.tgz"
local_file_path = os.path.abspath("preprocess-dataset-dataset_encoder_dir.tgz")

# Create a MinIO client with the endpoint and access keys
minio_client = Minio(
    minio_endpoint,
    access_key=minio_access_key,
    secret_key=minio_secret_key,
    secure=False,  # Set to False if the MinIO server doesn't support HTTPS
)

try:
    # Get the object and write it to the local file path
    minio_client.fget_object(
        bucket_name, encoder_artifact["s3"]["key"], local_file_path
    )
    print(f"File downloaded successfully to {local_file_path}")
except Exception as exc:
    print(f"An error occurred: {exc}")

File downloaded successfully to /home/jovyan/kubeflow-ppc64le-sandbox/notebooks/preprocess-dataset-dataset_encoder_dir.tgz


In [28]:
import tarfile

encoders = {}
with tarfile.open(local_file_path, "r:gz") as tar:
    tar.extractall("encoders")

with open(os.path.join("encoders", "data", "encoders"), "rb") as f:
    encoders = pickle.load(f)

encoders

{'merchant_name': LabelEncoder(), 'merchant_city': LabelEncoder()}

In [45]:
encoders["merchant_name"].inverse_transform(list(encoders['merchant_name'].classes_))

  mask &= (ar1 != a)


ValueError: y contains previously unseen labels: ['-34551508091458520' '-4500542936415012428' '-5475680618560174533'
 '-6733168469687845480' '-7146670748125200898' '-727612092139916043'
 '-9092677072201095172' '2027553650310142703' '3414527459579106770'
 '3527213246127876953' '4055257078481058705' '4060646732831064559'
 '5817218446178736267']

In [82]:
import math
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    FunctionTransformer,
    MinMaxScaler,
    LabelBinarizer,
)
from sklearn_pandas import DataFrameMapper
from encoders import fraudEncoder, amtEncoder, decimalEncoder, timeEncoder


class FraudDatasetTransformer:
    def __init__(self):
        ...

    #     def timeEncoder(self, X):
    #         X_hm = X["time"].str.split(":", expand=True)
    #         d = pd.to_datetime(
    #             dict(
    #                 year=X["year"],
    #                 month=X["month"],
    #                 day=X["day"],
    #                 hour=X_hm[0],
    #                 minute=X_hm[1],
    #             )
    #         ).astype(int)
    #         return pd.DataFrame(d)

    #     def amtEncoder(self, X):
    #         amt = (
    #             X.apply(lambda x: x[1:])
    #             .astype(float)
    #             .map(lambda amt: max(1, amt))
    #             .map(math.log)
    #         )
    #         return pd.DataFrame(amt)

    #     def decimalEncoder(self, X, length=5):
    #         dnew = pd.DataFrame()
    #         for i in range(length):
    #             dnew[i] = np.mod(X, 10)
    #             X = np.floor_divide(X, 10)
    #         return dnew

    #     def fraudEncoder(self, X):
    #         return np.where(X == "Yes", 1, 0).astype(int)

    def transform(self, dataset: pd.DataFrame, encoders: dict):
        tdf = dataset.copy()
        tdf["merchant name"] = tdf["merchant name"].astype(str)
        tdf.drop(["mcc", "zip", "merchant state"], axis=1, inplace=True)
        tdf.sort_values(by=["user", "card"], inplace=True)
        tdf.reset_index(inplace=True, drop=True)

        mapper = DataFrameMapper(
            [
                ("is fraud?", FunctionTransformer(fraudEncoder)),
                (
                    "merchant name",
                    [
                        encoders["merchant_name"],
                        FunctionTransformer(decimalEncoder),
                        OneHotEncoder(),
                    ],
                ),
                (
                    "merchant city",
                    [
                        encoders["merchant_city"],
                        FunctionTransformer(decimalEncoder),
                        OneHotEncoder(),
                    ],
                ),
                (["use chip"], [SimpleImputer(strategy="constant"), LabelBinarizer()]),
                (["errors?"], [SimpleImputer(strategy="constant"), LabelBinarizer()]),
                (
                    ["year", "month", "day", "time"],
                    [FunctionTransformer(timeEncoder), MinMaxScaler()],
                ),
                ("amount", [FunctionTransformer(amtEncoder), MinMaxScaler()]),
            ],
            input_df=True,
            df_out=True,
        )
        mapper.fit(tdf)
        tdf = mapper.transform(tdf)
        return tdf

In [97]:
for c in test_encoders["merchant_name"].classes_:
    if c == "3527213246127876953":
        print("found")

found


In [98]:
dataset_transformer = FraudDatasetTransformer()

test_data = get_data_table(1)


def transform_df(tdf):
    tdf["merchant name"] = tdf["merchant name"].astype(str)
    tdf.drop(["mcc", "zip", "merchant state"], axis=1, inplace=True)
    tdf.sort_values(by=["user", "card"], inplace=True)
    tdf.reset_index(inplace=True, drop=True)


transform_df(test_data)


# vdf = dataset_transformer.transform(test_data, test_encoders)
test_transformed = test_mapper.transform(test_data)
test_transformed.head()

Unnamed: 0,is fraud?,merchant name_0,merchant name_1,merchant name_2,merchant name_3,merchant name_4,merchant name_5,merchant name_6,merchant name_7,merchant name_8,...,"errors?_Bad Expiration,Technical Glitch,","errors?_Bad PIN,","errors?_Bad PIN,Insufficient Balance,","errors?_Bad PIN,Technical Glitch,","errors?_Bad Zipcode,","errors?_Insufficient Balance,","errors?_Insufficient Balance,Technical Glitch,","errors?_Technical Glitch,",year_month_day_time,amount
0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.268375,0.554906


In [31]:
x, y = vdf.drop(vdf.columns.values[0], axis=1).to_numpy(), vdf[
    vdf.columns.values[0]
].to_numpy().reshape(len(vdf), 1)

dataset = keras.preprocessing.timeseries_dataset_from_array(
    x, y, sequence_length=response_svc["inputs"][0]["shape"][1], batch_size=128
)
dataset

2023-11-27 17:01:20.467748: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-11-27 17:01:20.467785: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-11-27 17:01:20.467854: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist


<BatchDataset element_spec=(TensorSpec(shape=(None, None, 27), dtype=tf.float64, name=None), TensorSpec(shape=(None, 1), dtype=tf.int64, name=None))>

In [48]:
PREDICT_ENDPOINT = MODEL_ENDPOINT + "/infer"

for batch in dataset.take(10):
    input_d, output_d = batch[0], batch[1]
    for in_x, out_y in zip(input_d, output_d):
        payload = {
            "inputs": [
                {
                    "name": response_svc["inputs"][0]["name"],
                    "shape": [
                        1,
                        4,
                        27,
                    ],  # has to match response_svc["inputs"][0]["shape"] (except for 1. dimension)
                    "datatype": response_svc["inputs"][0]["datatype"],
                    "data": in_x.numpy().tolist(),
                }
            ]
        }
        res = requests.post(PREDICT_ENDPOINT, headers=HEADERS, data=json.dumps(payload))
        response = json.loads(res.text)
        print(response)
        break

{'error': "unexpected shape for input 'input_1' for model 'model'. Expected [-1,4,103], got [1,4,27]"}


In [47]:
in_x.numpy().shape

(4, 27)