In [12]:
import json
import kfp
from kfp.components import InputPath, OutputPath
import kfp.dsl as dsl
from kfp.dsl import PipelineConf, data_passing_methods
from kubernetes.client.models import V1Volume, V1PersistentVolumeClaimVolumeSource
import os
import requests
from requests import post
from tensorflow import keras
import pandas as pd


%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [1]:
def get_data_table(rows: int):
    """Fetch up to ``rows`` records from the ``fraud`` table as a DataFrame.

    Connects to Trino at ``trino.trino:8080`` (catalog ``jtopen``, schema
    ``demo``) as user ``anybody``, runs ``SELECT * FROM fraud LIMIT <rows>``
    and labels the resulting columns with the names from the cursor
    description.

    Args:
        rows: Maximum number of rows to fetch; must be a non-negative integer.

    Returns:
        A ``pandas.DataFrame`` holding the fetched rows.

    Raises:
        TypeError: If ``rows`` cannot be interpreted as an integer.
        ValueError: If ``rows`` is negative.
    """
    import operator

    # The limit is interpolated into the SQL text, so make sure it really is an
    # integer before anything is sent to the server. Validation happens first
    # so that bad input fails fast, without opening a connection.
    limit = operator.index(rows)
    if limit < 0:
        raise ValueError(f"rows must be non-negative, got {limit}")

    import pandas as pd
    from trino.dbapi import Connection

    with Connection(
        host="trino.trino",
        port="8080",
        user="anybody",
        catalog="jtopen",
        schema="demo",
    ) as conn:
        link = conn.cursor()
        link.execute(f"SELECT * FROM fraud LIMIT {limit}")
        return pd.DataFrame(link.fetchall(), columns=[i.name for i in link.description])


# Pull up to one million rows of the fraud table, report how many came back,
# and preview the first rows as the cell output.
rdf = get_data_table(1000000)
print(f"Retrieved {len(rdf)} rows")
rdf.head()

Retrieved 1000000 rows


Unnamed: 0,user,card,year,month,day,time,amount,use chip,merchant name,merchant city,merchant state,zip,mcc,errors?,is fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750,5300,...,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754,5411,...,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754,5411,...,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754,5651,...,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750,5912,...,No


In [5]:
import pickle
import os
from minio import Minio
from minio.error import S3Error

import requests
from requests.auth import HTTPBasicAuth
import urllib3

# Descriptor of the "preprocess-dataset-dataset_encoder_dir" pipeline artifact;
# only the S3 object key is used by the download below.
encoder_artifact = {
    "name": "preprocess-dataset-dataset_encoder_dir",
    "path": "/tmp/outputs/dataset_encoder_dir/data",
    "s3": {
        "key": "artifacts/fraud-detection-bvfhf/2023/11/29/fraud-detection-bvfhf-1498492315/preprocess-dataset-dataset_encoder_dir.tgz"
    },
}

# MinIO configuration. Each value can be overridden through an environment
# variable so that endpoint and credentials do not have to be edited (or
# committed) in the notebook; the fallbacks keep the previous sandbox values.
minio_endpoint = os.environ.get(
    "MINIO_ENDPOINT", "172.30.131.125:9000"
)  # e.g., 'localhost:9000'
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio123")

# File details
bucket_name = "mlpipeline"
local_file_path = os.path.abspath("preprocess-dataset-dataset_encoder_dir.tgz")

# Create a MinIO client with the endpoint and access keys
minio_client = Minio(
    minio_endpoint,
    access_key=minio_access_key,
    secret_key=minio_secret_key,
    secure=False,  # Set to False if the MinIO server doesn't support HTTPS
)

try:
    # Get the object and write it to the local file path
    minio_client.fget_object(
        bucket_name, encoder_artifact["s3"]["key"], local_file_path
    )
except Exception as exc:
    print(f"An error occurred: {exc}")
    # Later cells need the downloaded archive, so stop here instead of letting
    # them fail with a less helpful "file not found" error.
    raise
else:
    print(f"File downloaded successfully to {local_file_path}")

File downloaded successfully to /home/jovyan/kubeflow-ppc64le-sandbox/notebooks/preprocess-dataset-dataset_encoder_dir.tgz


In [7]:
import tarfile
import dill

# Unpack the downloaded encoder archive into ./encoders without letting any
# archive member write outside of that directory.
with tarfile.open(local_file_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: the "data" filter rejects absolute
        # paths, links pointing outside the destination and special files.
        tar.extractall("encoders", filter="data")
    else:
        # Older Python without extraction filters: verify every member path
        # stays below the destination before extracting anything.
        dest_root = os.path.realpath("encoders")
        for member in tar.getmembers():
            member_path = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, member_path]) != dest_root:
                raise ValueError(
                    f"Refusing to extract {member.name!r}: path escapes {dest_root}"
                )
        tar.extractall("encoders")

# NOTE(review): dill.load can execute arbitrary code embedded in the file; only
# load mapper artifacts that come from a trusted artifact store.
with open(os.path.join("encoders", "data", "mapper.pkl"), "rb") as f:
    t_mapper = dill.load(f)

# Display the loaded mapper as the cell output.
t_mapper

In [8]:
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    FunctionTransformer,
    MinMaxScaler,
    LabelBinarizer,
)
from sklearn_pandas import DataFrameMapper
from encoders import fraudEncoder, amtEncoder, decimalEncoder, timeEncoder


class FraudDatasetTransformer:
    """Prepares raw fraud-table rows and feeds them through a mapper."""

    def __init__(self):
        # Stateless helper: nothing to initialise.
        pass

    def transform(self, dataset: pd.DataFrame, mapper: DataFrameMapper):
        """Clean ``dataset`` and return ``mapper.transform`` of the result.

        The input frame is left untouched. On a fresh frame the method casts
        "merchant name" to ``str``, drops the "mcc", "zip" and
        "merchant state" columns, orders rows by "user" then "card" and
        renumbers the index from zero before handing the frame to ``mapper``.

        Args:
            dataset: Raw rows containing at least the columns named above.
            mapper: Object exposing ``transform(frame)``.

        Returns:
            Whatever ``mapper.transform`` returns for the prepared frame.
        """
        prepared = (
            dataset.assign(**{"merchant name": dataset["merchant name"].astype(str)})
            .drop(columns=["mcc", "zip", "merchant state"])
            .sort_values(by=["user", "card"])
            .reset_index(drop=True)
        )
        return mapper.transform(prepared)

In [9]:
# Build the transformer that applies the row clean-up and the mapper.
dataset_transformer = FraudDatasetTransformer()

# Fetch a small sample (at most 20 rows) of the fraud table.
test_data = get_data_table(20)

# Run the sample through the clean-up steps and the mapper loaded from the
# encoder archive, then preview the encoded frame as the cell output.
vdf = dataset_transformer.transform(test_data, t_mapper)
vdf.head()

Unnamed: 0,is fraud?,merchant name_0,merchant name_1,merchant name_2,merchant name_3,merchant name_4,merchant name_5,merchant name_6,merchant name_7,merchant name_8,...,"errors?_Bad Expiration,Technical Glitch,","errors?_Bad PIN,","errors?_Bad PIN,Insufficient Balance,","errors?_Bad PIN,Technical Glitch,","errors?_Bad Zipcode,","errors?_Insufficient Balance,","errors?_Insufficient Balance,Technical Glitch,","errors?_Technical Glitch,",year_month_day_time,amount
0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.268375,0.554906
1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.268377,0.41349
2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.26849,0.54265
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0.268544,0.550478
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.268604,0.52689


In [45]:
# The request goes to the predictor's service name, while the "Host" header
# carries the namespace-qualified host name.
# NOTE(review): MODEL_NAME and NAMESPACE are not defined in the cells visible
# here; they must be set before this cell runs - confirm on a fresh kernel.
HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
MODEL_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/model"

# Ask the server for the model metadata (name, inputs, outputs).
res_svc = requests.get(MODEL_ENDPOINT, headers=HEADERS)
# Fail here on an HTTP error instead of parsing an error body as metadata and
# hitting a confusing KeyError in a later cell.
res_svc.raise_for_status()
response_svc = res_svc.json()
response_svc

{'name': 'model',
 'versions': ['1'],
 'platform': 'onnxruntime_onnx',
 'inputs': [{'name': 'input_1', 'datatype': 'FP32', 'shape': [-1, 4, 103]}],
 'outputs': [{'name': 'dense', 'datatype': 'FP32', 'shape': [-1, 1]}]}

In [50]:
# Split the encoded frame: the first column is used as the label, every other
# column as a feature.
label_column = vdf.columns[0]
x = vdf.drop(columns=[label_column]).to_numpy()
y = vdf[label_column].to_numpy().reshape(len(vdf), 1)

# The served model's metadata drives both the window length and the tensor
# description sent with every request, so nothing has to be kept in sync by hand.
model_input = response_svc["inputs"][0]
# One sample per request: batch dimension 1 plus the model's remaining dims.
request_shape = [1, *model_input["shape"][1:]]

dataset = keras.preprocessing.timeseries_dataset_from_array(
    x, y, sequence_length=model_input["shape"][1], batch_size=128
)

PREDICT_ENDPOINT = MODEL_ENDPOINT + "/infer"

# Send every window of the first 10 batches to the model, one request each,
# and print the prediction next to the actual label.
for input_d, output_d in dataset.take(10):
    for in_x, out_y in zip(input_d, output_d):
        payload = {
            "inputs": [
                {
                    "name": model_input["name"],
                    "shape": request_shape,
                    "datatype": model_input["datatype"],
                    "data": in_x.numpy().tolist(),
                }
            ]
        }
        res = requests.post(PREDICT_ENDPOINT, headers=HEADERS, data=json.dumps(payload))
        # Surface HTTP errors directly rather than failing on a missing
        # "outputs" key below.
        res.raise_for_status()
        response = res.json()
        print(response["outputs"])
        pred = response["outputs"][0]["data"][0]
        print(
            f"Actual ({out_y.numpy()[0]}) vs. Prediction ({round(pred, 3)} => {int(round(pred, 0))})"
        )

[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0002695024013519287]}]
Actual (0) vs. Prediction (0.0 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.00039082765579223633]}]
Actual (0) vs. Prediction (0.0 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.00031810998916625977]}]
Actual (0) vs. Prediction (0.0 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0001963973045349121]}]
Actual (0) vs. Prediction (0.0 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0002713203430175781]}]
Actual (0) vs. Prediction (0.0 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0003179609775543213]}]
Actual (0) vs. Prediction (0.0 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0003872215747833252]}]
Actual (0) vs. Prediction (0.0 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.00028336048126220703]}]
Actual (0) vs. Pre