In [54]:
import json
import os
import requests

from tensorflow import keras
import pandas as pd

%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [65]:
def get_data_table(rows: int):
    import pandas as pd
    from trino.dbapi import Connection

    with Connection(
        host="trino.trino",
        port="8080",
        user="anybody",
        catalog="jtopen",
        schema="demo",
    ) as conn:
        link = conn.cursor()
        link.execute(f"SELECT * FROM fraud offset 1000000 LIMIT {rows}")
        return pd.DataFrame(link.fetchall(), columns=[i.name for i in link.description])


rdf = get_data_table(10)
print(f"Retrieved {len(rdf)} rows")
rdf.head()

Retrieved 10 rows


Unnamed: 0,user,card,year,month,day,time,amount,use chip,merchant name,merchant city,merchant state,zip,mcc,errors?,is fraud?
0,74,3,2013,1,14,11:14,$109.11,Swipe Transaction,-8346396146708022106,Rockwall,TX,75032,5411,...,No
1,74,3,2013,1,14,23:36,$55.43,Swipe Transaction,3043415681375770481,Rockwall,TX,75032,5812,...,No
2,74,3,2013,1,15,11:00,$162.50,Swipe Transaction,4722913068560264812,Frisco,TX,75034,5411,...,No
3,74,3,2013,1,15,22:43,$74.97,Swipe Transaction,3043415681375770481,Rockwall,TX,75032,5812,...,No
4,74,3,2013,1,19,11:07,$121.71,Swipe Transaction,-5467922351692495955,Rockwall,TX,75032,5912,...,No


In [3]:
import pickle
import os
from minio import Minio
from minio.error import S3Error

import requests
from requests.auth import HTTPBasicAuth
import urllib3

encoder_artifact = {
    "name": "preprocess-dataset-dataset_encoder_dir",
    "path": "/tmp/outputs/dataset_encoder_dir/data",
    "s3": {
        "key": "artifacts/fraud-detection-jhjcw/2024/01/30/fraud-detection-jhjcw-3114545985/preprocess-dataset-dataset_encoder_dir.tgz"
    },
}

# MinIO configuration
minio_endpoint = "172.30.131.125:9000"  # e.g., 'localhost:9000'
minio_access_key = "minio"
minio_secret_key = "minio123"

# File details
bucket_name = "mlpipeline"
local_file_path = os.path.abspath("preprocess-dataset-dataset_encoder_dir.tgz")

# Create a MinIO client with the endpoint and access keys
minio_client = Minio(
    minio_endpoint,
    access_key=minio_access_key,
    secret_key=minio_secret_key,
    secure=False,  # Set to False if the MinIO server doesn't support HTTPS
)

try:
    # Get the object and write it to the local file path
    minio_client.fget_object(
        bucket_name, encoder_artifact["s3"]["key"], local_file_path
    )
    print(f"File downloaded successfully to {local_file_path}")
except Exception as exc:
    print(f"An error occurred: {exc}")

File downloaded successfully to /home/jovyan/kubeflow-ppc64le-sandbox/notebooks/preprocess-dataset-dataset_encoder_dir.tgz


In [56]:
import tarfile
import dill
import shutil

# shutil.rmtree("encoders")
with tarfile.open(local_file_path, "r:gz") as tar:
    tar.extractall("encoders")

with open(os.path.join("encoders", "data", "mapper.pkl"), "rb") as f:
    t_mapper = dill.load(f)

t_mapper

In [110]:
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    FunctionTransformer,
    MinMaxScaler,
    LabelBinarizer,
)
from sklearn_pandas import DataFrameMapper
from encoders import fraudEncoder, amtEncoder, decimalEncoder, timeEncoder


class FraudDatasetTransformer:
    def __init__(self):
        ...

    def transform(self, dataset: pd.DataFrame, mapper: DataFrameMapper):
        tdf = dataset.copy()
        tdf["merchant name"] = tdf["merchant name"].astype(str)
        tdf.drop(["mcc", "zip", "merchant state"], axis=1, inplace=True)
        tdf.sort_values(by=["user", "card"], inplace=True)
        tdf.reset_index(inplace=True, drop=True)

        tdf = mapper.transform(tdf)
        return tdf

In [111]:
MODEL_NAME = "fraud-detection-fd6e7"
NAMESPACE = "user-example-com"
HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
MODEL_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/model"
PREDICT_ENDPOINT = MODEL_ENDPOINT + "/infer"

res_svc = requests.get(MODEL_ENDPOINT, headers=HEADERS)
response_svc = json.loads(res_svc.text)
response_svc

{'name': 'model',
 'versions': ['2'],
 'platform': 'onnxruntime_onnx',
 'inputs': [{'name': 'input_1', 'datatype': 'FP32', 'shape': [-1, 4, 103]}],
 'outputs': [{'name': 'dense', 'datatype': 'FP32', 'shape': [-1, 1]}]}

In [122]:
test_input = {
    "user": 0,
    "card": 0,
    "merchant name": "-8346396146708022106",
    "amount": "$340800000000000000",
    "year": 2022,
    "month": 1,
    "day": 1,
    "use chip": "Swipe Transaction ",
    "merchant city": "Rockwall                  ",
    "merchant state": "TX                              ",
    "zip": 0,
    "errors?": "Bad Card Number,                                    ",
    "mcc": 0,
    "is fraud?": "No",
    "time": "11:14",
}

dataset_transformer = FraudDatasetTransformer()


test_data = pd.DataFrame([test_input])
test_data

vdf = dataset_transformer.transform(test_data, t_mapper)
vdf.head()

Unnamed: 0,is fraud?,merchant name_0,merchant name_1,merchant name_2,merchant name_3,merchant name_4,merchant name_5,merchant name_6,merchant name_7,merchant name_8,...,"errors?_Bad Expiration,Technical Glitch,","errors?_Bad PIN,","errors?_Bad PIN,Insufficient Balance,","errors?_Bad PIN,Technical Glitch,","errors?_Bad Zipcode,","errors?_Insufficient Balance,","errors?_Insufficient Balance,Technical Glitch,","errors?_Technical Glitch,",year_month_day_time,amount
0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1.076999,4.573142


In [123]:
import numpy as np
import json
import requests
from tensorflow import keras

# Data preparation
x = vdf.drop(vdf.columns.values[0], axis=1).to_numpy()
y = np.array([vdf[vdf.columns.values[0]].iloc[0]])

# Adjust the shape of x to match model expectations
# We need to expand or repeat our single data point to match the sequence length and feature count
# Assuming your single row is a flat array of features, reshape and repeat it
sequence_length = 4  # Model's expected sequence length
num_features = 103  # Model's expected number of features per sequence element

# Check if the original features match the required total features
original_features = x.shape[1]
print(original_features)
if original_features < num_features:
    print("pad maybe?")
    # If fewer, we may need to pad or adjust the data; this is situational and may not be exactly correct without more context
    # For now, let's assume padding with zeros is acceptable
    x_padded = np.pad(
        x,
        ((0, 0), (0, num_features - original_features)),
        mode="constant",
        constant_values=0,
    )
else:
    print("reshape accordingly")
    # If it matches or exceeds, truncate or reshape accordingly (though unusual for a single data point)
    x_padded = x[:, :num_features]

# Reshape to [1, sequence_length, num_features], replicating the single data point across the new sequence length
x_reshaped = np.tile(x_padded, (sequence_length, 1)).reshape(
    1, sequence_length, num_features
)

# Preparing the payload
payload = {
    "inputs": [
        {
            "name": "input_1",
            "shape": [1, sequence_length, num_features],
            "datatype": "FP32",
            "data": x_reshaped.tolist(),
        }
    ]
}

# Sending the request
res = requests.post(PREDICT_ENDPOINT, headers=HEADERS, data=json.dumps(payload))
response = json.loads(res.text)

# Handle response
if "error" in response:
    print(f"Error: {response['error']}")
else:
    print(response["outputs"])
    pred = response["outputs"][0]["data"][0]
    print(f"Actual ({y[0]}) vs. Prediction ({round(pred, 3)} => {int(round(pred, 0))})")

103
reshape accordingly
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.6552416086196899]}]
Actual (0) vs. Prediction (0.655 => 1)


In [76]:
dataset_transformer = FraudDatasetTransformer()

test_data = get_data_table(20)
test_data.iloc[0].to_dict()

# vdf = dataset_transformer.transform(test_data, t_mapper)
# vdf.head()

Unnamed: 0,is fraud?,merchant name_0,merchant name_1,merchant name_2,merchant name_3,merchant name_4,merchant name_5,merchant name_6,merchant name_7,merchant name_8,...,"errors?_Bad Expiration,Technical Glitch,","errors?_Bad PIN,","errors?_Bad PIN,Insufficient Balance,","errors?_Bad PIN,Technical Glitch,","errors?_Bad Zipcode,","errors?_Insufficient Balance,","errors?_Insufficient Balance,Technical Glitch,","errors?_Technical Glitch,",year_month_day_time,amount
0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.702125,0.531553
1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.702184,0.454835
2,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.702238,0.576675
3,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.702294,0.489042
4,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0.702697,0.543932


In [73]:
MODEL_NAME = "fraud-detection-fd6e7"
NAMESPACE = "user-example-com"
HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
MODEL_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/model"
PREDICT_ENDPOINT = MODEL_ENDPOINT + "/infer"

res_svc = requests.get(MODEL_ENDPOINT, headers=HEADERS)
response_svc = json.loads(res_svc.text)
response_svc

{'name': 'model',
 'versions': ['1'],
 'platform': 'onnxruntime_onnx',
 'inputs': [{'name': 'input_1', 'datatype': 'FP32', 'shape': [-1, 4, 103]}],
 'outputs': [{'name': 'dense', 'datatype': 'FP32', 'shape': [-1, 1]}]}

In [81]:
x, y = vdf.drop(vdf.columns.values[0], axis=1).to_numpy(), vdf[
    vdf.columns.values[0]
].to_numpy().reshape(len(vdf), 1)

print(x)
print(y)


dataset = keras.preprocessing.timeseries_dataset_from_array(
    x, y, sequence_length=response_svc["inputs"][0]["shape"][1], batch_size=128, sampling_rate=0
)
# Rest of your code for making the request


for batch in dataset.take(10):
    input_d, output_d = batch[0], batch[1]
    for in_x, out_y in zip(input_d, output_d):
        payload = {
            "inputs": [
                {
                    "name": response_svc["inputs"][0]["name"],
                    "shape": [
                        1,
                        4,
                        103,
                    ],  # has to match response_svc["inputs"][0]["shape"] (except for 1. dimension)
                    "datatype": response_svc["inputs"][0]["datatype"],
                    "data": in_x.numpy().tolist(),
                }
            ]
        }
        res = requests.post(PREDICT_ENDPOINT, headers=HEADERS, data=json.dumps(payload))
        response = json.loads(res.text)
        print(response["outputs"])
        pred = response["outputs"][0]["data"][0]
        print(
            f"Actual ({out_y.numpy()[0]}) vs. Prediction ({round(pred, 3)} => {int(round(pred, 0))})"
        )

[[0.         1.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         1.         0.         0.         0.
  0.         0.         0.         0.         1.         0.
  0.         0.         0.         0.         1.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         1.         1.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         1.         0.         0.
  0.         0.         0.         1.         0.         1.
  0.         0.         1.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.   

ValueError: `sampling_rate` must be higher than 0. Received: sampling_rate=0

In [47]:
import json
import os
import requests
import dill

from tensorflow import keras
import pandas as pd
from sklearn_pandas import DataFrameMapper


MODEL_NAME = "fraud-detection-fd6e7"
NAMESPACE = "user-example-com"
HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
MODEL_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/model"
PREDICT_ENDPOINT = MODEL_ENDPOINT + "/infer"


def get_data_table(rows: int):
    import pandas as pd
    from trino.dbapi import Connection

    with Connection(
        host="trino.trino",
        port="8080",
        user="anybody",
        catalog="jtopen",
        schema="demo",
    ) as conn:
        link = conn.cursor()
        link.execute(f"SELECT * FROM fraud offset 1000000 LIMIT {rows}")
        return pd.DataFrame(link.fetchall(), columns=[i.name for i in link.description])


class FraudDatasetTransformer:
    def __init__(self):
        ...

    def transform(self, dataset: pd.DataFrame, mapper: DataFrameMapper):
        tdf = dataset.copy()
        tdf["merchant name"] = tdf["merchant name"].astype(str)
        tdf.drop(["mcc", "zip", "merchant state"], axis=1, inplace=True)
        tdf.sort_values(by=["user", "card"], inplace=True)
        tdf.reset_index(inplace=True, drop=True)

        tdf = mapper.transform(tdf)
        return tdf


def predict(vdf: pd.DataFrame):

    outputs = []

    res_svc = requests.get(MODEL_ENDPOINT, headers=HEADERS)
    response_svc = json.loads(res_svc.text)
    print(response_svc)

    x, y = vdf.drop(vdf.columns.values[0], axis=1).to_numpy(), vdf[
        vdf.columns.values[0]
    ].to_numpy().reshape(len(vdf), 1)

    dataset = keras.preprocessing.timeseries_dataset_from_array(
        x, y, sequence_length=response_svc["inputs"][0]["shape"][1], batch_size=128
    )

    # code for making the request
    count = 0
    for batch in dataset.take(10):
        input_d, output_d = batch[0], batch[1]
        for in_x, out_y in zip(input_d, output_d):
            payload = {
                "inputs": [
                    {
                        "name": response_svc["inputs"][0]["name"],
                        "shape": [
                            1,
                            4,
                            103,
                        ],  # has to match response_svc["inputs"][0]["shape"] (except for 1. dimension)
                        "datatype": response_svc["inputs"][0]["datatype"],
                        "data": in_x.numpy().tolist(),
                    }
                ]
            }
            res = requests.post(
                PREDICT_ENDPOINT, headers=HEADERS, data=json.dumps(payload)
            )
            count += 1
            response = json.loads(res.text)
            print(response["outputs"])
            pred = response["outputs"][0]["data"][0]
            out_str = f"Actual ({out_y.numpy()[0]}) vs. Prediction ({round(pred, 3)} => {int(round(pred, 0))})"
            response["outputs"][0]["actual"] = out_y.numpy()[0]
            response["outputs"][0]["pred"] = int(round(pred, 0))
            print(out_str)

            outputs.append(response)
    print(f"count: {count}")
    return outputs


with open(os.path.join("encoders", "data", "mapper.pkl"), "rb") as f:
    mapper = dill.load(f)

dst = FraudDatasetTransformer()
td = get_data_table(10)
vdf = dst.transform(td, mapper)
outputs = predict(vdf)

{'name': 'model', 'versions': ['1'], 'platform': 'onnxruntime_onnx', 'inputs': [{'name': 'input_1', 'datatype': 'FP32', 'shape': [-1, 4, 103]}], 'outputs': [{'name': 'dense', 'datatype': 'FP32', 'shape': [-1, 1]}]}
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0033446848392486572]}]
Actual (0) vs. Prediction (0.003 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0006699264049530029]}]
Actual (0) vs. Prediction (0.001 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0010393261909484863]}]
Actual (0) vs. Prediction (0.001 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0007579922676086426]}]
Actual (0) vs. Prediction (0.001 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.0012444257736206055]}]
Actual (0) vs. Prediction (0.001 => 0)
[{'name': 'dense', 'datatype': 'FP32', 'shape': [1, 1], 'data': [0.017306149005889893]}]
Actual (0) vs. Prediction (0.017 => 0)
[{'name': 'd

In [48]:
outputs

[{'model_name': 'model',
  'model_version': '1',
  'outputs': [{'name': 'dense',
    'datatype': 'FP32',
    'shape': [1, 1],
    'data': [0.0033446848392486572],
    'actual': 0,
    'pred': 0}]},
 {'model_name': 'model',
  'model_version': '1',
  'outputs': [{'name': 'dense',
    'datatype': 'FP32',
    'shape': [1, 1],
    'data': [0.0006699264049530029],
    'actual': 0,
    'pred': 0}]},
 {'model_name': 'model',
  'model_version': '1',
  'outputs': [{'name': 'dense',
    'datatype': 'FP32',
    'shape': [1, 1],
    'data': [0.0010393261909484863],
    'actual': 0,
    'pred': 0}]},
 {'model_name': 'model',
  'model_version': '1',
  'outputs': [{'name': 'dense',
    'datatype': 'FP32',
    'shape': [1, 1],
    'data': [0.0007579922676086426],
    'actual': 0,
    'pred': 0}]},
 {'model_name': 'model',
  'model_version': '1',
  'outputs': [{'name': 'dense',
    'datatype': 'FP32',
    'shape': [1, 1],
    'data': [0.0012444257736206055],
    'actual': 0,
    'pred': 0}]},
 {'model_n

In [49]:
import pandas as pd

# Normalize data to create a flat table
df = pd.json_normalize(
    outputs, "outputs", ["model_name", "model_version"], record_prefix="outputs_"
)

# Display the DataFrame
df

Unnamed: 0,outputs_name,outputs_datatype,outputs_shape,outputs_data,outputs_actual,outputs_pred,model_name,model_version
0,dense,FP32,"[1, 1]",[0.0033446848392486572],0,0,model,1
1,dense,FP32,"[1, 1]",[0.0006699264049530029],0,0,model,1
2,dense,FP32,"[1, 1]",[0.0010393261909484863],0,0,model,1
3,dense,FP32,"[1, 1]",[0.0007579922676086426],0,0,model,1
4,dense,FP32,"[1, 1]",[0.0012444257736206055],0,0,model,1
5,dense,FP32,"[1, 1]",[0.017306149005889893],0,0,model,1
6,dense,FP32,"[1, 1]",[0.020819783210754395],0,0,model,1


In [22]:
td = get_data_table(5)
j_d = td.to_html().strip()
type(j_d)

str

In [26]:
j_d

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>user</th>\n      <th>card</th>\n      <th>year</th>\n      <th>month</th>\n      <th>day</th>\n      <th>time</th>\n      <th>amount</th>\n      <th>use chip</th>\n      <th>merchant name</th>\n      <th>merchant city</th>\n      <th>merchant state</th>\n      <th>zip</th>\n      <th>mcc</th>\n      <th>errors?</th>\n      <th>is fraud?</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>74</td>\n      <td>3</td>\n      <td>2013</td>\n      <td>1</td>\n      <td>14</td>\n      <td>11:14</td>\n      <td>$109.11</td>\n      <td>Swipe Transaction</td>\n      <td>-8346396146708022106</td>\n      <td>Rockwall</td>\n      <td>TX</td>\n      <td>75032</td>\n      <td>5411</td>\n      <td></td>\n      <td>No</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>74</td>\n      <td>3</td>\n      <td>2013</td>\n      <td>1</td>\n      <td>14</td>\n      <td>

In [None]:
import boto3
from botocore.client import Config
import numpy as np
import onnxruntime as ort

MODEL_NAME = "fraud-detection-fd6e7"
MODEL_VERSION = 1
model_path = 'model.onnx'

s3_config = {
  "s3_user": "minio",
  "s3_pass": "minio123",
  "s3_host": "http://kubeflow-minio-gateway-525eca1d5089dbdc-istio-system.apps.b2s001.pbm.ihost.com",
  "s3_bucket": "projects",
  "s3_object": "{MODEL_NAME}/model/{MODEL_VERSION}/model.onnx",
}

s3_client = boto3.session.Session().resource(
  service_name="s3",
  endpoint_url=s3_config['s3_host'],
  aws_access_key_id=s3_config['s3_user'],
  aws_secret_access_key=s3_config['s3_pass'],
  config=Config(signature_version="s3v4"),
)
bucket = s3_client.Bucket(s3_config['s3_bucket'])
bucket.download_file(s3_config['s3_object'], model_path)

In [26]:
import json
import os
import requests

MODEL_NAME = "fraud-detection-fd6e7"
NAMESPACE = "user-example-com"
HOST = f"{MODEL_NAME}-predictor-default.{NAMESPACE}"
HEADERS = {"Host": HOST}
MODEL_ENDPOINT = f"http://{MODEL_NAME}-predictor-default/v2/models/model"
PREDICT_ENDPOINT = MODEL_ENDPOINT + "/infer"
print(PREDICT_ENDPOINT)

try:
    res_svc = requests.get(MODEL_ENDPOINT, headers=HEADERS)
    response_svc = json.loads(res_svc.text)
    print(response_svc)
except Exception as e:
    print(e)

http://fraud-detection-fd6e7-predictor-default/v2/models/model/infer
{'name': 'model', 'versions': ['1'], 'platform': 'onnxruntime_onnx', 'inputs': [{'name': 'input_1', 'datatype': 'FP32', 'shape': [-1, 4, 103]}], 'outputs': [{'name': 'dense', 'datatype': 'FP32', 'shape': [-1, 1]}]}
