In [10]:
! pip install mlflow  kfp-kubernetes

Collecting kfp-kubernetes
  Downloading kfp-kubernetes-1.3.0.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting protobuf<6,>=3.12.0 (from mlflow-skinny==2.17.1->mlflow)
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: kfp-kubernetes
  Building wheel for kfp-kubernetes (setup.py) ... [?25ldone
[?25h  Created wheel for kfp-kubernetes: filename=kfp_kubernetes-1.3.0-py3-none-any.whl size=20539 sha256=77f79989e76b120c8846d8db94f3b0fd2d104699a6b199c0407bd01e98639b3f
  Stored in directory: /home/jovyan/.cache/pip/wheels/05/c6/9a/aa55f5fcf8646a39643462605d7296ed837d417785e82b

In [14]:
from kfp import dsl, compiler
from kfp.dsl import InputPath, OutputPath
import os

@dsl.component(
    base_image="python:3.9",
    packages_to_install=["kagglehub", "pandas", "scikit-learn"],
)
def download_and_split_data(output_train: OutputPath(), output_val: OutputPath(), output_test: OutputPath()):
    # Pipeline step: fetch the Kaggle credit-card fraud dataset and write
    # train / validation / test CSV files to the three output paths.
    # Imports are local because the function body is what runs inside the
    # component's container.
    import kagglehub
    import pandas as pd
    from sklearn.model_selection import train_test_split

    dataset_dir = kagglehub.dataset_download("nelgiriyewithana/credit-card-fraud-detection-dataset-2023")
    full_df = pd.read_csv(f"{dataset_dir}/creditcard_2023.csv")

    # 60% of the rows go to training; the remaining 40% is halved into
    # validation and test. Both splits use the same fixed seed.
    train_part, holdout_part = train_test_split(full_df, train_size=0.6, random_state=42)
    val_part, test_part = train_test_split(holdout_part, train_size=0.5, random_state=42)

    # Write each split without the pandas index column.
    splits = (
        (train_part, output_train),
        (val_part, output_val),
        (test_part, output_test),
    )
    for frame, destination in splits:
        frame.to_csv(destination, index=False)

@dsl.component(
    base_image="python:3.9",
    packages_to_install=["tensorflow", "scikit-learn", "pandas", "onnx", "onnxruntime", "tf2onnx"]
)
def train_model(train_data: InputPath(), val_data: InputPath(), model_output: OutputPath()):
    """Train a binary fraud classifier and save it as an ONNX model.

    Reads the training and validation CSV files, standardizes the feature
    columns, fits a small dense Keras network with class weighting, converts
    the trained network to ONNX (opset 13) and writes it to ``model_output``.

    Args:
        train_data: Path to the training CSV; must contain ``Class`` and ``id`` columns.
        val_data: Path to the validation CSV with the same columns.
        model_output: Path the ONNX model file is written to.

    Raises:
        ValueError: If the training data contains no rows with ``Class == 1``,
            because the positive class weight cannot be computed.
    """
    # Imports are local because the function body is what runs inside the
    # component's container.
    import onnx
    import pandas as pd
    import tensorflow as tf
    import tf2onnx
    from sklearn.preprocessing import StandardScaler

    df_train = pd.read_csv(train_data)
    df_val = pd.read_csv(val_data)

    # 'Class' is the label and 'id' is dropped alongside it; every other
    # column is used as a feature.
    non_feature_columns = ['Class', 'id']
    X_train = df_train.drop(non_feature_columns, axis=1)
    y_train = df_train['Class']
    X_val = df_val.drop(non_feature_columns, axis=1)
    y_val = df_val['Class']

    # Fit the scaler on the training split only and reuse it for validation.
    # NOTE(review): the fitted scaler is not saved with the ONNX model, so any
    # consumer of the exported model has to apply the same standardization
    # itself - confirm the serving side does this.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Derive the input width from the data so the network and the ONNX
    # signature below always agree with the CSV that was actually loaded.
    n_features = X_train.shape[1]

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(n_features,)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Weight the positive class by the negative/positive ratio of the
    # training labels.
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    if n_positive == 0:
        raise ValueError("Training data contains no rows with Class == 1; cannot compute class weights")
    class_weights = {0: 1, 1: n_negative / n_positive}

    model.fit(X_train, y_train, epochs=3, validation_data=(X_val, y_val), class_weight=class_weights)

    # Wrap the model in a tf.function with an explicit signature; the same
    # spec is handed to tf2onnx so the exported input is named 'dense_input'.
    input_spec = [tf.TensorSpec([None, n_features], tf.float32, name='dense_input')]

    @tf.function(input_signature=input_spec)
    def model_fn(x):
        return model(x)

    # from_function returns (model_proto, external_tensor_storage); only the
    # proto is needed here.
    model_proto, _ = tf2onnx.convert.from_function(
        model_fn,
        input_signature=input_spec,
        opset=13
    )

    onnx.save(model_proto, model_output)

@dsl.component(base_image="python:3.9", packages_to_install=["boto3"])
def upload_to_s3(model_path: InputPath()):
    """Upload the model file to an S3-compatible bucket.

    Connection settings are read from the environment: ``AWS_ACCESS_KEY_ID``,
    ``AWS_SECRET_ACCESS_KEY``, ``AWS_S3_ENDPOINT`` and ``AWS_S3_BUCKET``.
    The destination object key is taken from ``S3_KEY`` when it is set and
    otherwise defaults to ``models/fraud/1/model.onnx``.

    Args:
        model_path: Path of the local model file to upload.

    Raises:
        ValueError: If ``AWS_S3_BUCKET`` is not set.
    """
    # Imports are local because the function body is what runs inside the
    # component's container.
    import os

    import boto3

    bucket = os.getenv('AWS_S3_BUCKET')
    if not bucket:
        # Fail with a clear message instead of handing boto3 a None bucket.
        raise ValueError("AWS_S3_BUCKET environment variable must be set to upload the model")

    # Honor the S3_KEY variable set on this task by the pipeline; fall back to
    # the previously hard-coded key when it is unset or empty.
    object_key = os.getenv('S3_KEY') or 'models/fraud/1/model.onnx'

    s3_client = boto3.client(
        's3',
        aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
        endpoint_url=os.getenv('AWS_S3_ENDPOINT')
    )

    s3_client.upload_file(model_path, bucket, object_key)

@dsl.pipeline(name='fraud-detection-pipeline')
def fraud_detection_pipeline():
    # Pipeline wiring: split the data, train on the train/validation splits,
    # then upload the resulting model using credentials from a Kubernetes secret.
    from kfp import kubernetes

    split_task = download_and_split_data()

    training_task = train_model(
        train_data=split_task.outputs['output_train'],
        val_data=split_task.outputs['output_val'],
    )

    upload_task = upload_to_s3(model_path=training_task.outputs['model_output'])
    # Expose the destination object key to the upload step as an env variable.
    upload_task.set_env_variable("S3_KEY", "models/fraud/1/model.onnx")

    # Each secret key is mapped to an environment variable of the same name.
    secret_keys = (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_DEFAULT_REGION',
        'AWS_S3_BUCKET',
        'AWS_S3_ENDPOINT',
    )
    kubernetes.use_secret_as_env(
        upload_task,
        'aws-connection-my-storage',
        {secret_key: secret_key for secret_key in secret_keys},
    )

if __name__ == '__main__':
    # Compile the pipeline definition into a YAML package next to the notebook.
    compiler.Compiler().compile(
        package_path='fraud_detection_pipeline.yaml',
        pipeline_func=fraud_detection_pipeline,
    )

In [15]:
import kfp
# Create a Kubeflow Pipelines client with no explicit host or credentials.
client = kfp.Client()

# Submit the compiled package as a run in the 'Fraud Detection' experiment.
# The call stays the last expression so the notebook displays the run result.
client.create_run_from_pipeline_package(
    'fraud_detection_pipeline.yaml',
    experiment_name='Fraud Detection',
    arguments={},
)

RunPipelineResult(run_id=d8a1edf9-6ace-4290-b35a-705ae7fad343)