In [1]:
! pip install mlflow  kfp-kubernetes

Collecting mlflow
  Downloading mlflow-2.17.2-py3-none-any.whl.metadata (29 kB)
Collecting kfp-kubernetes
  Downloading kfp-kubernetes-1.3.0.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m366.8 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting mlflow-skinny==2.17.2 (from mlflow)
  Downloading mlflow_skinny-2.17.2-py3-none-any.whl.metadata (30 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.1-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting pyarrow<18,>=4.0.0 (from mlflow)
  Downloading pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (

In [2]:
from kfp import dsl, compiler
from kfp.dsl import InputPath, OutputPath, Output
import os

# Pipeline components: each runs in its own container, so imports live inside the function bodies
@dsl.component(
    base_image="python:3.9",
    packages_to_install=["kagglehub", "pandas", "scikit-learn"],
)
def download_and_split_data(output_train: OutputPath(), output_val: OutputPath(), output_test: OutputPath()):
    """Download the Kaggle credit-card fraud dataset and write train/val/test CSVs.

    60% of the rows go to the training split; the remaining 40% is halved
    into validation and test splits. Both splits use ``random_state=42``.

    Args:
        output_train: Path the training CSV is written to.
        output_val: Path the validation CSV is written to.
        output_test: Path the test CSV is written to.
    """
    import kagglehub
    import pandas as pd
    from sklearn.model_selection import train_test_split

    dataset_dir = kagglehub.dataset_download("nelgiriyewithana/credit-card-fraud-detection-dataset-2023")
    full_frame = pd.read_csv(f"{dataset_dir}/creditcard_2023.csv")

    # First carve off the training rows, then halve what is left.
    train_part, holdout_part = train_test_split(full_frame, train_size=0.6, random_state=42)
    val_part, test_part = train_test_split(holdout_part, train_size=0.5, random_state=42)

    # Write each partition without the pandas index column.
    for frame, destination in (
        (train_part, output_train),
        (val_part, output_val),
        (test_part, output_test),
    ):
        frame.to_csv(destination, index=False)

@dsl.component(
    base_image="python:3.9",
    packages_to_install=["tensorflow", "scikit-learn", "pandas", "onnx", "onnxruntime", "tf2onnx"]
)
def train_model(train_data: InputPath(), val_data: InputPath(), model_output: OutputPath()):
    """Train a dense binary classifier on the fraud data and export it to ONNX.

    Features are every column except ``Class`` (the label) and ``id``. They are
    standardized with a ``StandardScaler`` fitted on the training split, a small
    feed-forward network is trained for 3 epochs with the positive class
    re-weighted by the negative/positive count ratio, and the trained network
    is converted to ONNX (opset 13) and saved.

    Args:
        train_data: Path to the training CSV.
        val_data: Path to the validation CSV.
        model_output: Path the ONNX model is written to.

    Raises:
        ValueError: If the training split contains no rows with ``Class == 1``
            (the class-weight ratio would otherwise divide by zero).
    """
    import pandas as pd
    import tensorflow as tf
    from sklearn.preprocessing import StandardScaler
    import tf2onnx
    import onnx

    df_train = pd.read_csv(train_data)
    df_val = pd.read_csv(val_data)

    non_feature_columns = ['Class', 'id']
    X_train = df_train.drop(columns=non_feature_columns)
    y_train = df_train['Class']
    X_val = df_val.drop(columns=non_feature_columns)
    y_val = df_val['Class']

    # Fit scaling statistics on the training split only, then reuse for validation.
    # NOTE(review): the fitted scaler is neither saved nor embedded in the exported
    # ONNX graph, so whoever serves the model must reproduce this scaling — confirm
    # how inference inputs are preprocessed.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Derive the input width from the data so the network and the ONNX
    # signature below can never disagree.
    n_features = X_train.shape[1]

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(n_features,)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Weight the positive class by the negative/positive ratio.
    n_negative = int((y_train == 0).sum())
    n_positive = int((y_train == 1).sum())
    if n_positive == 0:
        raise ValueError("Training data contains no positive (Class == 1) samples; cannot compute class weights")
    class_weights = {0: 1, 1: n_negative / n_positive}

    model.fit(X_train, y_train, epochs=3, validation_data=(X_val, y_val), class_weight=class_weights)

    # Build the input signature once and share it between the tf.function
    # wrapper and the converter.
    input_signature = [tf.TensorSpec([None, n_features], tf.float32, name='dense_input')]

    # Wrap the model in a tf.function so tf2onnx can trace it.
    @tf.function(input_signature=input_signature)
    def model_fn(x):
        return model(x)

    # Convert the traced function to ONNX; the second return value
    # (external tensor storage) is not used here.
    model_proto, _ = tf2onnx.convert.from_function(
        model_fn,
        input_signature=input_signature,
        opset=13,
    )

    onnx.save(model_proto, model_output)


@dsl.component(
    base_image="python:3.9",
    packages_to_install=["boto3"]
)
def upload_to_minio(
    model_path: InputPath(),
    minio_endpoint: str,
    minio_access_key: str,
    minio_secret_key: str,
    bucket_name: str = 'mlflow',
    object_name: str = 'models/fraud/1/model.onnx',
) -> str:
    """Upload a model file to an S3-compatible (MinIO) bucket and return its URL.

    Args:
        model_path: Local path of the file to upload.
        minio_endpoint: Base URL of the MinIO/S3 endpoint. Surrounding
            whitespace and trailing slashes are removed before use.
        minio_access_key: Access key used to authenticate.
        minio_secret_key: Secret key used to authenticate.
        bucket_name: Destination bucket.
        object_name: Destination object key inside the bucket.

    Returns:
        ``"{endpoint}/{bucket_name}/{object_name}"`` for the uploaded object.
    """
    import boto3
    from botocore.client import Config

    # Normalize the endpoint: drop stray whitespace and any trailing slash so
    # the URL returned below never contains a double slash.
    minio_endpoint = minio_endpoint.strip().rstrip('/')

    # Create S3 client with explicit credentials; path-style addressing keeps
    # the bucket in the URL path rather than the hostname.
    s3_client = boto3.client(
        's3',
        endpoint_url=minio_endpoint,
        aws_access_key_id=minio_access_key,
        aws_secret_access_key=minio_secret_key,
        config=Config(
            signature_version='s3v4',
            s3={'addressing_style': 'path'}
        )
    )

    print(f"Uploading {model_path} to {bucket_name}/{object_name}")

    # Upload the file
    s3_client.upload_file(
        Filename=model_path,
        Bucket=bucket_name,
        Key=object_name
    )

    # Return the MinIO URL
    return f"{minio_endpoint}/{bucket_name}/{object_name}"

@dsl.pipeline(name='fraud-detection-pipeline')
def fraud_detection_pipeline(
    minio_endpoint: str = 'http://minio.kubeflow.svc.cluster.local:9000',
    minio_access_key: str = 'admin1234',
    minio_secret_key: str = 'admin1234',
):
    """Download and split the data, train the model, and upload it to MinIO.

    The MinIO settings are pipeline parameters so a run can override them via
    ``arguments``; the defaults keep the previous behavior when none are given.

    Args:
        minio_endpoint: Base URL of the MinIO endpoint the model is uploaded to.
        minio_access_key: MinIO access key.
        minio_secret_key: MinIO secret key.
    """
    # NOTE(review): the credentials still have plain-text defaults and travel as
    # ordinary pipeline parameters. Consider sourcing them from a Kubernetes
    # secret instead (TODO confirm which secret exists in the cluster).
    data_op = download_and_split_data()

    train_op = train_model(
        train_data=data_op.outputs['output_train'],
        val_data=data_op.outputs['output_val'],
    )

    # Pass the connection settings explicitly to the upload component.
    upload_to_minio(
        model_path=train_op.outputs['model_output'],
        minio_endpoint=minio_endpoint,
        minio_access_key=minio_access_key,
        minio_secret_key=minio_secret_key
    )

if __name__ == '__main__':
    # Compile the pipeline definition into a YAML package on disk so it can
    # be submitted by file name.
    compiler.Compiler().compile(
        package_path='fraud_detection_pipeline.yaml',
        pipeline_func=fraud_detection_pipeline,
    )

In [3]:
import kfp
# Connect with the default client configuration.
client = kfp.Client()

# Submit the compiled package as a run; left as the trailing expression so
# the notebook displays the returned run result.
client.create_run_from_pipeline_package(
    pipeline_file='fraud_detection_pipeline.yaml',
    experiment_name='Fraud Detection',
    arguments={},
)



RunPipelineResult(run_id=e0653d17-fa2d-4348-8ec1-4308c477d69d)