In [54]:
!pip install onnx onnxruntime tf2onnx kagglehub



In [55]:
import numpy as np
import pandas as pd
import datetime
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
import tf2onnx
import onnx
import pickle
import kagglehub
from pathlib import Path

In [59]:
from sklearn.model_selection import train_test_split

# Helper that splits the raw CSV into train/validation/test CSV files
def split_dataset(input_file, train_size=0.6, val_size=0.2, test_size=0.2,
                  output_dir='fraud-detection/data'):
    """Split a CSV dataset into train/validation/test CSV files.

    The rows are first divided into a training part and a remainder; the
    remainder is then divided between validation and test in the ratio
    ``val_size : test_size``. Both splits use ``random_state=42``.

    Args:
        input_file: Path of the CSV file to read.
        train_size: Forwarded to ``train_test_split`` as the size of the
            training set taken from the full dataset.
        val_size: Weight of the validation set within the remainder.
        test_size: Weight of the test set within the remainder.
        output_dir: Directory that receives ``train_data.csv``,
            ``validation_data.csv`` and ``test_data.csv``. It is created
            (including parents) when it does not exist yet.

    Raises:
        ValueError: If ``val_size`` or ``test_size`` is not positive.
    """
    # Fail fast, before the (potentially large) CSV is read: the second split
    # needs a ratio strictly between 0 and 1.
    if val_size <= 0 or test_size <= 0:
        raise ValueError(
            f"val_size and test_size must both be positive, "
            f"got val_size={val_size!r}, test_size={test_size!r}"
        )

    print("Reading the dataset...")
    df = pd.read_csv(input_file)

    # First split: separate training set from the rest
    train_df, temp_df = train_test_split(
        df,
        train_size=train_size,
        random_state=42  # For reproducibility
    )

    # Second split: divide the remaining data into validation and test sets
    val_df, test_df = train_test_split(
        temp_df,
        train_size=val_size / (val_size + test_size),
        random_state=42  # For reproducibility
    )

    # Save the splits to new CSV files; make sure the target directory exists
    # first, otherwise to_csv fails on a fresh checkout.
    print("Saving the splits...")
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(out_dir / 'train_data.csv', index=False)
    val_df.to_csv(out_dir / 'validation_data.csv', index=False)
    test_df.to_csv(out_dir / 'test_data.csv', index=False)

    # Report the size of each split relative to the full dataset
    total = len(df)
    print("\nDataset split complete:")
    print(f"Total samples: {total}")
    print(f"Training samples: {len(train_df)} ({len(train_df)/total*100:.1f}%)")
    print(f"Validation samples: {len(val_df)} ({len(val_df)/total*100:.1f}%)")
    print(f"Testing samples: {len(test_df)} ({len(test_df)/total*100:.1f}%)")


In [63]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub and
# report the directory the files were downloaded to.
dataset_handle = "nelgiriyewithana/credit-card-fraud-detection-dataset-2023"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /home/jovyan/.cache/kagglehub/datasets/nelgiriyewithana/credit-card-fraud-detection-dataset-2023/versions/1


In [64]:
# Point at the raw CSV inside the downloaded dataset directory, then write
# the train/validation/test split files from it.
csv_suffix = '/creditcard_2023.csv'
input_file = path + csv_suffix
split_dataset(input_file=input_file)

Reading the dataset...
Saving the splits...

Dataset split complete:
Total samples: 568630
Training samples: 341178 (60.0%)
Validation samples: 113726 (20.0%)
Testing samples: 113726 (20.0%)


In [65]:
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

import time

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling are repeatable (the data split already uses seed 42).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Read the split datasets
print("Reading the datasets...")
df_train = pd.read_csv('fraud-detection/data/train_data.csv')
df_val = pd.read_csv('fraud-detection/data/validation_data.csv')

# Check the data was loaded correctly
print(f"Training set shape: {df_train.shape}")
print(f"Validation set shape: {df_val.shape}")

# Separate features and target: 'Class' is the label, 'id' is dropped as well
features_train = df_train.drop(['Class', 'id'], axis=1)
features_val = df_val.drop(['Class', 'id'], axis=1)
y_train = df_train['Class'].to_numpy()
y_val = df_val['Class'].to_numpy()

# Scale the features; the scaler is fit on the training split only and then
# reused for validation so no validation statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)  # returns a NumPy array
X_val = scaler.transform(features_val)

# Class weights: keep weight 1 for class 0 and scale class 1 by the
# negative/positive ratio so both classes contribute equally to the loss.
n_negative = int(np.sum(y_train == 0))
n_positive = int(np.sum(y_train == 1))
if n_positive == 0:
    raise ValueError(
        "Training data contains no samples with Class == 1; "
        "cannot compute class weights"
    )
class_weights = {
    0: 1,
    1: n_negative / n_positive
}

# Define the model; the input width is taken from the data instead of being
# hard-coded, so it stays in sync with the ONNX export signature.
n_features = X_train.shape[1]
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(n_features,)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()]
)

# Train the model
epochs = 3  # You might want more epochs for better results

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments.
start = time.perf_counter()
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=32,  # Adjust based on your memory constraints
    validation_data=(X_val, y_val),
    verbose=1,
    class_weight=class_weights,
    callbacks=[
        # NOTE: with epochs=3 and patience=3 the stop condition cannot be
        # reached; it only becomes effective once `epochs` is raised.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)
end = time.perf_counter()
print(f"Training of model is complete. Took {end-start} seconds")

Reading the datasets...
Training set shape: (341178, 31)
Validation set shape: (113726, 31)
Epoch 1/3
Epoch 2/3
Epoch 3/3
Training of model is complete. Took 46.82496738433838 seconds


In [69]:
import tensorflow as tf
import tf2onnx
import onnx
import os

# Describe the model input once so the traced function and the converter are
# guaranteed to use the same signature: a float32 batch of feature rows.
input_signature = [
    tf.TensorSpec([None, X_train.shape[1]], tf.float32, name='dense_input')
]

# Target location of the exported model
onnx_model_dir = "fraud-detection/models/fraud/1"
onnx_model_path = f"{onnx_model_dir}/model.onnx"


# Wrap the model in a tf.function so tf2onnx can trace it
@tf.function(input_signature=input_signature)
def model_fn(x):
    """Run the trained Keras model on a batch of feature rows."""
    return model(x)


# Convert the Keras model to ONNX
model_proto, external_tensor_storage = tf2onnx.convert.from_function(
    model_fn,
    input_signature=input_signature,
    opset=13  # You can specify the ONNX opset version here
)

# Create directory if it doesn't exist
os.makedirs(onnx_model_dir, exist_ok=True)

# Save the model as ONNX
onnx.save(model_proto, onnx_model_path)

print("Model saved successfully as ONNX")

# Verify the saved model: reload it from disk and run the ONNX checker, which
# raises if the graph is structurally invalid.
model_loaded = onnx.load(onnx_model_path)
onnx.checker.check_model(model_loaded)
print("Model loaded successfully for verification")

Model saved successfully as ONNX
Model loaded successfully for verification


2024-10-30 08:45:19.621790: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-10-30 08:45:19.621974: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session
2024-10-30 08:45:19.685812: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2024-10-30 08:45:19.686018: I tensorflow/core/grappler/clusters/single_machine.cc:361] Starting new session


In [70]:
# onnxruntime is imported here because no earlier cell brings `ort` into scope
import onnxruntime as ort

df_test = pd.read_csv('fraud-detection/data/test_data.csv')

# Apply the same preprocessing as for training: drop the same columns and
# reuse the scaler that was fit on the training data. This must happen before
# the shapes are printed, otherwise X_test is undefined on a fresh kernel.
features_test = df_test.drop(['Class', 'id'], axis=1)
X_test = scaler.transform(features_test)

# Verify the shapes
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")
assert X_test.shape[1] == X_train.shape[1], "test and training feature counts differ"

# Now verify the ONNX model
session = ort.InferenceSession("fraud-detection/models/fraud/1/model.onnx")
input_name = session.get_inputs()[0].name

# Print shapes for verification
print(f"\nExpected input shape: {session.get_inputs()[0].shape}")
print(f"Actual input shape: {X_test.shape}")

# Try a prediction with sample data; the exported signature is float32, while
# the scaler returns float64, hence the cast.
sample_input = X_test[:1].astype(np.float32)
pred_onnx = session.run(None, {input_name: sample_input})[0]

print("ONNX model verification successful")
print(f"Prediction shape: {pred_onnx.shape}")
print(f"Sample prediction: {pred_onnx[0]}")

Training data shape: (341178, 29)
Test data shape: (113726, 29)

Expected input shape: ['unk__31', 29]
Actual input shape: (113726, 29)
ONNX model verification successful
Prediction shape: (1, 1)
Sample prediction: [0.]
