In [1]:
import mlrun
import mlrun.feature_store as fstore

# 1. Load the existing project
# Since we fixed 'project_setup.py', this will automatically load the correct configuration
project = mlrun.get_or_create_project(
    name="fraud-demo",
    context="./",
    user_project=True
)

print(f"Project loaded successfully.")

Project Source: git://github.com/mlrun/demo-fraud.git
> 2025-12-12 22:08:08,136 [info] Project loaded successfully: {"project_name":"fraud-demo-jovyan"}
Project loaded successfully.


In [2]:
# 2. Define Feature Vector
# Select which features we want to use for training
features = [
    "transactions.*",      # All transaction features (including aggregations)
    "events.*",            # All user event features
]

# Create the Feature Vector object
fv = fstore.FeatureVector(
    "fraud-vec",                   # Name of the feature vector
    features,                      # List of features
    label_feature="labels.label",  # Specify the target label column
    description="Fraud detection feature vector"
)

# Save the definition to the project
fv.save()

print("Feature Vector 'fraud-vec' created and saved.")

Feature Vector 'fraud-vec' created and saved.


In [3]:
# 3. Get Offline Features
from mlrun.datastore.targets import ParquetTarget
# This merges data based on point-in-time correctness to prevent data leakage
print("Merging data (this may take a few seconds)...")

offline_features = fv.get_offline_features(
    target=ParquetTarget(name="parquet", path=f"./store/feature-vectors/{project.name}/fraud-vec.parquet")
)
fv.save()

# Convert to DataFrame for inspection
df = offline_features.to_dataframe()

print(f"Data merge complete! Total rows: {df.shape}")
df.head(5)

from mlrun.datastore.targets import ParquetTarget

# 1. Identify columns to drop
# These columns contain raw string identifiers (like Merchant ID) that the 
# Random Forest model cannot handle. We drop them to ensure only numeric 
# features (or one-hot encoded ones) are used for training.
cols_to_drop = ["target", "device", "zipcodeOri", "zipMerchant", "step"]

print(f"⏳ Re-materializing Feature Vector (Dropping columns: {cols_to_drop})...")

# 2. Re-generate the Parquet file with specific columns dropped
# We use get_offline_features with the 'drop_columns' parameter to create a clean dataset
fv.get_offline_features(
    target=ParquetTarget(name="parquet", path=f"./store/feature-vectors/{project.name}/fraud-vec.parquet"),
    drop_columns=cols_to_drop
)

# Save the updated Feature Vector definition
fv.save()
print("Clean training dataset has been saved!")

Merging data (this may take a few seconds)...
> 2025-12-12 22:08:08,568 [info] wrote target: {'kind': 'parquet', 'name': 'parquet', 'size': 49559, 'path': './store/feature-vectors/fraud-demo-jovyan/fraud-vec.parquet', 'partitioned': False, 'status': 'ready', 'updated': '2025-12-12T22:08:08.568072+00:00'}
Data merge complete! Total rows: (47, 59)
⏳ Re-materializing Feature Vector (Dropping columns: ['target', 'device', 'zipcodeOri', 'zipMerchant', 'step'])...
> 2025-12-12 22:08:08,961 [info] wrote target: {'kind': 'parquet', 'name': 'parquet', 'size': 44410, 'path': './store/feature-vectors/fraud-demo-jovyan/fraud-vec.parquet', 'partitioned': False, 'status': 'ready', 'updated': '2025-12-12T22:08:08.961361+00:00'}
Clean training dataset has been saved!


In [5]:
# 4. Run Training Job

# The "NoSuchBucket" error happens because the default cloud bucket doesn't exist in your local Docker.
# We simply tell the project to store artifacts in a folder named "./artifacts" relative to this notebook.
project.artifact_path = "./artifacts"
print(f"Project artifact path set to: {project.artifact_path}")

# 2. Define columns to drop (same as before)
cols_to_drop = ["target", "device", "zipcodeOri", "zipMerchant", "step"]

# 3. Run Training Job (Standard MLRun Method)
print(f"Submitting training job for project: {project.name}...")

train_run = project.run_function(
    "train",
    # Input: The Feature Vector URI
    inputs={"dataset": f"store://feature-vectors/{project.name}/fraud-vec"},
    
    params={
        "model_class": "sklearn.ensemble.RandomForestClassifier",
        "train_test_split_size": 0.2,
        "label_columns": "label",
        "model_name": "fraud_model",
        # Use the official parameter to ignore string columns
        "drop_columns": cols_to_drop
    },
    local=True  # Run locally
)

print("Model training completed successfully!")

Project artifact path set to: ./artifacts
Submitting training job for project: fraud-demo-jovyan...
> 2025-12-12 22:23:35,656 [info] Storing function: {"db":null,"name":"train-train","uid":"dcefa5aab1444e9a88460f51ffa3a4e3"}
> 2025-12-12 22:23:35,843 [info] label columns: label
> 2025-12-12 22:23:36,154 [info] Sample set not given, using the whole training set as the sample set
> 2025-12-12 22:23:36,159 [info] training 'fraud_model'



A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.



> 2025-12-12 22:23:36,975 [error] Execution error, Traceback (most recent call last):
  File "/opt/conda/envs/mlrun/lib/python3.11/site-packages/mlrun/runtimes/local.py", line 504, in exec_from_params
    val = mlrun.handler(
          ^^^^^^^^^^^^^^
  File "/opt/conda/envs/mlrun/lib/python3.11/site-packages/mlrun/package/__init__.py", line 137, in wrapper
    func_outputs = func(*args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmp1f_7ktmd.py", line 270, in train
    model.fit(x_train, y_train, **fit_kwargs)
  File "/opt/conda/envs/mlrun/lib/python3.11/site-packages/mlrun/frameworks/sklearn/mlrun_interface.py", line 118, in wrapper
    self._post_fit(x=X, y=y)
  File "/opt/conda/envs/mlrun/lib/python3.11/site-packages/mlrun/frameworks/sklearn/mlrun_interface.py", line 249, in _post_fit
    self._post_predict(
  File "/opt/conda/envs/mlrun/lib/python3.11/site-packages/mlrun/frameworks/sklearn/mlrun_interface.py", line 325, in _post_predict
    self._producer.produc


F-score is ill-defined and being set to 0.0 due to no true nor predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Recall is ill-defined and being set to 0.0 due to no true samples. Use `zero_division` parameter to control this behavior.


No negative samples in y_true, false positive value should be meaningless

Only one class present in y_true. ROC AUC score is not defined in that case.


project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results,artifacts
fraud-demo-jovyan,...ffa3a4e3,0,Dec 12 22:23:35,2025-12-12 22:23:36.990956+00:00,error,run,train-train,kind=localowner=jovyanhost=mlrun-jupyter-5676fb9986-67qpr,dataset,"model_class=sklearn.ensemble.RandomForestClassifiertrain_test_split_size=0.2label_columns=labelmodel_name=fraud_modeldrop_columns=['target', 'device', 'zipcodeOri', 'zipMerchant', 'step']",accuracy=1.0f1_score=0.0precision_score=0.0recall_score=0.0,confusion-matrixtest_setfeature-importance





> 2025-12-12 22:23:37,076 [info] Run execution finished: {"name":"train-train","status":"error"}


RunError: Only one class present in y_true. ROC AUC score is not defined in that case.