# MLflow Validation Test - Self-Contained Model Registration

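This notebook validates our MLflow infrastructure end to end: it generates synthetic sensor data, trains a simple IsolationForest anomaly detector, logs the run (parameters, metrics, and an artifact), registers the model, and loads it back from the model registry.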

In [1]:
import os
import tempfile

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Sanity check: the imports resolved; report the tracking URI MLflow resolves
# at this point in the notebook.
initial_tracking_uri = mlflow.get_tracking_uri()
print("Libraries imported successfully")
print(f"MLflow tracking URI: {initial_tracking_uri}")

Libraries imported successfully
MLflow tracking URI: file:///app/mlruns


In [2]:
# Generate synthetic sensor data
def make_synthetic_data(n_samples, n_features, anomaly_fraction=0.1, seed=42):
    """Draw synthetic "normal" and "anomalous" sensor readings.

    Normal rows are sampled from N(0, 1) and anomalous rows from N(3, 1.5).
    The anomalous count is ``int(n_samples * anomaly_fraction)`` and the normal
    count is the remainder, so the two groups always add up to exactly
    ``n_samples`` (truncating both fractions independently can lose a row).

    Args:
        n_samples: Total number of rows to generate.
        n_features: Number of columns per row.
        anomaly_fraction: Share of rows drawn from the anomalous distribution.
        seed: Seed applied to NumPy's global RNG before sampling.

    Returns:
        Tuple ``(normal, anomalous)`` of arrays shaped
        ``(n_normal, n_features)`` and ``(n_anomalous, n_features)``.

    Raises:
        ValueError: If ``anomaly_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= anomaly_fraction <= 1:
        raise ValueError(f"anomaly_fraction must be in [0, 1], got {anomaly_fraction}")

    np.random.seed(seed)
    n_anomalous = int(n_samples * anomaly_fraction)
    n_normal = n_samples - n_anomalous

    # Sampling order (normal first, then anomalous) determines the drawn values.
    normal = np.random.normal(0, 1, (n_normal, n_features))
    anomalous = np.random.normal(3, 1.5, (n_anomalous, n_features))
    return normal, anomalous


# Create 1000 samples with 5 features
n_samples = 1000
n_features = 5

# Normal operation data (90% of samples) and anomalous data (10% of samples)
normal_data, anomalous_data = make_synthetic_data(n_samples, n_features, anomaly_fraction=0.1, seed=42)

# Combine data; labels are 0 for normal rows and 1 for anomalous rows
X_synthetic = np.vstack([normal_data, anomalous_data])
y_synthetic = np.hstack([np.zeros(len(normal_data)), np.ones(len(anomalous_data))])

# Create feature names
feature_names = [f'feature_{i}' for i in range(n_features)]

print(f"Generated {len(X_synthetic)} synthetic samples")
print(f"Features: {feature_names}")
print(f"Normal samples: {len(normal_data)}, Anomalous samples: {len(anomalous_data)}")

Generated 1000 synthetic samples
Features: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4']
Normal samples: 900, Anomalous samples: 100


In [3]:
# Configure MLflow
EXPERIMENT_NAME = "Synthetic_Data_Validation"

mlflow.set_tracking_uri("http://mlflow:5000")
# set_experiment returns the Experiment it activated, so no second lookup by
# name is needed just to display it.
active_experiment = mlflow.set_experiment(EXPERIMENT_NAME)

print(f"MLflow configured to: {mlflow.get_tracking_uri()}")
print(f"Active experiment: {active_experiment}")

MLflow configured to: http://mlflow:5000
Active experiment: <Experiment: artifact_location='/mlruns/4', creation_time=1755866523352, experiment_id='4', last_update_time=1755866523352, lifecycle_stage='active', name='Synthetic_Data_Validation', tags={}>


In [4]:
# Hold out 30% of the rows for testing, stratified on the anomaly label.
X_train, X_test, y_train, y_test = train_test_split(
    X_synthetic,
    y_synthetic,
    test_size=0.3,
    random_state=42,
    stratify=y_synthetic,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_label, split_array in (("Training set", X_train_scaled), ("Test set", X_test_scaled)):
    print(f"{split_label}: {split_array.shape}")

Training set: (700, 5)
Test set: (300, 5)


In [5]:
# Train and register model with MLflow
#
# NOTE(review): the model is fitted on standardized features, but only the
# IsolationForest itself is logged below. Anything loading the registered model
# has to apply an equivalent scaling step on its own.
with mlflow.start_run(run_name="Synthetic_Validation_IsolationForest") as run:
    # Log all run parameters in one batched call
    contamination = 0.1
    mlflow.log_params(
        {
            "model_type": "IsolationForest",
            "contamination": contamination,
            "n_samples": n_samples,
            "n_features": n_features,
            "data_type": "synthetic",
        }
    )

    # Train model
    model = IsolationForest(contamination=contamination, random_state=42)
    model.fit(X_train_scaled)

    # Make predictions; a prediction of -1 is mapped to label 1 (anomaly)
    y_pred = model.predict(X_test_scaled)
    y_pred_binary = (y_pred == -1).astype(int)

    # Calculate basic metrics
    accuracy = accuracy_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary, zero_division=0)
    recall = recall_score(y_test, y_pred_binary, zero_division=0)

    # Log metrics in one batched call
    mlflow.log_metrics({"accuracy": accuracy, "precision": precision, "recall": recall})

    # Log feature names as artifact. The file is staged in a throwaway directory
    # so nothing is left behind and concurrent runs cannot clobber each other;
    # the artifact keeps its original file name.
    with tempfile.TemporaryDirectory() as artifact_dir:
        feature_names_path = os.path.join(artifact_dir, "feature_names_synthetic.txt")
        with open(feature_names_path, "w", encoding="utf-8") as f:
            f.write("\n".join(feature_names))
        mlflow.log_artifact(feature_names_path)

    # Register the model
    model_name = "synthetic_validation_isolation_forest"
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name=model_name
    )

    print(f"Model registered as: {model_name}")
    print(f"Run ID: {run.info.run_id}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")





Registered model 'synthetic_validation_isolation_forest' already exists. Creating a new version of this model...
2025/08/22 12:47:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: synthetic_validation_isolation_forest, version 2


Model registered as: synthetic_validation_isolation_forest
Run ID: 226b133c20c14f93b633fa03482fcaa7
Accuracy: 0.9933
Precision: 0.9667
Recall: 0.9667
🏃 View run Synthetic_Validation_IsolationForest at: http://mlflow:5000/#/experiments/4/runs/226b133c20c14f93b633fa03482fcaa7
🧪 View experiment at: http://mlflow:5000/#/experiments/4


Created version '2' of model 'synthetic_validation_isolation_forest'.


In [6]:
# Test model loading from MLflow
print("\n=== Testing Model Loading ===")
try:
    # Load the model back from the registry ("latest" resolves to the newest
    # registered version of this model name).
    loaded_model = mlflow.sklearn.load_model(f"models:/{model_name}/latest")

    # Test prediction with the loaded model
    test_sample = X_test_scaled[:5]  # Test with first 5 samples
    predictions = loaded_model.predict(test_sample)

    # A load that merely "does not raise" proves little: require the reloaded
    # model to reproduce the in-memory model's predictions on the test set.
    if not np.array_equal(loaded_model.predict(X_test_scaled), model.predict(X_test_scaled)):
        raise RuntimeError("Loaded model predictions differ from the in-memory model")

    print(f"Successfully loaded model: {model_name}")
    print(f"Test predictions: {predictions}")
    print("✅ MLflow model loading validation PASSED")

except Exception as e:
    # Report the failure in the notebook output, then re-raise so the cell fails
    # and "Run All" stops here.
    print(f"❌ MLflow model loading validation FAILED: {e}")
    raise


=== Testing Model Loading ===


Successfully loaded model: synthetic_validation_isolation_forest
Test predictions: [1 1 1 1 1]
✅ MLflow model loading validation PASSED


In [7]:
# Final recap of the stages exercised above. These lines are static text; under
# "Run All" this cell is reached only if no earlier cell raised.
summary_lines = (
    "\n=== Synthetic Data Validation Summary ===",
    "✅ Synthetic data generation: SUCCESS",
    "✅ Model training: SUCCESS",
    "✅ MLflow logging: SUCCESS",
    "✅ Model registration: SUCCESS",
    "✅ Model loading validation: SUCCESS",
    "\n🎉 All validation tests passed! MLflow infrastructure is working correctly.",
)
print(*summary_lines, sep="\n")


=== Synthetic Data Validation Summary ===
✅ Synthetic data generation: SUCCESS
✅ Model training: SUCCESS
✅ MLflow logging: SUCCESS
✅ Model registration: SUCCESS
✅ Model loading validation: SUCCESS

🎉 All validation tests passed! MLflow infrastructure is working correctly.
