# MLflow Validation Test - Self-Contained Model Registration

This notebook generates synthetic sensor data, trains a simple Isolation Forest anomaly detector on it, and registers the model in MLflow to validate our MLflow infrastructure end to end.

In [None]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# Sanity check: confirm the imports resolved and show which tracking URI
# MLflow currently resolves to (a later cell overrides it explicitly).
print("Libraries imported successfully")
current_tracking_uri = mlflow.get_tracking_uri()
print(f"MLflow tracking URI: {current_tracking_uri}")

In [None]:
# Build a reproducible synthetic sensor dataset.
np.random.seed(42)  # fixed seed so every run produces the same data
n_samples = 1000

# (column name, mean, standard deviation) for each simulated sensor channel.
# The order matters: columns are drawn one after another from the seeded
# global RNG, so reordering this list would change the generated values.
sensor_specs = [
    ('temperature', 25, 5),
    ('vibration', 0.5, 0.2),
    ('pressure', 100, 15),
    ('humidity', 60, 10),
    ('rotation_speed', 1800, 200),
]
data = {
    name: np.random.normal(mean, std, n_samples)
    for name, mean, std in sensor_specs
}

# Inject outliers: pick 50 distinct rows and add extra zero-mean noise to
# their temperature and vibration readings.
anomaly_indices = np.random.choice(n_samples, size=50, replace=False)
for row in anomaly_indices:
    # Draw temperature noise first, then vibration noise, for each row.
    temperature_shift = np.random.normal(0, 20)
    vibration_shift = np.random.normal(0, 1)
    data['temperature'][row] += temperature_shift
    data['vibration'][row] += vibration_shift

df = pd.DataFrame(data)

# Report what was generated.
print(f"Generated synthetic dataset with {len(df)} samples")
print(f"Features: {list(df.columns)}")
print("\nDataset summary:")
print(df.describe())

In [None]:
# Turn the DataFrame into a plain feature matrix and standardize each column
# with a StandardScaler fitted on that same matrix.
scaler = StandardScaler()
X = df.to_numpy()
X_scaled = scaler.fit(X).transform(X)

print(f"Data shape: {X_scaled.shape}")
print("Data preprocessing completed")

In [None]:
# Point the MLflow client at the tracking server and select the experiment
# under which the validation run is recorded.
mlflow.set_tracking_uri("http://mlflow:5000")
experiment_name = "mlflow_validation_test"

# Look the experiment up first rather than calling create_experiment() and
# parsing the error text: matching on the phrase "already exists" breaks as
# soon as the server words that message differently.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    print(f"Created new experiment: {experiment_name} (ID: {experiment_id})")
else:
    experiment_id = experiment.experiment_id
    print(f"Using existing experiment: {experiment_name} (ID: {experiment_id})")

# Make the experiment active so the run started later is logged into it.
mlflow.set_experiment(experiment_name)

In [None]:
# Train an Isolation Forest on the scaled data, log its configuration and
# summary metrics to MLflow, and register it in the Model Registry.
# NOTE(review): only the estimator is logged. The StandardScaler fitted in the
# preprocessing cell is not stored with it, so any consumer of the registered
# model must apply the same scaling before predicting - confirm this is intended.
with mlflow.start_run(run_name="validation_test_run") as run:
    # Single source of truth for the hyperparameters: the same dict configures
    # the estimator and is logged, so the two can never drift apart.
    model_params = {
        "contamination": 0.1,
        "random_state": 42,
        "n_estimators": 100,
    }
    model = IsolationForest(**model_params)
    model.fit(X_scaled)

    # Log parameters
    mlflow.log_params(model_params)
    mlflow.log_param("data_shape", str(X_scaled.shape))

    # Score the training data itself; the counts below treat -1 as an anomaly
    # and 1 as a normal sample.
    predictions = model.predict(X_scaled)
    total_count = len(predictions)
    # Convert NumPy scalars to plain ints before logging and formatting.
    anomaly_count = int(np.sum(predictions == -1))
    normal_count = int(np.sum(predictions == 1))

    # Log metrics
    mlflow.log_metric("anomaly_count", anomaly_count)
    mlflow.log_metric("normal_count", normal_count)
    mlflow.log_metric("anomaly_ratio", anomaly_count / total_count)

    # Save feature names (column order of the training matrix) for later validation
    feature_names = list(df.columns)
    mlflow.log_param("feature_names", str(feature_names))

    # Log the model artifact and register it as a new version of
    # "anomaly_detector_validation" in one call.
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="anomaly_detector_validation"
    )

    print("Model trained and logged successfully!")
    print(f"Run ID: {run.info.run_id}")
    print(f"Anomalies detected: {anomaly_count}/{total_count} ({100*anomaly_count/total_count:.1f}%)")
    print(f"Feature names: {feature_names}")

In [None]:
# Verify model registration by querying the Model Registry for every version
# stored under the registered model name.
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "anomaly_detector_validation"

# Keep the try body to the single remote call so that only registry/query
# failures are reported as "Error checking model registration".
try:
    model_versions = client.search_model_versions(f"name='{model_name}'")
except Exception as e:
    print(f"Error checking model registration: {e}")
else:
    if not model_versions:
        # An empty result means nothing is registered under this name, so do
        # not claim success.
        print(f"\nNo versions found for model '{model_name}' - registration did not succeed")
    else:
        print(f"\nModel '{model_name}' registered successfully!")
        print(f"Total versions: {len(model_versions)}")

        for version in model_versions:
            print(f"  Version {version.version}: {version.current_stage} (Run: {version.run_id})")

        # Compare versions as integers so the highest number is picked as latest.
        latest_version = max(int(v.version) for v in model_versions)
        print(f"\nLatest version: {latest_version}")
        print(f"Ready for API testing with model_name='{model_name}' and model_version='{latest_version}'")