<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [3]</a>'.</span>

In [1]:
import pandas as pd
import mlflow
import json
import os
import socket
import time
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns

from apps.ml.features import SensorFeatureTransformer

# MLflow setup (inside docker network use service name)
DEFAULT_TRACKING_URI = "http://mlflow:5000"
tracking_uri = os.getenv("MLFLOW_TRACKING_URI", DEFAULT_TRACKING_URI)
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("Anomaly Detection")

def wait_for_port(host: str, port: int, timeout: int = 60, interval: float = 2.0):
    start = time.time()
    while time.time() - start < timeout:
        try:
            with socket.create_connection((host, port), timeout=2):
                return True
        except OSError:
            time.sleep(interval)
    raise RuntimeError(f"MLflow not reachable at {host}:{port} after {timeout}s")

# Only wait if using the default internal URI
if "mlflow:5000" in tracking_uri:
    wait_for_port("mlflow", 5000)

print(f"MLflow tracking URI set to: {mlflow.get_tracking_uri()}")

mkdir -p failed for path /.config/matplotlib: [Errno 13] Permission denied: '/.config'


Matplotlib created a temporary cache directory at /tmp/matplotlib-vi7nz7z_ because there was an issue with the default path (/.config/matplotlib); it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


MLflow tracking URI set to: http://mlflow:5000


In [2]:
df = pd.read_csv('data/sensor_data.csv', parse_dates=['timestamp'])

# Initialize the ENHANCED feature transformer
feature_transformer = SensorFeatureTransformer(n_lags=5, scale_columns=['value', 'quality'])
X_transformed = feature_transformer.fit_transform(df)
feature_names = feature_transformer.get_feature_names_out()

print("Data transformed successfully with new transformer.")
print(f"Features ({len(feature_names)}): {feature_names}")
X_transformed.head()

Data transformed successfully with new transformer.
Features (7): ['value_lag_1', 'value_lag_2', 'value_lag_3', 'value_lag_4', 'value_lag_5', 'value_scaled', 'quality_scaled']


  .apply(lambda g: g.ffill().bfill())


Unnamed: 0,value_lag_1,value_lag_2,value_lag_3,value_lag_4,value_lag_5,value_scaled,quality_scaled
0,51.183,51.183,51.183,51.183,51.183,0.545164,0.726316
1,51.183,51.183,51.183,51.183,51.183,0.640252,0.526316
2,57.875,51.183,51.183,51.183,51.183,0.74513,0.821053
3,65.256,57.875,51.183,51.183,51.183,0.682396,0.747368
4,60.841,65.256,57.875,51.183,51.183,0.699092,0.694737


<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [3]:
with mlflow.start_run(run_name="IsolationForest_v2_refined") as run:
    mlflow.log_param("model_type", "IsolationForest")
    mlflow.log_param("feature_engineering_version", "v2")

    # --- Log feature details for reproducibility ---
    mlflow.log_param("feature_count", len(feature_names))
    # Log feature names as a text artifact
    with open("feature_names.txt", "w") as f:
        json.dump(feature_names, f)
    mlflow.log_artifact("feature_names.txt")

    # --- Train Model ---
    contamination = 0.05
    mlflow.log_param("contamination", contamination)
    model = IsolationForest(contamination=contamination, random_state=42)
    model.fit(X_transformed)

    # --- Log Metrics ---
    df['anomaly'] = model.predict(X_transformed)
    anomaly_rate = (df['anomaly'] == -1).mean()
    mlflow.log_metric("anomaly_rate", anomaly_rate)
    
    # Log feature statistics as a dictionary
    feature_stats = X_transformed.describe().to_dict()
    mlflow.log_dict(feature_stats, "feature_summary_stats.json")

    # --- Log Model and Artifacts ---
    mlflow.sklearn.log_model(model, "model", registered_model_name="anomaly_detector_refined_v2")

    plt.figure(figsize=(15, 7))
    sns.scatterplot(data=df, x='timestamp', y='value', hue='anomaly', palette={1: 'blue', -1: 'red'}, s=10)
    plt.title('Anomaly Detection (v2 - Refined Features)')
    plot_path = "docs/ml/anomaly_scatter_v2.png"
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path, "plots")

    print(f"Run {run.info.run_name} complete. Anomaly rate: {anomaly_rate:.2%}")
    print(f"Model registered as 'anomaly_detector_refined_v2'. Check MLflow UI.")



🏃 View run IsolationForest_v2_refined at: http://mlflow:5000/#/experiments/1/runs/ad260c321faf494e9b78712ea76352c8
🧪 View experiment at: http://mlflow:5000/#/experiments/1


PermissionError: [Errno 13] Permission denied: 'feature_names.txt'