In [0]:
# Install Databricks Feature Engineering for Feature Store
%pip install databricks-feature-engineering --quiet

# Install project requirements
!pip install -r /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/requirements.txt

dbutils.library.restartPython()

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlflow 3.6.0 requires mlflow-skinny==3.6.0, but you have mlflow-skinny 3.0.1 which is incompatible.[0m[31m
[0m[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
Collecting pyarrow==20.0.0 (from -r /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/requirements.txt (line 9))
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting mlflow==3.0.1 (from -r /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/requirements.txt (line 17))
  Downloading mlflow-3.0.1-py3-none-any.whl.metadata (29 kB)
Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (42.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [0]:
import sys
import os
import logging
import pandas as pd
sys.path.append(os.path.abspath('../'))

from utils.common_utils import load_config, setup_logging, get_spark_session, print_section_header, Timer
from utils.data_loader import load_data_from_source
from utils.model_monitoring import *

In [0]:
config = load_config('../config/config.yaml')
setup_logging(config)

print_section_header("Model Monitoring")


                                Model Monitoring                                



## 1. Load Reference and Current Data

In [0]:
spark = None
if config['data_source']['type'] == 'unity_catalog':
    spark = get_spark_session(config)

# Load reference (training) features, targets, and current predictions
if config['data_source']['type'] == 'unity_catalog':
    uc_config = config['data_source']['unity_catalog']
    catalog = uc_config['catalog']
    output_schema = uc_config['output_schema']
    
    # Define table names
    feature_store_table = f"{catalog}.{output_schema}.customer_features_fs"
    features_table = f"{catalog}.{output_schema}.customer_features"
    targets_table = f"{catalog}.{output_schema}.customer_targets"
    predictions_table = f"{catalog}.{output_schema}.product_recommendations"
    
    print(f"📊 Loading reference features from Feature Store: {feature_store_table}")
    print(f"📂 Loading targets from: {targets_table}")
    print(f"📂 Loading predictions from: {predictions_table}")
    
    # Try to load features from Feature Store first
    try:
        from databricks.feature_engineering import FeatureEngineeringClient
        
        fe = FeatureEngineeringClient()
        reference_features_spark = fe.read_table(name=feature_store_table)
        reference_features = reference_features_spark.toPandas()
        print(f"✅ Loaded reference features from Feature Store")
        
    except ImportError:
        print(f"⚠️ Feature Engineering client not available, using direct table read")
        reference_features = spark.table(features_table).toPandas()
        print(f"✅ Loaded reference features from Unity Catalog table")
    except Exception as e:
        print(f"⚠️ Error reading from Feature Store: {str(e)}")
        reference_features = spark.table(features_table).toPandas()
        print(f"✅ Loaded reference features from Unity Catalog table")
    
    # Load targets
    target_data = spark.table(targets_table).toPandas()
    
    # Load predictions
    try:
        current_predictions = spark.table(predictions_table).toPandas()
        print(f"✅ Loaded predictions from Unity Catalog")
    except Exception as e:
        print(f"⚠️ Could not load predictions table: {str(e)}")
        print(f"   Table may not exist yet. Run 05_batch_inference.ipynb first.")
        # Create dummy predictions for demo
        current_predictions = pd.DataFrame({
            'CUSTOMERID': reference_features['CUSTOMERID'].sample(100).values,
            'PREDICTED_PRODUCT_ID': [101, 102, 103] * 33 + [101]
        })
        print(f"⚠️ Using dummy predictions for demo purposes")
        
else:
    # CSV mode
    reference_path = os.path.abspath('../data/processed/customer_features.csv')
    target_path = os.path.abspath('../data/processed/customer_targets.csv')
    predictions_path = os.path.abspath('../outputs/predictions/product_recommendations.csv')
    
    print(f"📂 Loading reference features from: {reference_path}")
    print(f"📂 Loading target data from: {target_path}")
    print(f"📂 Loading predictions from: {predictions_path}")
    
    reference_features = pd.read_csv(reference_path)
    target_data = pd.read_csv(target_path)
    
    try:
        current_predictions = pd.read_csv(predictions_path)
    except FileNotFoundError:
        print(f"⚠️ Predictions file not found. Using dummy data for demo.")
        current_predictions = pd.DataFrame({
            'CUSTOMERID': reference_features['CUSTOMERID'].sample(100).values,
            'PREDICTED_PRODUCT_ID': [101, 102, 103] * 33 + [101]
        })

print(f"\n✅ Loaded reference features: {len(reference_features)} records")
print(f"✅ Loaded target data: {len(target_data)} records")
print(f"✅ Loaded current predictions: {len(current_predictions)} records")

📊 Loading reference features from Feature Store: datafabric_catalog.ml_outputs.customer_features_fs
📂 Loading targets from: datafabric_catalog.ml_outputs.customer_targets
📂 Loading predictions from: datafabric_catalog.ml_outputs.product_recommendations
✅ Loaded reference features from Feature Store
✅ Loaded predictions from Unity Catalog

✅ Loaded reference features: 1000 records
✅ Loaded target data: 467 records
✅ Loaded current predictions: 2376 records


## 2. Analyze Prediction Distribution

In [0]:
pred_distribution = calculate_prediction_distribution(current_predictions)

print(f"\nPrediction Distribution:")
print(f"  Total Predictions: {pred_distribution['total_predictions']}")
print(f"  Unique Products: {pred_distribution['unique_products']}")
print(f"  Entropy: {pred_distribution['entropy']:.4f}")


Prediction Distribution:
  Total Predictions: 2376
  Unique Products: 10
  Entropy: 2.9897


## 3. Detect Feature Drift

In [0]:
# For drift detection, we'd need current production data
# For demo, we'll use a sample
current_sample = reference_features.sample(n=min(1000, len(reference_features)), random_state=42)

drift_results = detect_feature_drift(
    reference_features.select_dtypes(include=['number']),
    current_sample.select_dtypes(include=['number']),
    threshold=config['monitoring']['drift_thresholds']['feature_drift']
)

print(f"\nDrift Detection:")
print(f"  Features Checked: {drift_results['total_features_checked']}")
print(f"  Drifted Features: {drift_results['drifted_features_count']}")

if drift_results['drifted_features']:
    print(f"  Drifted Features: {', '.join(drift_results['drifted_features'][:5])}")

  'mean_change_pct': float((curr_values.mean() - ref_values.mean()) / ref_values.mean() * 100)



Drift Detection:
  Features Checked: 39
  Drifted Features: 0


## 4. Calculate Data Quality Metrics

In [0]:
data_quality = calculate_data_quality_metrics(current_sample)

print(f"\nData Quality:")
print(f"  Quality Score: {data_quality['quality_score']:.2f}")
print(f"  Missing Percentage: {data_quality['missing_percentage']:.2f}%")
print(f"  Duplicate Rows: {data_quality['duplicate_rows']}")


Data Quality:
  Quality Score: 1.00
  Missing Percentage: 0.00%
  Duplicate Rows: 0


## 5. Compare Model Performance

In [0]:
# Baseline metrics (from training)
baseline_metrics = {
    'accuracy': 0.75,
    'precision_weighted': 0.73,
    'recall_weighted': 0.75,
    'f1_weighted': 0.74
}

# Current metrics (simulated for demo)
current_metrics = {
    'accuracy': 0.72,
    'precision_weighted': 0.70,
    'recall_weighted': 0.72,
    'f1_weighted': 0.71
}

performance_comparison = compare_model_performance(
    current_metrics,
    baseline_metrics,
    threshold=config['monitoring']['drift_thresholds']['prediction_drift']
)

print(f"\nPerformance Comparison:")
print(f"  Degraded Metrics: {performance_comparison['degraded_metrics_count']}")

for metric in performance_comparison['metric_comparisons'].values():
    print(f"  {metric}")


Performance Comparison:
  Degraded Metrics: 0
  {'baseline': 0.75, 'current': 0.72, 'change': -0.030000000000000027, 'change_percentage': -4.0000000000000036, 'degraded': False}
  {'baseline': 0.73, 'current': 0.7, 'change': -0.030000000000000027, 'change_percentage': -4.109589041095894, 'degraded': False}
  {'baseline': 0.75, 'current': 0.72, 'change': -0.030000000000000027, 'change_percentage': -4.0000000000000036, 'degraded': False}
  {'baseline': 0.74, 'current': 0.71, 'change': -0.030000000000000027, 'change_percentage': -4.054054054054058, 'degraded': False}


## 6. Generate Monitoring Report

In [0]:
report_df = generate_monitoring_report(
    pred_distribution,
    drift_results,
    data_quality,
    performance_comparison,
    config
)

print("\nMonitoring Report:")
print(report_df.to_string(index=False))


Monitoring Report:
      monitoring_timestamp overall_status alerts  prediction_entropy  unique_products_predicted  total_predictions  drifted_features_count  data_quality_score  missing_percentage  degraded_metrics_count
2025-11-21 06:49:58.935603        HEALTHY   None            2.989731                         10               2376                       0                 1.0                 0.0                       0


## 7. Save Monitoring Results

In [0]:
detailed_results = {
    'prediction_distribution': pred_distribution,
    'drift_results': drift_results,
    'data_quality': data_quality,
    'performance_comparison': performance_comparison
}

save_monitoring_results(report_df, detailed_results, config)

print("\n✅ Monitoring results saved (CSV, JSON, HTML)!")


✅ Monitoring results saved (CSV, JSON, HTML)!


## 9. Monitoring Summary

## 8. Generate Evidently Reports

Generate an HTML report using Evidently's DataDriftPreset to check feature drift between reference (training) and current data.

The report is saved to the configured monitoring output directory with a timestamped filename.


In [0]:
# Generate Evidently HTML report for Data Drift
print("\n🔬 Generating Evidently Data Drift report...")

import datetime
from pathlib import Path
import os
import pandas as pd
import numpy as np

try:
    from evidently.report import Report
    from evidently.metric_preset import DataDriftPreset
    import evidently
    ev_version = getattr(evidently, '__version__', 'unknown')
    print(f"✅ Using Evidently version: {ev_version}")
except ImportError as e:
    print(f"❌ Evidently not available: {str(e)}")
    print("\n⚠️ Evidently was installed in cell 1. Please:")
    print("   1. Restart Python: dbutils.library.restartPython()")
    print("   2. Rerun cells 2-20")
    raise ImportError("Evidently not available. Python restart may be needed.")

# Prepare output directory
output_dir = config['monitoring']['output_path']
if not os.path.isabs(output_dir):
    output_dir = os.path.abspath(os.path.join('..', output_dir))
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Timestamp for file names
ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H%M%S')

# Build reference and current DataFrames
reference_df = reference_features.copy()
current_df = current_sample.copy()

# Convert all object columns to numeric (fix Decimal serialization)
print("\n🔧 Preparing data for Evidently...")
for col in reference_df.select_dtypes(include=['object']).columns:
    try:
        # Try to convert to numeric
        reference_df[col] = pd.to_numeric(reference_df[col], errors='coerce')
        current_df[col] = pd.to_numeric(current_df[col], errors='coerce')
        print(f"   • Converted {col} to numeric")
    except:
        pass

# Identifier-like columns to drop for drift analysis
id_like_cols = [c for c in reference_df.columns if c.lower() in {"customerid", "customer_id", "id", "partyid"}]

# Prepare feature-only DataFrames for drift detection (exclude ID columns)
ref_features_only = reference_df.drop(columns=id_like_cols, errors='ignore')
curr_features_only = current_df.drop(columns=id_like_cols, errors='ignore')

# Drop columns that are all NaN (empty after conversion)
empty_cols = []
for col in ref_features_only.columns:
    if ref_features_only[col].isna().all() or curr_features_only[col].isna().all():
        empty_cols.append(col)

if empty_cols:
    print(f"   • Dropping {len(empty_cols)} empty columns: {', '.join(empty_cols)}")
    ref_features_only = ref_features_only.drop(columns=empty_cols)
    curr_features_only = curr_features_only.drop(columns=empty_cols)

print(f"\n📊 Analyzing drift between:")
print(f"   Reference: {len(ref_features_only)} rows, {len(ref_features_only.columns)} features")
print(f"   Current: {len(curr_features_only)} rows, {len(curr_features_only.columns)} features")

# Generate Data Drift Report
print("\n   • Generating DataDriftPreset report...")
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_path = os.path.join(output_dir, f"evidently_data_drift_{ts}.html")

try:
    data_drift_report.run(reference_data=ref_features_only, current_data=curr_features_only)
    data_drift_report.save_html(data_drift_path)
    print(f"     ✅ Saved: {os.path.basename(data_drift_path)}")
    print(f"\n✅ Evidently Data Drift report generation complete.")
    print(f"📊 Report saved to: {data_drift_path}")
except Exception as ex:
    print(f"     ⚠️ Failed to generate report: {str(ex)}")
    print(f"     Continuing without Evidently report...")


🔬 Generating Evidently Data Drift report...
✅ Using Evidently version: 0.6.1

🔧 Preparing data for Evidently...
   • Converted AGE_GROUP to numeric
   • Converted TENURE_GROUP to numeric
   • Converted MIN_INTEREST_RATE to numeric
   • Converted MAX_INTEREST_RATE to numeric
   • Dropping 2 empty columns: AGE_GROUP, TENURE_GROUP

📊 Analyzing drift between:
   Reference: 1000 rows, 39 features
   Current: 1000 rows, 39 features

   • Generating DataDriftPreset report...


  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]


     ✅ Saved: evidently_data_drift_20251121_065001.html

✅ Evidently Data Drift report generation complete.
📊 Report saved to: /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/outputs/monitoring/evidently_data_drift_20251121_065001.html


In [0]:
print_section_header("Monitoring Summary")

status = report_df['overall_status'].values[0]
alerts = report_df['alerts'].values[0]

# Resolve absolute monitoring output path
import os
monitoring_path = config['monitoring']['output_path']
if not os.path.isabs(monitoring_path):
    workspace_root = os.path.abspath('..')
    monitoring_path = os.path.join(workspace_root, monitoring_path.lstrip('./'))

print(f"""
Overall Status: {status}
Alerts: {alerts}

Key Metrics:
- Prediction Entropy: {pred_distribution['entropy']:.4f}
- Drifted Features: {drift_results['drifted_features_count']}
- Data Quality Score: {data_quality['quality_score']:.2f}
- Degraded Metrics: {performance_comparison['degraded_metrics_count']}

📁 Monitoring outputs saved to: {monitoring_path}

📊 Reports Generated:
   ✓ monitoring_report_<timestamp>.csv - Summary metrics table
   ✓ monitoring_details_<timestamp>.json - Detailed results
   ✓ monitoring_report_<timestamp>.html - Interactive HTML dashboard
   ✓ evidently_data_drift_<timestamp>.html - Data Drift analysis report

💡 Open the HTML files in a browser for interactive visualizations!

✅ Model monitoring completed!
""")


                               Monitoring Summary                               


Overall Status: HEALTHY
Alerts: None

Key Metrics:
- Prediction Entropy: 2.9897
- Drifted Features: 0
- Data Quality Score: 1.00
- Degraded Metrics: 0

📁 Monitoring outputs saved to: /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/outputs/monitoring

📊 Reports Generated:
   ✓ monitoring_report_<timestamp>.csv - Summary metrics table
   ✓ monitoring_details_<timestamp>.json - Detailed results
   ✓ monitoring_report_<timestamp>.html - Interactive HTML dashboard
   ✓ evidently_data_drift_<timestamp>.html - Data Drift analysis report

💡 Open the HTML files in a browser for interactive visualizations!

✅ Model monitoring completed!

