In [0]:
# Install required packages (run only once)
# Every dependency is pinned to an exact version so repeated runs install the
# same packages.
# NOTE: after the install finishes, the kernel may need to be restarted
# (%restart_python or dbutils.library.restartPython()) before the updated
# packages can be used.
%pip install evidently==0.7.19 pyyaml==6.0.2 pandas==2.2.3 numpy==2.2.1 azure-storage-file-datalake==12.20.0 azure-identity==1.19.0 plotly==5.24.1

# Confirmation printed once the install command above has returned
print("Packages installed successfully")

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
Packages installed successfully


In [0]:
# Standard-library and third-party imports
import os
import sys
import warnings
from pathlib import Path

import pandas as pd

# Suppress every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Extend the import search path so the project's `utils` can be imported
sys.path.extend([
    '/Workspace/Users/<<USER_ID>>/data-drift-evidently-ai',  # Databricks workspace location (adjust as needed)
    str(Path.cwd().parent),  # parent of the working directory, for local testing
])

# Project-local helpers
from utils import (
    ConfigManager,
    DriftDetector,
    ReportManager,
    DataLoader,
    setup_logger,
)

print("âœ“ All libraries imported successfully")


âœ“ All libraries imported successfully


## Configuration and Setup

In [0]:
# Location of the drift configuration file (adjust as needed)
CONFIG_PATH = '../config/drift_config.yaml'

# On Databricks, point at the workspace copy of the file instead
# CONFIG_PATH = '/Workspace/Repos/<your-repo>/data-drift-evidently-ai/config/drift_config.yaml'

# Build the configuration manager and echo the key settings. Any failure is
# reported and then re-raised so execution stops at this cell.
try:
    config = ConfigManager(CONFIG_PATH)
    print("âœ“ Configuration loaded successfully")
    print("  - Catalog:", config.get_catalog_name())
    print("  - Schema:", config.get_schema_name())
    print("  - Tables to monitor:", len(config.get_tables()))
except Exception as config_error:
    print(f"âœ— Error loading configuration: {config_error}")
    raise

âœ“ Configuration loaded successfully
  - Catalog: data_catalog
  - Schema: outputs
  - Tables to monitor: 3


In [0]:
# Create the notebook's logger. The ADLS settings are forwarded only when
# ADLS output is enabled in the configuration; otherwise None is passed.
logger = setup_logger(
    logger_name='data_drift',
    log_level=config.get_log_level(),
    adls_config=None if not config.is_adls_output_enabled() else config.get_adls_config(),
)

# Opening banner, followed by the path of the configuration in use
for banner_line in (
    "=" * 80,
    "Data Drift Detection - Python Version",
    "=" * 80,
    f"Configuration loaded from: {CONFIG_PATH}",
):
    logger.info(banner_line)

INFO - Data Drift Detection - Python Version
INFO - Configuration loaded from: ../config/drift_config.yaml


## Initialize Components

In [0]:
# Create the three helper objects used by the rest of the notebook. Each one
# is built from the shared `config` object and logs a confirmation line.

# Initialize drift detector
drift_detector = DriftDetector(config)
logger.info("âœ“ Drift detector initialized")

# Initialize report manager
report_manager = ReportManager(config)
logger.info("âœ“ Report manager initialized")

# Initialize data loader with a Spark session.
# `spark` is not defined anywhere in this notebook; it is expected to be the
# session object supplied by the Databricks runtime.
# NOTE(review): outside Databricks this name must be created before this cell
# runs, otherwise the line below raises NameError.
data_loader = DataLoader(config, spark=spark)  # Pass spark session from Databricks
logger.info("âœ“ Data loader initialized")

print("\nâœ“ All components initialized successfully")

INFO - âœ“ Drift detector initialized
INFO - âœ“ Report manager initialized
INFO - âœ“ Data loader initialized

âœ“ All components initialized successfully


## Process Each Table

Loop through configured tables and detect drift

In [0]:
# Tables to monitor come from the configuration. One dict per table is
# collected in results_summary: drift metrics and report paths on success,
# or the table name plus an 'error' message on failure.
tables = config.get_tables()
results_summary = []
table_count = len(tables)
separator = "=" * 80

print(f"\nProcessing {table_count} tables for drift detection...")
print(separator)

for position, table_config in enumerate(tables, start=1):
    table_name = table_config['name']
    columns = table_config.get('columns', 'all')

    logger.info("\n" + separator)
    logger.info(f"Processing table {position}/{table_count}: {table_name}")
    logger.info(separator)

    try:
        # Fetch the reference and current versions of the table
        logger.info(f"Loading data for table: {table_name}")
        reference_data, current_data = data_loader.load_table_versions(
            table_name=table_name,
            use_spark=True  # Using Spark to load from Unity Catalog
        )

        logger.info(f"Data loaded - Reference: {len(reference_data)} rows, Current: {len(current_data)} rows")

        # Sampling is delegated to the detector and applied to both versions
        reference_data = drift_detector.apply_sampling(reference_data)
        current_data = drift_detector.apply_sampling(current_data)

        # Run the comparison; a 'columns' value of 'all' is passed on as None
        logger.info("Running drift detection...")
        selected_columns = None if columns == 'all' else columns
        report, drift_summary = drift_detector.detect_drift(
            reference_data=reference_data,
            current_data=current_data,
            table_name=table_name,
            columns=selected_columns,
        )

        # Log the outcome: a warning when drift was flagged, info otherwise
        drift_found = drift_summary['dataset_drift']
        if not drift_found:
            logger.info(f"âœ“ No significant drift detected in {table_name}")
        else:
            logger.warning(f"âš ï¸  DRIFT DETECTED in {table_name}")
            logger.warning(f"   Drifted columns: {drift_summary['drifted_columns']}")

        logger.info(f"   Total columns: {drift_summary['num_columns']}")
        logger.info(f"   Drifted columns: {drift_summary['num_drifted_columns']}")
        logger.info(f"   Drift share: {drift_summary['drift_share']:.2%}")

        # Persist the reports and log where each one was written
        logger.info(f"Saving reports for {table_name}...")
        saved_paths = report_manager.save_reports(
            report=report,
            drift_summary=drift_summary,
            table_name=table_name
        )

        for report_type, path in saved_paths.items():
            logger.info(f"   {report_type}: {path}")

        # Record the metrics for the summary section further down
        table_result = {
            'table_name': table_name,
            'total_columns': drift_summary['num_columns'],
            'drifted_columns': drift_summary['num_drifted_columns'],
            'drift_share': drift_summary['drift_share'],
            'dataset_drift': drift_found,
            'report_paths': saved_paths,
        }
        results_summary.append(table_result)

        print(f"\nâœ“ Completed: {table_name}")
        if not drift_found:
            print(f"  âœ“ No drift detected")
        else:
            print(f"  âš ï¸  DRIFT DETECTED ({drift_summary['num_drifted_columns']} columns)")

    except Exception as exc:
        # A failing table must not stop the run: log it with the traceback,
        # record the error, and continue with the next table.
        logger.error(f"âœ— Error processing table {table_name}: {exc}", exc_info=True)
        print(f"\nâœ— Error processing {table_name}: {exc}")
        results_summary.append({'table_name': table_name, 'error': str(exc)})

    print("-"*80)

logger.info("\n" + separator)
logger.info("Drift detection completed for all tables")
logger.info(separator)


Processing 3 tables for drift detection...
INFO - 
INFO - Processing table 1/3: customer_data
INFO - Loading data for table: customer_data
Comparing UC versions: 2 (reference) vs 3 (current)
INFO - Data loaded - Reference: 10000 rows, Current: 10000 rows
INFO - Running drift detection...
INFO -    Total columns: 9
INFO -    Drifted columns: 6
INFO -    Drift share: 75.00%
INFO - Saving reports for customer_data...
INFO -    local_html: /Workspace/Users/ashu.009kamboj@gmail.com/data-drift-evidently-ai/reports/html/customer_data_20260120_165052_drift_report.html
INFO -    local_json: /Workspace/Users/ashu.009kamboj@gmail.com/data-drift-evidently-ai/reports/json/customer_data_20260120_165052_drift_report.json

âœ“ Completed: customer_data
  âš ï¸  DRIFT DETECTED (6 columns)
--------------------------------------------------------------------------------
INFO - 
INFO - Processing table 2/3: product_sales
INFO - Loading data for table: product_sales
Comparing UC versions: 2 (reference) vs

In [0]:
# Show the raw list of per-table result dicts collected by the processing loop
results_summary

[{'table_name': 'customer_data',
  'total_columns': 9,
  'drifted_columns': 6,
  'drift_share': 0.75,
  'dataset_drift': True,
  'report_paths': {'local_html': '/Workspace/Users/ashu.009kamboj@gmail.com/data-drift-evidently-ai/reports/html/customer_data_20260120_165052_drift_report.html',
   'local_json': '/Workspace/Users/ashu.009kamboj@gmail.com/data-drift-evidently-ai/reports/json/customer_data_20260120_165052_drift_report.json'}},
 {'table_name': 'product_sales',
  'total_columns': 4,
  'drifted_columns': 3,
  'drift_share': 0.75,
  'dataset_drift': True,
  'report_paths': {'local_html': '/Workspace/Users/ashu.009kamboj@gmail.com/data-drift-evidently-ai/reports/html/product_sales_20260120_165059_drift_report.html',
   'local_json': '/Workspace/Users/ashu.009kamboj@gmail.com/data-drift-evidently-ai/reports/json/product_sales_20260120_165059_drift_report.json'}},
 {'table_name': 'user_behavior',
  'total_columns': 9,
  'drifted_columns': 5,
  'drift_share': 0.625,
  'dataset_drift': 

## Summary Report

Display overall drift detection results

In [0]:
# Build the summary frame: one row per dict in results_summary. Successful
# rows carry the drift metrics and report paths; failed rows carry only
# 'table_name' and 'error'.
summary_df = pd.DataFrame(results_summary)
print(summary_df)

print("\n" + "="*80)
print("DRIFT DETECTION SUMMARY")
print("="*80)

# Make sure the two columns used for filtering exist. 'error' is missing when
# every table succeeded, and both are missing when results_summary is empty
# (without this guard an empty run fails with KeyError on 'table_name').
for required_column in ('table_name', 'error'):
    if required_column not in summary_df.columns:
        summary_df[required_column] = None

if summary_df.empty:
    print("\nNo tables were processed")

# Show tables with errors
error_tables = summary_df[summary_df['error'].notna()]
if not error_tables.empty:
    print("\nâš ï¸  Tables with errors:")
    for _, row in error_tables.iterrows():
        print(f"  - {row['table_name']}: {row['error']}")

# Show tables without errors: exclude every table name that has an error row
success_df = summary_df[~summary_df['table_name'].isin(error_tables['table_name'])]

if not success_df.empty:
    print(f"\nâœ“ Successfully processed {len(success_df)} tables\n")

    # Display summary table
    display_df = success_df[['table_name', 'total_columns', 'drifted_columns', 'drift_share', 'dataset_drift']].copy()

    # When failed rows are present, the count columns are upcast to float
    # (the failed rows hold NaN). Restore integers for the successful rows so
    # the table shows 9 rather than 9.0.
    for count_column in ('total_columns', 'drifted_columns'):
        counts = display_df[count_column]
        if pd.api.types.is_float_dtype(counts) and counts.notna().all():
            display_df[count_column] = counts.astype(int)

    # Count drifted tables from the boolean flag before it is replaced by a label
    total_drifted = int(display_df['dataset_drift'].apply(bool).sum())
    total_ok = len(display_df) - total_drifted

    display_df['drift_share'] = display_df['drift_share'].apply(lambda x: f"{x:.2%}")
    display_df['drift_status'] = display_df['dataset_drift'].apply(lambda x: 'âš ï¸ DRIFT' if x else 'âœ“ OK')
    display_df = display_df.drop('dataset_drift', axis=1)
    display_df.columns = ['Table', 'Total Columns', 'Drifted Columns', 'Drift Share', 'Status']

    print(display_df.to_string(index=False))

    # Overall statistics
    print(f"\n{'='*80}")
    print(f"Overall Statistics:")
    print(f"  - Total tables processed: {len(display_df)}")
    print(f"  - Tables with drift: {total_drifted}")
    print(f"  - Tables without drift: {total_ok}")
    print(f"{'='*80}\n")

# Point the reader at the report locations; the ADLS line is printed only
# when ADLS output is enabled in the configuration.
print("To view reports, check the output locations:")
print(f"  Local: reports (HTML: reports/html, JSON: reports/json)")
if config.is_adls_output_enabled():
    adls_config = config.get_adls_config()
    print(f"  ADLS: {adls_config['container']}/{adls_config['base_path']}")

      table_name  ...                                       report_paths
0  customer_data  ...  {'local_html': '/Workspace/Users/ashu.009kambo...
1  product_sales  ...  {'local_html': '/Workspace/Users/ashu.009kambo...
2  user_behavior  ...  {'local_html': '/Workspace/Users/ashu.009kambo...

[3 rows x 6 columns]

DRIFT DETECTION SUMMARY

âœ“ Successfully processed 3 tables

        Table  Total Columns  Drifted Columns Drift Share   Status
customer_data              9                6      75.00% âš ï¸ DRIFT
product_sales              4                3      75.00% âš ï¸ DRIFT
user_behavior              9                5      62.50% âš ï¸ DRIFT

Overall Statistics:
  - Total tables processed: 3
  - Tables with drift: 3
  - Tables without drift: 0

To view reports, check the output locations:
  Local: reports (HTML: reports/html, JSON: reports/json)


## View Sample Report

Display a sample drift report inline (optional)

In [0]:
# Optional: uncomment the block below to display an HTML report inline in the
# notebook. It shows the first result whose local HTML report exists on disk
# and skips results without report paths (for example, tables that failed).
# from IPython.display import HTML, display

# for result in results_summary:
#     html_path = result.get('report_paths', {}).get('local_html')
#     if html_path and os.path.exists(html_path):
#         print(f"Displaying report for: {result['table_name']}")
#         with open(html_path, 'r', encoding='utf-8') as f:
#             html_content = f.read()
#         display(HTML(html_content))
#         break

# Print the configured report locations. Each location line appears only when
# the corresponding output type is enabled in the configuration.
print("To view reports, check the output directories:")
if config.is_local_output_enabled():
    print(f"  Local: {config.get_local_output_path()}")
if config.is_adls_output_enabled():
    adls_config = config.get_adls_config()
    print(f"  ADLS: {adls_config['container']}/{adls_config['base_path']}")

To view reports, check the output directories:


## Next Steps

### Viewing Reports
- **HTML Reports**: Open in browser for interactive visualization
- **JSON Reports**: Use for programmatic analysis or integration with monitoring systems

### Recommended Actions
1. Review detailed reports for tables with detected drift
2. Investigate root causes of drift in specific columns
3. Adjust statistical test thresholds if needed in configuration
4. Schedule this notebook to run periodically for continuous monitoring
5. Set up alerting based on drift detection results

### Configuration Tips
- Adjust `column_drift_threshold` to control sensitivity
- Add or remove statistical tests based on your data characteristics
- Enable ADLS output for production use
- Configure sampling for very large datasets