# Regular Data Quality Monitoring

## Objectives
1. Set up automated monitoring schedules
2. Create quality metrics dashboard
3. Implement alert system for quality issues

## Usage Guide
This notebook provides a flexible scheduling system for data quality monitoring. Here's how to use it:

### 1. Basic Setup
```python
# Create monitoring instance
monitoring = ScheduledMonitoring()
```

### 2. Schedule Jobs
`schedule_job(job_id, interval_type, interval_value)` runs the monitoring task every `interval_value` units of `interval_type`:
- Seconds: `monitoring.schedule_job('quick_check', 'seconds', 30)` (every 30 seconds)
- Minutes: `monitoring.schedule_job('regular_check', 'minutes', 5)` (every 5 minutes)
- Hourly: `monitoring.schedule_job('hourly_check', 'hourly', 2)` (every 2 hours)
- Daily: `monitoring.schedule_job('daily_report', 'daily', 1)` (every day)

### 3. Manage Jobs
- List jobs: `monitoring.list_jobs()`
- Remove job: `monitoring.remove_job('job_id')`

### 4. Run Scheduler
```python
monitoring.run_scheduler()  # Blocks until interrupted (Ctrl+C in a terminal, or Kernel > Interrupt in Jupyter)
```

### Example Usage
```python
# Initialize
monitoring = ScheduledMonitoring()

# Schedule various checks
monitoring.schedule_job('quick_check', 'seconds', 30)  # Every 30 seconds
monitoring.schedule_job('hourly_report', 'hourly', 1)  # Every hour

# View scheduled jobs
print(monitoring.list_jobs())

# Remove a job if needed
monitoring.remove_job('quick_check')

# Start monitoring
monitoring.run_scheduler()
```

In [11]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
import json
import schedule
import time
from pathlib import Path

# Configure the root logger once: INFO and above go both to a log file in the
# working directory and to the stream handler's default stream.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('data_quality_monitoring.log'), logging.StreamHandler()],
    level=logging.INFO,
)

In [12]:
# Load HR data files
def load_hr_data(data_files=None):
    """Load the HR extracts into pandas DataFrames.

    Args:
        data_files (dict | None): Optional mapping of dataset name to CSV
            path. When omitted, the three default dummy-data extracts
            ('people', 'assignments', 'addresses') are loaded.

    Returns:
        dict: One DataFrame per dataset name, in the mapping's iteration order.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a path does
            not exist.
    """
    if data_files is None:
        # Default extracts; paths are relative to the notebook's working directory.
        data_files = {
            'people': '../../dummy_data/PER_ALL_PEOPLE_F_20241216.csv',
            'assignments': '../../dummy_data/PER_ALL_ASSIGNMENTS_F_20241216.csv',
            'addresses': '../../dummy_data/HR_ALL_ADDRESSES_20241216.csv'
        }

    return {name: pd.read_csv(path) for name, path in data_files.items()}

In [13]:
class DataQualityMonitor:
    """Computes quality metrics for DataFrames and flags deviations from history.

    Every call to ``calculate_metrics`` appends its result to
    ``metrics_history``; ``detect_anomalies`` compares a metrics dict against
    the other entries of that history.
    """

    # Metrics that detect_anomalies compares against the recorded history.
    TRACKED_METRICS = ('total_records', 'duplicate_records')

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.metrics_history = []

    @staticmethod
    def _as_builtin(value):
        """Convert a pandas/numpy scalar to a plain Python number (NaN for missing)."""
        if pd.isna(value):
            return float('nan')
        return value.item() if hasattr(value, 'item') else value

    def calculate_metrics(self, df):
        """Calculate key data quality metrics and record them in the history.

        Args:
            df (pandas.DataFrame): Data to profile.

        Returns:
            dict: ``timestamp`` (ISO string), ``total_records``,
            ``missing_values_pct`` (fraction of nulls per column, 0.0-1.0),
            ``duplicate_records`` and ``column_stats`` (mean/std/min/max for
            every integer or float column). All numbers are plain Python
            values so the dict can be serialized with ``json``.
        """
        total_records = len(df)
        null_counts = df.isnull().sum()
        if total_records:
            missing = null_counts / total_records
        else:
            # An empty frame has no nulls; avoid the NaN that 0/0 would produce.
            missing = null_counts.astype('float64')

        metrics = {
            'timestamp': datetime.now().isoformat(),
            'total_records': total_records,
            'missing_values_pct': {column: float(value) for column, value in missing.items()},
            'duplicate_records': int(df.duplicated().sum()),
            'column_stats': {}
        }

        # Calculate column-specific statistics for every integer/float column,
        # regardless of bit width (int32, float32, nullable Int64, ...).
        for column in df.columns:
            series = df[column]
            dtype = series.dtype
            if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
                metrics['column_stats'][column] = {
                    'mean': self._as_builtin(series.mean()),
                    'std': self._as_builtin(series.std()),
                    'min': self._as_builtin(series.min()),
                    'max': self._as_builtin(series.max())
                }

        self.metrics_history.append(metrics)
        return metrics

    def detect_anomalies(self, current_metrics, threshold=2):
        """Detect anomalies in metrics compared to historical data.

        Args:
            current_metrics (dict): Metrics as returned by ``calculate_metrics``.
            threshold (float): Absolute z-score above which a metric is reported.

        Returns:
            list[str]: One message per tracked metric that deviates; empty when
            fewer than two historical records are available.
        """
        # Exclude the record under test by identity, wherever it sits in the history.
        history = [entry for entry in self.metrics_history if entry is not current_metrics]
        if len(history) < 2:
            # A standard deviation needs at least two historical observations.
            return []

        anomalies = []

        # Check for significant deviations
        for metric in self.TRACKED_METRICS:
            past_values = pd.Series([entry[metric] for entry in history], dtype='float64')
            historical_mean = past_values.mean()
            historical_std = past_values.std()
            deviation = abs(current_metrics[metric] - historical_mean)

            if historical_std > 0:
                z_score = deviation / historical_std
            elif deviation > 0:
                # History was perfectly constant, so any change is an outlier.
                z_score = float('inf')
            else:
                continue

            if z_score > threshold:
                anomalies.append(f"{metric} shows unusual pattern (z-score: {z_score:.2f})")

        return anomalies

In [14]:
class QualityDashboard:
    """Turns a metrics dict into a plain-text report and writes it to disk."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def generate_report(self, metrics, anomalies):
        """Build the text report for one metrics dict.

        Args:
            metrics (dict): Must provide 'timestamp', 'total_records',
                'duplicate_records', 'missing_values_pct' and 'column_stats'.
            anomalies (list[str]): Messages to list; the section is omitted
                when this is empty.

        Returns:
            str: The report lines joined with newlines.
        """
        section_rule = "-" * 20

        # Header and basic metrics
        lines = [
            "Data Quality Dashboard",
            "=" * 50,
            f"\nTimestamp: {metrics['timestamp']}",
            f"Total Records: {metrics['total_records']}",
            f"Duplicate Records: {metrics['duplicate_records']}",
        ]

        # Missing values, rendered as percentages of the stored fractions
        lines += ["\nMissing Values (%)", section_rule]
        lines.extend(
            f"{name}: {fraction:.2%}"
            for name, fraction in metrics['missing_values_pct'].items()
        )

        # Per-column statistics
        lines += ["\nColumn Statistics", section_rule]
        for name, column_stats in metrics['column_stats'].items():
            lines.append(f"\n{name}:")
            lines.extend(f"  {label}: {number:.2f}" for label, number in column_stats.items())

        # Anomalies, only when there is something to show
        if anomalies:
            lines += ["\nDetected Anomalies", section_rule]
            lines.extend(f"! {message}" for message in anomalies)

        return '\n'.join(lines)

    def save_report(self, report, filename='quality_report.txt'):
        """Write the report text to ``filename``; failures are logged, not raised."""
        try:
            with open(filename, 'w') as handle:
                handle.write(report)
            self.logger.info(f"Report saved to {filename}")
        except Exception as exc:
            self.logger.error(f"Failed to save report: {str(exc)}")

In [15]:
class ScheduledMonitoring:
    """Schedules and runs the data quality monitoring pipeline.

    Each run loads the HR datasets, computes metrics, writes a report per
    dataset and raises alerts for detected anomalies. Jobs are registered on a
    scheduler owned by this instance, so creating a new instance (for example
    by re-running a notebook cell) does not leave jobs of an older instance
    running.
    """

    # Public interval_type names mapped to the unit attribute of a schedule job.
    _INTERVAL_UNITS = {
        'seconds': 'seconds',
        'minutes': 'minutes',
        'hourly': 'hours',
        'daily': 'days',
    }

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Retained for backward compatibility; monitoring_task keeps one
        # monitor per dataset in self.monitors instead.
        self.monitor = DataQualityMonitor()
        # dataset name -> DataQualityMonitor, so each dataset is only compared
        # against its own history.
        self.monitors = {}
        self.dashboard = QualityDashboard()
        self.alert_system = AlertSystem()
        # Instance-local scheduler instead of the module-global default one.
        self.scheduler = schedule.Scheduler()
        self.jobs = {}

    def _process_dataset(self, dataset_name, df):
        """Compute metrics, write the report and alert for a single dataset."""
        if dataset_name not in self.monitors:
            self.monitors[dataset_name] = DataQualityMonitor()
        monitor = self.monitors[dataset_name]

        # Calculate metrics
        metrics = monitor.calculate_metrics(df)

        # Detect anomalies
        anomalies = monitor.detect_anomalies(metrics)

        # Generate report
        report = self.dashboard.generate_report(metrics, anomalies)
        self.dashboard.save_report(report, f'quality_report_{dataset_name}.txt')

        # Send alerts if anomalies detected
        if anomalies:
            self.alert_system.send_alert(dataset_name, anomalies)

    def monitoring_task(self):
        """Main monitoring task to be scheduled.

        Errors are logged rather than raised so that a failing run does not
        stop the scheduler loop; a failure in one dataset does not prevent the
        remaining datasets from being processed.
        """
        try:
            hr_data = load_hr_data()
        except Exception as e:
            self.logger.error(f"Error in monitoring task: {str(e)}", exc_info=True)
            return

        failed = []
        for dataset_name, df in hr_data.items():
            try:
                self._process_dataset(dataset_name, df)
            except Exception as e:
                failed.append(dataset_name)
                self.logger.error(
                    f"Error in monitoring task for {dataset_name}: {str(e)}",
                    exc_info=True,
                )

        if failed:
            self.logger.warning(f"Monitoring task completed with errors in: {failed}")
        else:
            self.logger.info("Monitoring task completed successfully")

    def schedule_job(self, job_id, interval_type, interval_value):
        """Schedule a new monitoring job

        Args:
            job_id (str): Unique identifier for the job
            interval_type (str): 'seconds', 'minutes', 'hourly', 'daily'
            interval_value (int): Value for the interval; must be positive

        Raises:
            ValueError: If interval_type is unknown or interval_value is not
                positive. An existing job with the same id is left untouched
                in that case.
        """
        # Validate before touching an existing job so that a bad request
        # cannot silently drop a working schedule.
        unit = self._INTERVAL_UNITS.get(interval_type)
        if unit is None:
            raise ValueError(f"Invalid interval type: {interval_type}")
        if interval_value <= 0:
            raise ValueError(f"Interval value must be positive: {interval_value}")

        if job_id in self.jobs:
            self.logger.warning(f"Job {job_id} already exists. Removing old schedule.")
            self.remove_job(job_id)

        job = getattr(self.scheduler.every(interval_value), unit).do(self.monitoring_task)

        self.jobs[job_id] = job
        self.logger.info(f"Scheduled new monitoring job: {job_id} ({interval_value} {interval_type})")

    def remove_job(self, job_id):
        """Remove a scheduled job; logs a warning when the id is unknown."""
        if job_id in self.jobs:
            self.scheduler.cancel_job(self.jobs[job_id])
            del self.jobs[job_id]
            self.logger.info(f"Removed job: {job_id}")
        else:
            self.logger.warning(f"Job {job_id} not found")

    def list_jobs(self):
        """List the ids of all scheduled jobs"""
        return list(self.jobs.keys())

    def run_scheduler(self):
        """Run the scheduler (blocking) until interrupted.

        Polls for due jobs once per second. A KeyboardInterrupt (Ctrl+C or a
        kernel interrupt) ends the loop and returns normally.
        """
        self.logger.info("Starting scheduler...")
        try:
            while True:
                self.scheduler.run_pending()
                time.sleep(1)
        except KeyboardInterrupt:
            self.logger.info("Scheduler stopped by user")

In [16]:
class AlertSystem:
    """Reports anomalies through the logger and a timestamped text file."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)

    def send_alert(self, dataset_name, anomalies):
        """Log an alert for ``dataset_name`` and save it to a file.

        The file is written to the current working directory as
        ``alert_<dataset_name>_<YYYYmmdd_HHMMSS>.txt``. A failure to write is
        logged as an error and not raised.

        Args:
            dataset_name (str): Name used in the message and the file name.
            anomalies (iterable of str): One bullet line is written per item.
        """
        heading = (
            f"\nData Quality Alert for {dataset_name}\n"
            + "=" * 50 + "\n"
            + "\nDetected Anomalies:\n"
        )
        bullets = "".join(f"- {item}\n" for item in anomalies)
        alert_message = heading + bullets

        # Log the alert
        self.logger.warning(alert_message)

        # Save alert to file; the name carries a second-resolution timestamp
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        alert_file = f"alert_{dataset_name}_{stamp}.txt"

        try:
            with open(alert_file, 'w') as handle:
                handle.write(alert_message)
            self.logger.info(f"Alert saved to {alert_file}")
        except Exception as exc:
            self.logger.error(f"Failed to save alert: {str(exc)}")

## Example Usage

Here are some examples of how to use the monitoring system:

In [17]:
# Initialize the monitoring system (creates the monitor, dashboard and alert helpers)
monitoring = ScheduledMonitoring()

# Example 1: Quick check every 30 seconds
monitoring.schedule_job('quick_check', 'seconds', 30)

# Example 2: Regular check every 5 minutes
monitoring.schedule_job('regular_check', 'minutes', 5)

# Example 3: Hourly check (every 1 hour)
monitoring.schedule_job('hourly_check', 'hourly', 1)

# Example 4: Daily report (every 1 day; schedule_job sets no time of day,
# so the run is not pinned to midnight)
monitoring.schedule_job('daily_report', 'daily', 1)

# List all scheduled jobs (list_jobs returns the job ids)
print("\nScheduled jobs:")
print(monitoring.list_jobs())

# Example: Remove a specific job by its id
print("\nRemoving 'quick_check' job...")
monitoring.remove_job('quick_check')

# Show updated job list
print("\nUpdated job list:")
print(monitoring.list_jobs())


Scheduled jobs:
['quick_check', 'regular_check', 'hourly_check', 'daily_report']

Removing 'quick_check' job...

Updated job list:
['regular_check', 'hourly_check', 'daily_report']


In [18]:
# Run the scheduler (uncomment to start)
# Note: This will block the notebook execution
# monitoring.run_scheduler()

In [24]:
# Clean up: remove each example job by id. Ids that are no longer scheduled
# only produce a "not found" warning from remove_job.
for job_id in ('quick_check', 'regular_check', 'hourly_check', 'daily_report'):
    print(f"\nRemoving '{job_id}' job...")
    monitoring.remove_job(job_id)


Removing 'quick_check' job...

Removing 'regular_check' job...

Removing 'hourly_check' job...

Removing 'daily_report' job...
