## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use the `schedule` library (install it with `pip install schedule`) to run the script periodically.
2. Script to Calculate Metrics:
    - For simplicity, write a single function, `calculate_quality_metrics()`, that calculates and logs metrics such as the missing-value rate or the mismatch (invalid-value) rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [1]:
# Write your code from here
import schedule
import time
import logging
import pandas as pd
import numpy as np
from datetime import datetime

# Tunable settings for the quality checks.
DATASET_PATH = 'your_dataset.csv'  # Placeholder: point this at the real CSV file
COMPLETENESS_THRESHOLD = 0.90  # Completeness rates below 0.90 trigger a warning
ACCURACY_THRESHOLD = 0.95  # Accuracy rates below 0.95 trigger a warning

# Route INFO-and-above records to a log file, each prefixed with a timestamp
# and severity so that metrics can be compared across runs.
logging.basicConfig(
    level=logging.INFO,
    filename='data_quality_log.txt',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

def calculate_quality_metrics(file_path, key_columns=None, price_column='price',
                              completeness_threshold=None, accuracy_threshold=None):
    """
    Calculate and log data quality metrics (completeness and accuracy) for a CSV.

    Completeness is the share of populated cells across ``key_columns``.
    Accuracy is the share of rows whose ``price_column`` holds a number
    strictly greater than zero; missing or non-numeric prices count as
    invalid, because they do not satisfy the "Price > 0" rule either.

    Problems with the input (file missing, file empty or malformed, required
    columns absent) are logged as errors and reported by returning ``None``
    instead of raising, so a scheduled caller keeps running.

    Args:
        file_path (str): Path to the dataset CSV file.
        key_columns (list[str] | None): Columns checked for missing values.
            Defaults to ``['product_id', 'customer_id', 'order_date']``.
        price_column (str): Column validated by the accuracy check.
        completeness_threshold (float | None): Rate below which a warning is
            logged. Defaults to the module-level ``COMPLETENESS_THRESHOLD``.
        accuracy_threshold (float | None): Rate below which a warning is
            logged. Defaults to the module-level ``ACCURACY_THRESHOLD``.

    Returns:
        dict | None: ``total_rows``, ``completeness_rate``, ``accuracy_rate``,
        ``missing_counts`` and ``invalid_price_count`` on success; ``None``
        when the dataset could not be evaluated.
    """
    if key_columns is None:
        key_columns = ['product_id', 'customer_id', 'order_date']
    else:
        key_columns = list(key_columns)
    # Fall back to the module-level constants only when no override is given.
    if completeness_threshold is None:
        completeness_threshold = COMPLETENESS_THRESHOLD
    if accuracy_threshold is None:
        accuracy_threshold = ACCURACY_THRESHOLD

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        logging.error(f"Dataset not found at: {file_path}")
        return None
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        logging.error(f"Could not parse dataset at {file_path}: {exc}")
        return None

    # Validate the schema up front: indexing an absent column would raise
    # KeyError and abort the whole check.
    missing_columns = [col for col in [*key_columns, price_column]
                       if col not in df.columns]
    if missing_columns:
        logging.error(f"Dataset at {file_path} is missing required columns: {missing_columns}")
        return None

    total_rows = len(df)

    # --- Completeness Metric: share of populated cells in the key columns ---
    missing_counts = df[key_columns].isnull().sum()
    total_cells = total_rows * len(key_columns)
    # With nothing to inspect (no rows or no key columns) the data is
    # treated as complete rather than dividing by zero.
    completeness_rate = float(1 - missing_counts.sum() / total_cells) if total_cells > 0 else 1.0
    completeness_percentage = completeness_rate * 100
    logging.info(f"Completeness Rate: {completeness_percentage:.2f}% (Missing in {key_columns}: {missing_counts.to_dict()})")

    if completeness_rate < completeness_threshold:
        logging.warning(f"Completeness below threshold ({completeness_threshold * 100:.2f}%).")

    # --- Accuracy Metric: share of rows with a numeric price above zero ---
    # Coerce first so text values become NaN instead of raising TypeError on
    # the comparison; NaN > 0 is False, so such rows are counted as invalid.
    prices = pd.to_numeric(df[price_column], errors='coerce')
    invalid_price_count = int(total_rows - (prices > 0).sum())
    accuracy_rate = float(1 - invalid_price_count / total_rows) if total_rows > 0 else 1.0
    accuracy_percentage = accuracy_rate * 100
    logging.info(f"Accuracy Rate (Price > 0): {accuracy_percentage:.2f}% (Invalid Price Count: {invalid_price_count})")

    if accuracy_rate < accuracy_threshold:
        logging.warning(f"Accuracy below threshold ({accuracy_threshold * 100:.2f}%).")

    return {
        'total_rows': total_rows,
        'completeness_rate': completeness_rate,
        'accuracy_rate': accuracy_rate,
        'missing_counts': {col: int(count) for col, count in missing_counts.items()},
        'invalid_price_count': invalid_price_count,
    }

def run_data_quality_check():
    """
    Run one data quality check on DATASET_PATH, bracketed by start/finish log lines.

    This function is the job handed to the scheduler. Any unexpected error
    raised by the check is logged with its traceback and suppressed, so a
    single failing run does not propagate out of the job and stop the
    monitoring loop that invoked it.
    """
    logging.info("--- Starting Data Quality Check ---")
    try:
        calculate_quality_metrics(DATASET_PATH)
    except Exception:
        # Top-level job boundary: record the failure and keep the service alive.
        logging.exception("Data quality check failed with an unexpected error")
    finally:
        logging.info("--- Data Quality Check Finished ---")

if __name__ == "__main__":
    # Register the quality check as a once-a-day job at 03:00.
    # Other cadences can be swapped in here, for example:
    #   schedule.every(1).hour.do(run_data_quality_check)
    #   schedule.every(5).minutes.do(run_data_quality_check)
    daily_at_three = schedule.every().day.at("03:00")
    daily_at_three.do(run_data_quality_check)

    logging.info("Data Quality Monitoring Service Started. Checks scheduled.")

    # Poll forever: each pass runs whatever jobs are due, then pauses for a
    # second before looking again.
    while True:
        schedule.run_pending()
        time.sleep(1)

ModuleNotFoundError: No module named 'schedule'