## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
    - Use the `schedule` library (third-party; install it with `pip install schedule`) to run the script periodically.
2. Script to Calculate Metrics:
    - For simplicity, use a function calculate_quality_metrics() that calculates and logs metrics such as missing rate or mismatch rate.
3. Store Logs:
    - Use Python's logging library to save these metrics over time.

In [1]:
# Write your code from here
import schedule
import time
import pandas as pd
from io import StringIO
import logging
from datetime import datetime

# Configure logging: INFO-and-above records go to data_quality_log.txt, each
# line prefixed with a timestamp and the level name.
# NOTE: logging.basicConfig() does nothing if the root logger already has
# handlers (for example when this cell is re-run in the same session).
logging.basicConfig(filename='data_quality_log.txt', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# --- Configuration ---
# Input files read on every run of calculate_quality_metrics().
DATA_FILE_PATH = 'daily_data.csv'  # Replace with your actual data file path
TRUSTED_SOURCE_PATH = 'trusted_data.csv' # Replace with your trusted source file path (if applicable for accuracy)
# Rates are fractions in [0, 1]; a rate below its threshold is logged as a WARNING.
ACCURACY_THRESHOLD = 0.95  # Example: 95% accuracy
COMPLETENESS_THRESHOLD = 0.98 # Example: 98% completeness
# Interval, in hours, passed to schedule.every(...).hours when the job is registered.
SCHEDULE_INTERVAL_HOURS = 24 # Run the check every 24 hours (daily)

def load_data(file_path):
    """Load a CSV file into a DataFrame, logging instead of raising on failure.

    The function is called from a long-running scheduled job, so a file that
    is missing, unreadable, empty, or malformed must not raise out of it;
    each of those cases is logged as an error and reported as ``None``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` if the file could not be
        read or parsed.
    """
    try:
        return pd.read_csv(file_path)
    except FileNotFoundError:
        logging.error(f"Data file not found: {file_path}")
    except OSError as exc:
        # Permission problems, the path being a directory, etc.
        logging.error(f"Could not read data file {file_path}: {exc}")
    except pd.errors.EmptyDataError:
        # Raised by read_csv for a zero-byte file (no header row to parse).
        logging.error(f"Data file is empty: {file_path}")
    except pd.errors.ParserError as exc:
        logging.error(f"Could not parse data file {file_path}: {exc}")
    return None

def calculate_accuracy(
    current_df,
    trusted_df,
    join_column='product_id',
    data_column='price_current',
    trusted_column='price_trusted',
):
    """Return the fraction of shared records whose value matches the trusted source.

    Records are paired with an inner join on ``join_column``; a pair counts as
    a match when ``current_df[data_column] == trusted_df[trusted_column]``.
    Missing values never compare equal, so a pair with a NaN on either side
    counts as a mismatch.

    Args:
        current_df: DataFrame under test, or ``None``.
        trusted_df: Reference DataFrame, or ``None``.
        join_column: Key column present in both frames.
        data_column: Value column in ``current_df``.
        trusted_column: Value column in ``trusted_df``. It may have the same
            name as ``data_column``.

    Returns:
        ``None`` if a frame or a required column is missing, ``0.0`` if the
        frames share no keys, otherwise the match rate as a float in [0, 1].
    """
    inputs_ok = (
        current_df is not None
        and trusted_df is not None
        and join_column in current_df.columns
        and join_column in trusted_df.columns
        and data_column in current_df.columns
        and trusted_column in trusted_df.columns
    )
    if not inputs_ok:
        logging.warning("Missing DataFrames or required columns for accuracy calculation.")
        return None

    # Keep only key + value from each side and rename them positionally to
    # fixed internal names. Merging the full frames breaks when both files use
    # the same value column name: pandas suffixes the clashing columns
    # (``price_x`` / ``price_y``) and the later lookup raises KeyError.
    current_values = current_df[[join_column, data_column]].set_axis(
        ["_key", "_current"], axis=1
    )
    trusted_values = trusted_df[[join_column, trusted_column]].set_axis(
        ["_key", "_trusted"], axis=1
    )

    merged_df = current_values.merge(trusted_values, on="_key", how="inner")
    if merged_df.empty:
        logging.warning("No common records found for accuracy comparison.")
        return 0.0

    match_count = (merged_df["_current"] == merged_df["_trusted"]).sum()
    return float(match_count / len(merged_df))

def calculate_completeness(df, columns_to_check):
    """Return the fraction of non-null cells across the given columns.

    A column listed in ``columns_to_check`` but absent from ``df`` is logged
    and counted as entirely missing, rather than raising ``KeyError``: an
    absent critical column is itself a completeness failure and must not stop
    the monitoring job.

    Args:
        df: DataFrame to inspect, or ``None``.
        columns_to_check: Names of the columns whose cells must be populated.

    Returns:
        ``None`` if ``df`` is ``None`` or no columns were given, ``1.0`` for a
        frame with no rows, otherwise the completeness rate as a float in
        [0, 1].
    """
    if df is None or not columns_to_check:
        logging.warning("Missing DataFrame or columns to check for completeness.")
        return None

    total_rows = len(df)
    if total_rows == 0:
        logging.warning("DataFrame is empty, completeness is 100%.")
        return 1.0

    present = [col for col in columns_to_check if col in df.columns]
    absent = [col for col in columns_to_check if col not in df.columns]
    if absent:
        logging.warning(f"Columns not found in DataFrame, counted as fully missing: {absent}")

    # Null cells in the columns that exist, plus every cell of each absent column.
    missing_counts = int(df[present].isnull().sum().sum()) + len(absent) * total_rows
    # Non-zero here: both total_rows and columns_to_check were checked above.
    total_expected_values = total_rows * len(columns_to_check)
    return float(1 - missing_counts / total_expected_values)

def calculate_quality_metrics():
    """Run one data quality check and write the results to the log.

    Loads the file at ``DATA_FILE_PATH``, then:

    * if the file at ``TRUSTED_SOURCE_PATH`` can be loaded, logs the accuracy
      rate against it and warns when it is below ``ACCURACY_THRESHOLD``;
    * logs the completeness rate of the critical columns and warns when it is
      below ``COMPLETENESS_THRESHOLD``.

    The function is the scheduled job. The ``schedule`` library does not catch
    exceptions raised by jobs, so an unexpected error here would propagate out
    of ``run_pending()`` and end the monitoring loop. Any such error is
    therefore logged with its traceback and swallowed, letting the next
    scheduled run proceed.

    Returns:
        None. All results are reported through ``logging``.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logging.info(f"--- Running Data Quality Check at {now} ---")

    try:
        current_data_df = load_data(DATA_FILE_PATH)
        if current_data_df is None:
            # load_data has already logged why; nothing to measure this run.
            return

        # --- Calculate Accuracy (if a trusted source is available) ---
        trusted_data_df = load_data(TRUSTED_SOURCE_PATH)
        if trusted_data_df is not None:
            accuracy = calculate_accuracy(current_data_df, trusted_data_df)
            if accuracy is not None:
                logging.info(f"Accuracy Rate: {accuracy:.4f}")
                if accuracy < ACCURACY_THRESHOLD:
                    logging.warning(f"Accuracy below threshold ({ACCURACY_THRESHOLD:.2f}).")

        # --- Calculate Completeness for critical fields ---
        critical_columns = ['transaction_id', 'amount', 'date'] # Adjust to your critical columns
        completeness = calculate_completeness(current_data_df, critical_columns)
        if completeness is not None:
            logging.info(f"Completeness Rate (for {critical_columns}): {completeness:.4f}")
            if completeness < COMPLETENESS_THRESHOLD:
                logging.warning(f"Completeness below threshold ({COMPLETENESS_THRESHOLD:.2f}).")
    except Exception:
        # Job boundary: record the traceback and keep the scheduler alive.
        logging.exception("Data Quality Check failed with an unexpected error.")
        return

    logging.info("--- Data Quality Check Completed ---")

# Register the quality check with `schedule` so it is due every
# SCHEDULE_INTERVAL_HOURS hours. This only registers the job (at import time);
# nothing runs until run_pending() is called and the job has become due, so
# the first check happens one full interval after start-up.
schedule.every(SCHEDULE_INTERVAL_HOURS).hours.do(calculate_quality_metrics)

if __name__ == "__main__":
    logging.info("Data Quality Monitoring System Started.")
    # Poll the scheduler once per second, forever; the loop has no exit
    # condition, so monitoring ends only when the process is stopped.
    while True:
        schedule.run_pending()
        time.sleep(1)

ModuleNotFoundError: No module named 'schedule'