## Data Quality Dashboard in Python

**Description**: Create a basic dashboard using a Python library (e.g., Plotly Dash) to visualize data quality metrics for a given dataset.

In [2]:
pip install dash


Defaulting to user installation because normal site-packages is not writeable
Collecting dash
  Downloading dash-3.0.4-py3-none-any.whl.metadata (10 kB)
Collecting Flask<3.1,>=1.0.4 (from dash)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Collecting itsdangerous>=2.1.2 (from Flask<3.1,>=1.0.4->dash)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting blinker>=1.6.2 (from Flask<3.1,>=1.0.4->dash)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Downloading dash-3.0.4-py3-none-any.whl (7.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.9/7.9 MB 22.2 MB/s eta 0:00:00
Downloading flask-3.0.3-py3-none-any.whl (101 kB)
Downloading werkzeug-3.0.6-py3-none-any.whl (227 kB)
Downloa

In [3]:
# Write your code from here
import dash
from dash import dcc, html
import plotly.graph_objs as go
import pandas as pd
from datetime import datetime

# Sample dataset: five customer records, stored column by column.
# Note that the third record has no email address (None).
data = dict(
    customer_id=[1, 2, 3, 4, 5],
    name=['John', 'Jane', 'Alice', 'Bob', 'Charlie'],
    email=[
        'john@example.com',
        'jane@example.com',
        None,
        'bob@example.com',
        'charlie@example.com',
    ],
    order_date=[
        '2023-01-01',
        '2023-03-01',
        '2022-07-15',
        '2023-04-10',
        '2023-02-20',
    ],
)

# Tabular view of the sample records; every metric below is computed on this frame.
df = pd.DataFrame(data=data)

# Data Quality Metrics

def calculate_completeness(df):
    """
    Calculates the completeness of the dataset.

    Completeness is the percentage of non-missing cells across the whole
    DataFrame (every row of every column).

    Args:
        df: pandas DataFrame to evaluate.

    Returns:
        float: completeness between 0 and 100. A DataFrame with no cells at
        all (no rows or no columns) returns 0.0, since there is no data
        present to count as complete.
    """
    total_data = df.size
    if total_data == 0:
        # Without this guard the division below is 0 / 0, which yields NaN
        # (plus a runtime warning) instead of a usable score.
        return 0.0

    missing_data = df.isnull().sum().sum()
    completeness = 100 - (missing_data / total_data * 100)
    # Return a plain Python float rather than a NumPy scalar.
    return float(completeness)

def calculate_uniqueness(df, column):
    """
    Calculates the uniqueness of the dataset based on the specified column.

    Uniqueness is the percentage of rows whose value in `column` is not a
    repeat of a value seen in an earlier row. The first occurrence of each
    value counts as unique; every later occurrence counts as a duplicate.

    Args:
        df: pandas DataFrame to evaluate.
        column: name of the column to check for repeated values.

    Returns:
        float: uniqueness between 0 and 100. A DataFrame with no rows returns
        0.0, since there are no entries to count as unique.

    Raises:
        ValueError: if `column` is not a column of `df`.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataset")

    total_rows = len(df)
    if total_rows == 0:
        # Guard the division below against a zero-row dataset.
        return 0.0

    duplicate_count = df[column].duplicated().sum()
    uniqueness = 100 - (duplicate_count / total_rows * 100)
    # Return a plain Python float rather than a NumPy scalar.
    return float(uniqueness)

def calculate_timeliness(df, date_column, reference_date=None):
    """
    Calculates the timeliness of the dataset based on the last update date.

    Timeliness is the number of whole days between the most recent date in
    `date_column` and the reference date. The input DataFrame is not modified.

    Args:
        df: pandas DataFrame to evaluate.
        date_column: name of the column holding the dates (datetime values or
            strings that pandas can parse).
        reference_date: the "now" to measure against; anything accepted by
            pd.Timestamp (datetime, date string, Timestamp). Defaults to the
            current local date and time.

    Returns:
        int: days elapsed since the most recent date. The value is negative
        when the most recent date lies after the reference date.

    Raises:
        ValueError: if `date_column` is not a column of `df`, if any value in
            it is missing or cannot be parsed as a date, or if the column has
            no values at all.
    """
    if date_column not in df.columns:
        raise ValueError(f"Column '{date_column}' not found in the dataset")

    # Parse into a local Series. Assigning the result back into `df` would
    # silently change the caller's column to datetimes as a side effect.
    parsed_dates = pd.to_datetime(df[date_column], errors='coerce')  # unparseable values become NaT

    # Missing and unparseable values both show up as NaT here; either one
    # makes the "most recent update" untrustworthy, so refuse to guess.
    if parsed_dates.isnull().any():
        raise ValueError(f"Invalid date values found in column '{date_column}'")

    if parsed_dates.empty:
        # With no dates there is no "most recent update" to measure from.
        raise ValueError(f"No date values found in column '{date_column}'")

    if reference_date is None:
        today = datetime.today()
    else:
        today = pd.Timestamp(reference_date)

    max_date = parsed_dates.max()
    days_since_last_update = (today - max_date).days
    return days_since_last_update

def calculate_consistency(df, column, validator=None):
    """
    Calculates the consistency of the dataset based on the specified column.

    Consistency is the percentage of rows whose value in `column` passes a
    validity check. By default a value is valid when it is a string that
    contains an "@" character (a minimal email-shape check), so missing
    values and non-string values count as invalid.

    Args:
        df: pandas DataFrame to evaluate.
        column: name of the column to validate.
        validator: optional callable taking one cell value and returning a
            truthy result when the value is valid. Defaults to the "@" check
            described above.

    Returns:
        float: consistency between 0 and 100. A DataFrame with no rows returns
        0.0, since there are no entries to count as valid.

    Raises:
        ValueError: if `column` is not a column of `df`.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in the dataset")

    if validator is None:
        def validator(value):
            # Default rule: a string containing "@" is treated as valid.
            return isinstance(value, str) and "@" in value

    total_rows = len(df)
    if total_rows == 0:
        # Guard the division below against a zero-row dataset.
        return 0.0

    valid_entries = sum(1 for value in df[column] if validator(value))
    consistency = valid_entries / total_rows * 100
    return consistency

# Evaluate each data quality dimension once, up front; the layout below
# embeds these values when it is built.
completeness_score = calculate_completeness(df)
uniqueness_score = calculate_uniqueness(df, column='customer_id')
timeliness_score = calculate_timeliness(df, date_column='order_date')
consistency_score = calculate_consistency(df, column='email')

# Initialize Dash app
app = dash.Dash(__name__)

# Styles shared by the four metric cards and by the two rows that hold them.
_CARD_STYLE = {'padding': 10, 'width': '45%', 'display': 'inline-block'}
_ROW_STYLE = {'display': 'flex', 'justifyContent': 'space-between'}


def _metric_card(title, text):
    """Return one dashboard card: an H3 heading above a single line of metric text."""
    return html.Div([html.H3(title), html.Div(text)], style=dict(_CARD_STYLE))


# timeliness_score is a day count, not a percentage. To plot it on the chart's
# "Score (%)" axis it is mapped to "100 minus days since last update" and then
# clamped to 0-100. Without the clamp, data older than 100 days is drawn as a
# large negative percentage (and a future-dated record would exceed 100).
timeliness_chart_score = min(max(100 - timeliness_score, 0), 100)

# Layout of the dashboard: title, two rows of metric cards, then a bar chart
# comparing all four metrics.
app.layout = html.Div([
    html.H1("Data Quality Dashboard", style={'textAlign': 'center'}),

    html.Div([
        _metric_card("Completeness", f"Completeness Score: {completeness_score:.2f}%"),
        _metric_card("Uniqueness", f"Uniqueness Score: {uniqueness_score:.2f}%"),
    ], style=dict(_ROW_STYLE)),

    html.Div([
        # The card shows the raw day count; only the chart uses the 0-100 mapping.
        _metric_card("Timeliness", f"Days Since Last Update: {timeliness_score} days"),
        _metric_card("Consistency", f"Consistency Score: {consistency_score:.2f}%"),
    ], style=dict(_ROW_STYLE)),

    html.Div([
        dcc.Graph(
            id='data-quality-graph',
            figure={
                'data': [
                    go.Bar(
                        x=['Completeness', 'Uniqueness', 'Timeliness', 'Consistency'],
                        y=[completeness_score, uniqueness_score, timeliness_chart_score, consistency_score],
                        name='Data Quality Score'
                    )
                ],
                'layout': go.Layout(
                    title='Data Quality Metrics',
                    xaxis={'title': 'Metric'},
                    yaxis={'title': 'Score (%)'}
                )
            }
        )
    ])
])

# Run the app: start the Dash server only when this code is executed directly
# (not when it is imported as a module).
# NOTE(review): debug=True is meant for local development — confirm it is
# switched off before this dashboard is shared or deployed.
if __name__ == '__main__':
    app.run(debug=True)