# In [19]:
from datetime import datetime
import numpy as np
import pandas as pd

# Run timestamp (minute resolution, e.g. 20230601_2149) stamped on every result row.
dt_stamp = f"{datetime.now():%Y%m%d_%H%M}"

# Dummy dataset: two numeric columns, each containing one missing value.
data_df = pd.DataFrame(
    dict(
        age=[25, 35, 40, np.nan, 50, 60, 35, 65, 30, 45],
        income=[40000, 50000, 60000, 70000, np.nan, 80000, 75000, 65000, 70000, 55000],
    )
)

# Config table: one row per column to assess, holding the allowed value
# range, the expected data type, and the ids of the rules to run on it.
config_table = pd.DataFrame(
    dict(
        column=['age', 'income'],
        min_val=[0, 0],
        max_val=[100, 200000],
        type=['int', 'float'],
        rules=[['rule_01', 'rule_03'], ['rule_02', 'rule_03']],
    )
)

# Define rules
def check_completeness(df, config):
    """Measures how complete one column of a DataFrame is.

    Parameters:
    df (DataFrame): The data being assessed.
    config (Series): The configuration row; its 'column' entry names the
        column to inspect.

    Returns:
    float: Number of non-null values in the column divided by the number
        of rows in df.
    """
    present = df[config['column']].notna().sum()
    return present / len(df)

def check_range(df, config):
    """Tests whether every non-null value of a column lies inside a range.

    Parameters:
    df (DataFrame): The data being assessed.
    config (Series): The configuration row; supplies 'column', 'min_val'
        and 'max_val'. Both bounds are inclusive.

    Returns:
    int: 1 when all non-null values are within [min_val, max_val]
        (including the case where no non-null values exist), 0 otherwise.
    """
    column = config['column']
    lower, upper = config['min_val'], config['max_val']

    # Missing values are ignored here rather than counted as out of range.
    observed = df[column].dropna()
    return int(observed.between(lower, upper).all())

def check_datatype(df, config):
    """Checks whether a DataFrame column can be cast to a specified data type.

    The check is read-only: the cast is performed on a temporary Series and
    the input DataFrame is never modified.

    Parameters:
    df (DataFrame): The input DataFrame.
    config (Series): The configuration row; supplies 'column' and 'type'
        ('int' or 'float').

    Returns:
    int or float: 1 if the column can be cast to the requested type, 0 if
        the cast fails, np.nan if the requested type is not supported.

    Notes:
    For 'int' the column is cast to float first and then to int, so numeric
    strings pass, fractional values are truncated rather than rejected, and
    a column containing missing values fails (NaN cannot be cast to int).
    """
    column = config['column']
    data_type = config['type']

    # Sequence of casts that must all succeed for the requested type.
    if data_type == 'int':
        cast_chain = ('float', 'int')
    elif data_type == 'float':
        cast_chain = ('float',)
    else:
        return np.nan

    # Work on a local Series only; assigning back into df would silently
    # change the caller's data as a side effect of a mere check.
    candidate = df[column]
    try:
        for target in cast_chain:
            candidate = candidate.astype(target)
    except (ValueError, TypeError):
        # ValueError: unparsable strings / NaN-to-int; TypeError: objects
        # that cannot be converted to a number at all.
        return 0
    return 1
    
    
def assess_outcome(category, result):
    """
    Grades a rule result as a traffic-light color for its category.

    Parameters:
    category (str): The category of the rule ('completeness', 'validity'
        or 'consistency').
    result (float or int): The result of the rule.

    Returns:
    str: 'green', 'amber' or 'red'. For 'completeness' the result is graded
        against the thresholds 0.95 and 0.85 (strictly greater than); for
        'validity' and 'consistency' only a result equal to 1 is 'green'.
        Any other category yields None.
    """
    if category == 'completeness':
        # Thresholds are tried from strictest to loosest; first hit wins.
        for threshold, color in ((0.95, 'green'), (0.85, 'amber')):
            if result > threshold:
                return color
        return 'red'

    if category in ('validity', 'consistency'):
        return 'green' if result == 1 else 'red'

    # Add more categories as needed
    return None

# Rule registry: maps each rule id to a (check function, category) pair.
# The category decides how the check's result is graded into a color.
rules_dict = dict(
    rule_01=(check_completeness, 'completeness'),
    rule_02=(check_range, 'validity'),
    rule_03=(check_datatype, 'consistency'),
)

# Run rules and collect results.
# For every config row, each registered rule listed in that row's 'rules'
# entry is executed; exactly one result row is recorded per (column, rule)
# pair, even when the rule itself fails.
results = []
for idx, config in config_table.iterrows():
    for rule_name, rule_tuple in rules_dict.items():
        if rule_name not in config['rules']:
            continue

        # Unpacked outside the try so the failure path can always refer to
        # the category.
        rule_func, rule_category = rule_tuple
        try:
            result = rule_func(data_df, config)
            color = assess_outcome(rule_category, result)
        except Exception as e:
            # Best-effort: one failing rule must not abort the whole run,
            # but the reason is reported instead of being discarded.
            print(f'The rule {rule_name} failed with column {config["column"]}: {e!r}')
            result, color = np.nan, np.nan
        results.append([config['column'], rule_name, rule_category, result, color, dt_stamp])

# Create dataframe for results
columns = ['column', 'id', 'category', 'metric', 'result', 'assessment_date']
final_result = pd.DataFrame(results, columns=columns)
print(final_result)

# Output:
#    column       id      category  metric result assessment_date
# 0     age  rule_01  completeness     0.9  amber   20230601_2149
# 1     age  rule_03   consistency     0.0    red   20230601_2149
# 2  income  rule_02      validity     1.0  green   20230601_2149
# 3  income  rule_03   consistency     1.0  green   20230601_2149
