In [0]:
import re
from typing import Any, Dict, List, Tuple

from pyspark.sql import DataFrame
from pyspark.sql.functions import (
    array,
    col,
    concat_ws,
    current_date,
    expr,
    length,
    lit,
    lower,
    size,
    trim,
    when,
)

In [0]:

def validate_bronze_rules(
    df: DataFrame,
    rules: Dict[str, List[Dict[str, Any]]],
    failed_rules: List[str],
) -> Tuple[DataFrame, DataFrame]:
    """
    Validates a DataFrame against bronze rules.

    Only rules whose ``rule_type`` is listed in ``failed_rules`` are
    evaluated; every other rule is ignored. Each evaluated rule yields a
    ``"<column>:<rule_type>"`` label for the rows that violate it.

    Parameters:
    - df: Input DataFrame
    - rules: dict mapping a column name to its list of rule dicts. Each rule
      dict carries a "rule_type" plus rule-specific keys ("pattern", "min",
      "max", "expected_type"). The "*" key is skipped here.
    - failed_rules: list of rule_types that result in hard failure

    Returns:
    - clean_df: rows with no failure reason
    - failed_df: rows with at least one failure reason

    Both outputs carry the helper columns "failed_raw", "failed_reasons",
    "all_reasons" and "reasons_str" (comma-joined reasons).
    """
    failed_exprs = []

    for col_name, rule_list in rules.items():
        if col_name == "*":
            continue  # global or dataset-level rules not handled here

        for rule in rule_list:
            rule_type = rule.get("rule_type")

            # Rules that are not configured as hard failures are not evaluated.
            if rule_type not in failed_rules:
                continue

            # --- Rule Implementations ---
            # `condition` is true for rows that VIOLATE the rule.
            if rule_type == "null_check":
                condition = col(col_name).isNull()

            elif rule_type == "regex_check":
                pattern = rule.get("pattern", ".*")
                condition = ~col(col_name).rlike(pattern)

            elif rule_type == "boolean_check":
                condition = ~lower(col(col_name).cast("string")).isin(
                    "true", "false", "yes", "no", "y", "n"
                )

            elif rule_type == "range_check":
                min_val = rule.get("min")
                max_val = rule.get("max")
                # Support one-sided ranges explicitly instead of comparing
                # against a NULL bound; a rule with no bounds checks nothing.
                if min_val is None and max_val is None:
                    continue
                if min_val is None:
                    condition = col(col_name) > max_val
                elif max_val is None:
                    condition = col(col_name) < min_val
                else:
                    condition = ~(col(col_name).between(min_val, max_val))

            elif rule_type == "data_type_check":
                expected_type = rule.get("expected_type")
                # A value fails when casting it to the expected type gives NULL.
                if expected_type == "integer":
                    condition = col(col_name).cast("int").isNull()
                elif expected_type == "string":
                    condition = col(col_name).cast("string").isNull()
                elif expected_type == "boolean":
                    condition = ~lower(col(col_name).cast("string")).isin("true", "false")
                elif expected_type == "date":
                    condition = col(col_name).cast("date").isNull()
                else:
                    continue  # Unknown type

            elif rule_type == "date_format_check":
                pattern = rule.get("pattern", r"^\d{4}-\d{2}-\d{2}$")
                condition = ~col(col_name).rlike(pattern)

            else:
                continue  # Unhandled rule_type

            # The label is emitted only for violating rows; other rows get NULL.
            failed_exprs.append(when(condition, lit(f"{col_name}:{rule_type}")))

    # One array slot per evaluated rule. The empty case is cast so the column
    # is array<string> whether or not any rule applied.
    failed_raw = array(*failed_exprs) if failed_exprs else array().cast("array<string>")

    df_with_flags = (
        df.withColumn("failed_raw", failed_raw)
        # Drop the NULL slots left by rules that the row passed.
        .withColumn("failed_reasons", expr("filter(failed_raw, x -> x is not null)"))
        # Combined view of all reasons (only failure reasons exist at this layer).
        .withColumn("all_reasons", col("failed_reasons"))
        .withColumn("reasons_str", concat_ws(",", col("all_reasons")))
    )

    # Splitting the dataframe
    clean_df = df_with_flags.filter(size(col("failed_reasons")) == 0)
    failed_df = df_with_flags.filter(size(col("failed_reasons")) > 0)

    return clean_df, failed_df


In [0]:


def validate_silver_rules(
    df: DataFrame,
    rules: Dict[str, List[Dict[str, Any]]],
    quarantine_rules: List[str]
) -> Tuple[DataFrame, DataFrame]:
    """
    Validates a DataFrame against silver rules with quarantine handling.

    Only rules whose ``rule_type`` is listed in ``quarantine_rules`` are
    evaluated; every other rule is ignored. Each evaluated rule yields a
    ``"<column>:<rule_type>"`` label for the rows that violate it.

    Parameters:
    - df: Input DataFrame
    - rules: dict mapping a column name to its list of rule dicts. Each rule
      dict carries a "rule_type" plus rule-specific keys ("max_length",
      "invalid_values", "allowed_values", "no_future", "min", "max",
      "expected_type"). The "*" key is skipped here.
    - quarantine_rules: list of rule_types that cause soft rejection (quarantine)

    Returns:
    - clean_df: rows with no quarantine reason
    - quarantine_df: rows with at least one quarantine reason

    Both outputs carry the helper columns "quarantine_raw",
    "quarantine_reasons" and "reasons_str" (comma-joined reasons).
    """
    quarantine_exprs = []

    for col_name, rule_list in rules.items():
        if col_name == "*":
            continue  # Global rules handled elsewhere if needed

        for rule in rule_list:
            rule_type = rule.get("rule_type")

            # Rules that are not configured for quarantine are not evaluated.
            if rule_type not in quarantine_rules:
                continue

            # --- Rule Implementations ---
            # `condition` is true for rows that VIOLATE the rule.
            if rule_type == "length_check":
                max_len = rule.get("max_length", 255)
                condition = length(col(col_name)) > max_len

            elif rule_type == "trim_check":
                condition = col(col_name) != trim(col(col_name))

            elif rule_type == "default_value_check":
                invalids = rule.get("invalid_values", [])
                # str() keeps non-string configured values (e.g. 0, -1) from
                # raising AttributeError on .lower().
                condition = lower(col(col_name)).isin([str(v).lower() for v in invalids])

            elif rule_type == "allowed_values_check":
                allowed = rule.get("allowed_values", [])
                condition = ~lower(col(col_name)).isin([str(v).lower() for v in allowed])

            elif rule_type == "date_validity_check":
                if rule.get("no_future", False):
                    condition = col(col_name).cast("date") > current_date()
                else:
                    continue

            elif rule_type == "range_check":
                min_val = rule.get("min")
                max_val = rule.get("max")
                # Either bound may be omitted; only the configured sides are
                # checked. A rule with no bounds checks nothing.
                if min_val is None and max_val is None:
                    continue
                value = col(col_name).cast("double")
                if min_val is None:
                    condition = value > float(max_val)
                elif max_val is None:
                    condition = value < float(min_val)
                else:
                    condition = (value < float(min_val)) | (value > float(max_val))

            elif rule_type == "data_type_check":
                expected_type = rule.get("expected_type")
                # A value fails when casting it to the expected type gives NULL.
                if expected_type == "integer":
                    condition = col(col_name).cast("int").isNull()
                elif expected_type == "float":
                    condition = col(col_name).cast("float").isNull()
                elif expected_type == "string":
                    condition = col(col_name).cast("string").isNull()
                elif expected_type == "boolean":
                    condition = ~lower(col(col_name).cast("string")).isin(
                        "true", "false", "yes", "no", "y", "n"
                    )
                elif expected_type == "date":
                    condition = col(col_name).cast("date").isNull()
                else:
                    continue

            else:
                continue  # Unknown or unimplemented rule

            # The label is emitted only for violating rows; other rows get NULL.
            quarantine_exprs.append(when(condition, lit(f"{col_name}:{rule_type}")))

    # One array slot per evaluated rule. The empty case is cast so the column
    # is array<string> whether or not any rule applied.
    quarantine_raw = (
        array(*quarantine_exprs) if quarantine_exprs else array().cast("array<string>")
    )

    # Attach quarantine arrays
    df_with_flags = (
        df.withColumn("quarantine_raw", quarantine_raw)
        # Drop the NULL slots left by rules that the row passed.
        .withColumn("quarantine_reasons", expr("filter(quarantine_raw, x -> x is not null)"))
        .withColumn("reasons_str", concat_ws(",", col("quarantine_reasons")))
    )

    # Separate rows
    clean_df = df_with_flags.filter(size(col("quarantine_reasons")) == 0)
    quarantine_df = df_with_flags.filter(size(col("quarantine_reasons")) > 0)

    return clean_df, quarantine_df
