In [1]:
import pandas as pd

# Toy records used to exercise the validators: one list per column.
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'David', 'David'],
    age=[25, 17, 65, 45, 18],
    salary=[50000, 60000, 200000, 75000, 75000],
)

# Materialise the column lists as a DataFrame (default integer index).
df = pd.DataFrame(data=data)


In [12]:
import functools
import os
from datetime import datetime

import pandas as pd

class DataValidator:
    """Decorator factory that checks a DataFrame before a function runs.

    Each ``*_check`` method returns a decorator. The decorated function must
    receive the DataFrame as its first positional argument. A check never
    blocks or alters the call: rows that fail it are only persisted (when
    ``store`` is true), after which the wrapped function is invoked with its
    original arguments and its return value is passed through.
    """

    def __init__(self, store=False, history=False, united=True, path="./validation_logs", file_type="pkl"):
        """Configure where and how failing rows are persisted.

        Args:
            store: When True, rows that fail a check are written to disk.
            history: When True, files go into a ``YYYY-MM-DD`` sub-directory
                of ``path`` (date taken when the validator is created).
            united: When True, all failing rows are accumulated into one
                file named ``log``; otherwise each check writes a file named
                after its ``name`` argument.
            path: Base output directory. It is created if missing,
                regardless of ``store``.
            file_type: One of ``csv``, ``xlsx``, ``pkl``, ``txt``
                (case-insensitive). An unsupported value is only rejected
                when a file is actually written.

        Raises:
            TypeError: If ``store``, ``united`` or ``history`` is not a bool,
                or ``file_type`` is not a string.
        """
        # Validate before any argument is used, so a wrong type raises the
        # intended TypeError instead of e.g. AttributeError from .lower().
        if not isinstance(store, bool):
            raise TypeError("The 'store' argument must be a boolean.")
        if not isinstance(united, bool):
            raise TypeError("The 'united' argument must be a boolean.")
        if not isinstance(history, bool):
            raise TypeError("The 'history' argument must be a boolean.")
        if not isinstance(file_type, str):
            raise TypeError("The 'file_type' argument must be a string.")

        self.store = store
        self.united = united
        self.history = history
        self.file_type = file_type.lower()

        if history:
            self.path = os.path.join(path, f"{datetime.now().strftime('%Y-%m-%d')}")
        else:
            self.path = path

        # In-memory accumulator backing the combined log used when united=True.
        self.all_validations_df = pd.DataFrame()

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.path, exist_ok=True)

    @staticmethod
    def _require_column(df, column):
        """Raise TypeError if ``column`` is not one of ``df``'s columns."""
        if column not in df.columns:
            raise TypeError(f"Error: Column '{column}' not found in DataFrame.")

    def _record(self, invalid_rows, name):
        """Persist ``invalid_rows`` under ``name`` when storing is enabled and there is something to store."""
        if self.store and not invalid_rows.empty:
            self.save(invalid_rows, name)

    def range_check(self, *, column: str, borders: list, name: str, **kwargs):
        """Build a decorator that records rows whose ``column`` value lies outside every interval.

        Args:
            column: Column to test.
            borders: List of ``(bottom, top)`` tuples; a value passes when it
                falls inside at least one interval (``Series.between`` with
                its default bounds handling).
            name: Label stored with the failing rows (and the file name when
                ``united`` is False).
            **kwargs: Accepted and ignored.

        Raises:
            TypeError: On wrongly typed arguments (at decoration time) or when
                ``column`` is missing from the DataFrame (at call time).
        """
        if not isinstance(column, str):
            raise TypeError("The 'column' argument must be a string.")
        if not isinstance(borders, list) or not all(isinstance(i, tuple) and len(i) == 2 for i in borders):
            raise TypeError("The 'borders' argument must be a list of tuples with two numeric values.")
        if not isinstance(name, str):
            raise TypeError("The 'name' argument must be a string.")

        def decorator(func):
            @functools.wraps(func)
            def wrapper(df, *args, **kwargs_func):
                self._require_column(df, column)

                values = df[column]
                # Build the mask on df's own index: a fresh RangeIndex would be
                # label-aligned against df and break for any non-default index.
                in_range_mask = pd.Series(False, index=df.index, dtype=bool)

                for bottom, top in borders:
                    in_range_mask |= values.between(bottom, top)

                self._record(df.loc[~in_range_mask].copy(), name)

                return func(df, *args, **kwargs_func)
            return wrapper
        return decorator

    def value_check(self, *, column: str, allowed: list = None, not_allowed: list = None, name: str, **kwargs):
        """Build a decorator that records rows with disallowed values in ``column``.

        A row fails when its value is missing from ``allowed`` (if given) or
        present in ``not_allowed`` (if given). With neither list supplied the
        check records nothing.

        Args:
            column: Column to test.
            allowed: Optional whitelist of values.
            not_allowed: Optional blacklist of values.
            name: Label stored with the failing rows (and the file name when
                ``united`` is False).
            **kwargs: Accepted and ignored.

        Raises:
            TypeError: On wrongly typed arguments (at decoration time) or when
                ``column`` is missing from the DataFrame (at call time).
        """
        if not isinstance(column, str):
            raise TypeError("The 'column' argument must be a string.")
        if allowed is not None and not isinstance(allowed, list):
            raise TypeError("The 'allowed' argument must be a list.")
        if not_allowed is not None and not isinstance(not_allowed, list):
            raise TypeError("The 'not_allowed' argument must be a list.")
        if not isinstance(name, str):
            raise TypeError("The 'name' argument must be a string.")

        def decorator(func):
            @functools.wraps(func)
            def wrapper(df, *args, **kwargs_func):
                self._require_column(df, column)

                values = df[column]
                # A single combined mask keeps a row that violates both lists
                # from being logged twice and preserves the original row order.
                invalid_mask = pd.Series(False, index=df.index, dtype=bool)

                if allowed is not None:
                    invalid_mask |= ~values.isin(allowed)

                if not_allowed is not None:
                    invalid_mask |= values.isin(not_allowed)

                self._record(df.loc[invalid_mask], name)

                return func(df, *args, **kwargs_func)
            return wrapper
        return decorator

    def custom_check(self, *, custom_logic, name: str, **kwargs):
        """Build a decorator that records rows selected by user-supplied logic.

        Args:
            custom_logic: Either a ``DataFrame.query`` expression, or a
                callable taking the DataFrame and returning a Series (used to
                index the DataFrame with ``.loc``) or a DataFrame of rows.
                In both forms the selected rows are the ones recorded.
            name: Label stored with the recorded rows (and the file name when
                ``united`` is False).
            **kwargs: Accepted and ignored.

        Raises:
            TypeError: On wrongly typed arguments, or when the callable
                returns something other than a Series or DataFrame.
            ValueError: When evaluating the expression or callable fails; the
                original exception is chained as ``__cause__``.
        """
        if not (isinstance(custom_logic, str) or callable(custom_logic)):
            raise TypeError("The 'custom_logic' argument must be a string or a callable (function).")
        if not isinstance(name, str):
            raise TypeError("The 'name' argument must be a string.")

        def decorator(func):
            @functools.wraps(func)
            def wrapper(df, *args, **kwargs_func):
                if isinstance(custom_logic, str):
                    # Evaluate the expression against the DataFrame's columns.
                    try:
                        invalid_rows = df.query(custom_logic)
                    except Exception as e:
                        raise ValueError(f"Error in custom logic: {str(e)}") from e
                else:
                    # custom_logic was verified to be callable at decoration time.
                    try:
                        invalid_rows = custom_logic(df)
                    except Exception as e:
                        raise ValueError(f"Error in custom function: {str(e)}") from e

                    if isinstance(invalid_rows, pd.Series):
                        invalid_rows = df.loc[invalid_rows].copy()
                    elif not isinstance(invalid_rows, pd.DataFrame):
                        raise TypeError("The custom function must return a pandas Series or DataFrame.")

                self._record(invalid_rows, name)

                return func(df, *args, **kwargs_func)
            return wrapper
        return decorator

    def save(self, outliers, name):
        """Tag ``outliers`` with the check name and write them out.

        With ``united`` enabled the rows are appended to the in-memory
        accumulator and the whole accumulator is rewritten to ``log``;
        otherwise only these rows are written to a file named ``name``.
        """
        # Work on a copy so the caller's frame does not gain the extra column.
        outliers = outliers.copy()
        outliers["Validation Name"] = name

        if self.united:
            self.all_validations_df = pd.concat([self.all_validations_df, outliers], ignore_index=True)
            self.save_file(self.all_validations_df, os.path.join(self.path, "log"))
        else:
            self.save_file(outliers, os.path.join(self.path, f"{name}"))

    def save_file(self, df, file_name):
        """Write ``df`` to ``file_name`` plus the extension for the configured file type.

        Raises:
            ValueError: If the configured file type is not supported.
        """
        if self.file_type == "csv":
            df.to_csv(f"{file_name}.csv", index=False)
        elif self.file_type == "xlsx":
            df.to_excel(f"{file_name}.xlsx", index=False)
        elif self.file_type == "pkl":
            df.to_pickle(f"{file_name}.pkl")
        elif self.file_type == "txt":
            # Explicit encoding so non-ASCII cell values do not depend on the
            # platform's default locale encoding.
            with open(f"{file_name}.txt", "w", encoding="utf-8") as log:
                df.to_string(log)
                log.write("\n")
        else:
            raise ValueError("Unsupported file type. Supported types are: 'csv', 'xlsx', 'pkl', 'txt'")




# Demo setup: a validator that stores failing rows as CSV in a dated
# sub-directory, merging the output of every check into one combined log.
validator = DataValidator(
    store=True,
    history=True,
    united=True,
    file_type="csv",
)

# Custom rule given as a DataFrame.query() expression; the rows it selects
# are the ones that get recorded.
custom_logic_str = "salary == 200000"

# Define custom validation logic as a function
def custom_logic_func(df):
    """Return a boolean Series that is True where 'salary' equals 200000."""
    salary = df['salary']
    return salary == 200000




@validator.range_check(
    column="age",
    borders=[(18, 30), (50, 60)],
    name="Val1",
)
@validator.value_check(
    column="name",
    allowed=["Alice", "Bob", "Charlie"],
    name="Val2",
)
@validator.value_check(
    column="name",
    not_allowed=["David"],
    name="Val3",
)
@validator.custom_check(
    custom_logic=custom_logic_str,
    name="CustomCheckStr",
)
@validator.custom_check(
    custom_logic=custom_logic_func,
    name="CustomCheckFunc",
)
def process_data_1(df):
    """Placeholder processing step guarded by five stacked validations."""
    print("Processing data for validation 1...")



# Call the decorated function on the example DataFrame (df must already be
# defined by an earlier cell). The decorators were applied at definition time;
# this call runs each validation wrapper and then the function body.
process_data_1(df)


Processing data for validation 1...
