In [80]:
import pandas as pd

# Sample records (one list per column) used to exercise the validators below.
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'David', 'David'],
    age=[25, 17, 65, 45, 18],
    salary=[50000, 60000, 200000, 75000, 75000],
)

# Tabular view of the sample records; column order follows the mapping above.
df = pd.DataFrame(data=data)


In [90]:
import functools
import os
from datetime import datetime

import pandas as pd

# Class definition for DataValidator, which handles data validation tasks and storing validation results.
class DataValidator:
    """Range-validate DataFrame columns and optionally persist the failing rows.

    Validations are attached to functions through the ``Range_Validation``
    decorator factory. Each call of a decorated function first collects the
    rows whose column value lies outside every configured range, stores them
    when ``store`` is True, and then runs the original function with the
    DataFrame it was given.
    """

    # Initialization method with various configuration options.
    def __init__(self, store=False, history=False, united=True, path="./validation logs", file_type="pkl"):
        """Configure where and how failing rows are stored.

        Args:
            store: When True, rows that fail a validation are written to disk.
            history: When True, files go into a sub-folder of ``path`` named
                after the current date (``YYYY-MM-DD``).
            united: When True, all failing rows are accumulated in one frame
                and written to a single ``log`` file; otherwise each
                validation writes a file named after the validation.
            path: Base directory for the stored files. It is created if it
                does not exist, regardless of ``store``.
            file_type: One of ``csv``, ``xlsx``, ``pkl`` or ``txt`` (case
                insensitive). The value is only checked when a file is
                actually written.

        Raises:
            TypeError: If ``store``, ``united`` or ``history`` is not a bool,
                or ``file_type`` is not a string.
        """
        # Validate before any argument is used, so a bad value surfaces as the
        # intended TypeError (e.g. file_type=None used to fail in .lower()).
        for arg_name, value in (("store", store), ("united", united), ("history", history)):
            if not isinstance(value, bool):
                raise TypeError(f"The '{arg_name}' argument must be a boolean.")

        if not isinstance(file_type, str):
            raise TypeError("The 'file_type' argument must be a string.")

        self.store = store  # Determines whether to store the validation results.
        self.united = united  # Determines whether to store all results in a single file.
        self.history = history  # If True, logs go into a folder named after today's date.
        self.file_type = file_type.lower()  # Ensures the file type is stored in lowercase.

        # Adjust the storage path based on the 'history' flag.
        if history:
            self.path = os.path.join(path, datetime.now().strftime('%Y-%m-%d'))
        else:
            self.path = path

        # Accumulator for the failing rows of every validation (used when 'united' is True).
        self.all_validations_df = pd.DataFrame()

        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(self.path, exist_ok=True)

    # Method to define the Range_Validation decorator, which validates whether the values in a DataFrame column fall within specified ranges.
    def Range_Validation(self, *, column: str, borders: list, name: str, **kwargs):
        """Build a decorator that range-checks ``column`` before calling the function.

        Args:
            column: Name of the DataFrame column to check.
            borders: List of ``(bottom, top)`` tuples. A row passes when its
                value lies inside at least one of them; both ends are
                inclusive (``Series.between`` defaults).
            name: Label written to the ``Validation Name`` column of stored
                rows, and used as the file name when results are not united.
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            A decorator. The decorated function must take the DataFrame as
            its first positional argument; its return value is passed through.

        Raises:
            TypeError: If an argument has the wrong type, or (at call time)
                if ``column`` is missing from the DataFrame.
        """
        # Type checking for the arguments. Only the structure of 'borders' is
        # checked here, not whether the tuple members are numeric.
        if not isinstance(column, str):
            raise TypeError("The 'column' argument must be a string.")

        if not isinstance(borders, list) or not all(isinstance(i, tuple) and len(i) == 2 for i in borders):
            raise TypeError("The 'borders' argument must be a list of tuples, where each tuple contains two numeric values (int or float).")

        if not isinstance(name, str):
            raise TypeError("The 'name' argument must be a string.")

        # Define the actual decorator function.
        def decorator(func):
            @functools.wraps(func)  # keep the decorated function's name and docstring
            def wrapper(df, *args, **kwargs_func):
                # Check if the specified column exists in the DataFrame.
                if column not in df.columns:
                    raise TypeError(f"Error: Column '{column}' not found in DataFrame.")

                # Build the mask on the DataFrame's own index: a default
                # RangeIndex would be label-aligned against df.index by `|=`
                # and break for any frame that is filtered or re-indexed.
                in_range_mask = pd.Series(False, index=df.index, dtype=bool)

                # Mark rows that fall inside any of the specified ranges.
                for bottom, top in borders:
                    in_range_mask |= df[column].between(bottom, top)

                # Rows outside every range.
                out_of_bounds = df.loc[~in_range_mask]

                # If any rows are out of bounds and storing is enabled, save these rows.
                if not out_of_bounds.empty and self.store:
                    self.save_outliers(out_of_bounds, column, borders, name)

                # Execute the original function with the DataFrame and other arguments.
                return func(df, *args, **kwargs_func)
            return wrapper
        return decorator

    # Method to save outlier data based on the specified column and validation name.
    def save_outliers(self, outliers, column, borders, name):
        """Tag the failing rows with the validation name and write them out.

        Args:
            outliers: DataFrame holding the rows that failed the validation.
            column: Validated column name (currently unused).
            borders: Ranges used by the validation (currently unused).
            name: Validation label stored in the ``Validation Name`` column.
        """
        # assign() returns a new frame, so the caller's DataFrame is not mutated.
        tagged = outliers.assign(**{"Validation Name": name})

        # If storing all validations in a single file, append to the accumulated
        # frame and rewrite the whole log.
        if self.united:
            self.all_validations_df = pd.concat([self.all_validations_df, tagged], ignore_index=True)
            self.save_to_file(self.all_validations_df, os.path.join(self.path, "log"))
        # If not united, save each validation's outliers in separate files.
        else:
            self.save_to_file(tagged, os.path.join(self.path, f"{name}"))

    # Method to save a DataFrame to a file based on the specified file type.
    def save_to_file(self, df, file_name):
        """Save the DataFrame to ``file_name`` plus the configured extension.

        Args:
            df: DataFrame to write.
            file_name: Target path without extension.

        Raises:
            ValueError: If ``self.file_type`` is not a supported type.
        """
        if self.file_type == "csv":
            df.to_csv(f"{file_name}.csv", index=False)  # Save as CSV without the index column.
        elif self.file_type == "xlsx":
            df.to_excel(f"{file_name}.xlsx", index=False)  # Save as Excel without the index column.
        elif self.file_type == "pkl":
            df.to_pickle(f"{file_name}.pkl")  # Save as a pickle file.
        elif self.file_type == "txt":
            # Explicit encoding keeps the output independent of the platform locale.
            with open(f"{file_name}.txt", "w", encoding="utf-8") as log:
                df.to_string(log)  # Write the DataFrame's string representation to the file.
                log.write("\n")  # Add a newline at the end.
        else:
            raise ValueError("Unsupported file type. Supported types are: 'csv', 'xlsx', 'pkl', 'txt'")



# Demo: a validator that stores failing rows as CSV, united in one log file,
# inside a folder named after today's date.
validator = DataValidator(
    store=True,
    history=True,
    united=True,
    file_type="csv",
)


# Two stacked range checks on 'age'. The outermost decorator (Val1) runs its
# check first on each call, then Val2, then the function body.
@validator.Range_Validation(column='age', borders=[(18, 30)], name="Val1")
@validator.Range_Validation(column='age', borders=[(50, 60)], name="Val2")
def process_data_1(df):
    print("Processing data for validation 1...")


# Run both checks and then the function (df must be defined earlier in the code).
process_data_1(df)


TypeError: The 'column' argument must be a string.