# Run the validation suite

In [1]:
import great_expectations as ge
import pandas as pd
import json
import pipeline

In [2]:
def load_expectation_suite(path: str) -> dict:
    """Load an expectation suite stored as JSON and return it as a dictionary.

    Args:
        path (str): path to expectation suite json file

    Returns:
        dict: expectation suite, as parsed by ``json.load``

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file content is not valid JSON.
    """
    # JSON is UTF-8 by specification; pinning the encoding keeps the result
    # independent of the platform's locale default (e.g. cp1252 on Windows).
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
    
def great_expectation_validation(df: pd.DataFrame,
                                 expectation_suite_path: str) -> dict:
    """Validate a DataFrame against the expectation suite stored at a path.

    Args:
        df (pd.DataFrame): DataFrame to validate
        expectation_suite_path (str): path to expectation suite json file
    Returns:
        dict: Validation result
    """
    # Wrap the frame as a Great Expectations dataset that carries the suite
    # loaded from disk.
    dataset = ge.from_pandas(
        df,
        expectation_suite=load_expectation_suite(expectation_suite_path),
    )
    # Request the SUMMARY result format; catch_exceptions=True asks the
    # library to record errors in the results rather than raise them.
    return dataset.validate(result_format='SUMMARY', catch_exceptions=True)

In [3]:
# Paths handed to the pipeline and to the validation helper.
input_path = './titanic_data.csv'
output_path = './titanic_data_processed.csv'
expectation_suite_path = "./my_expectation_file.json"

# Run the project pipeline; its return value is what gets validated.
df = pipeline.run_pipeline(input_path, output_path)

# great_expectation_validation wraps the frame with ge.from_pandas itself, so
# the frame is passed as-is instead of being wrapped a second time here.
validation_result = great_expectation_validation(
    df=df, expectation_suite_path=expectation_suite_path
)

In [4]:
# Display the overall outcome of the run (the 'success' entry of the result).
validation_result['success']

True

In [5]:
# Display the aggregate counts of the run (the 'statistics' entry of the result).
validation_result['statistics']

{'evaluated_expectations': 4,
 'successful_expectations': 4,
 'unsuccessful_expectations': 0,
 'success_percent': 100.0}

In [6]:
# Print one line per evaluated expectation: its outcome, then its type.
for outcome in validation_result['results']:
    label = "success" if outcome['success'] else "failed"
    expectation_type = outcome['expectation_config']['expectation_type']
    print(f"{label}: {expectation_type}")

success: expect_table_columns_to_match_set
success: expect_column_values_to_not_be_null
success: expect_column_values_to_be_unique
success: expect_column_values_to_be_in_set


# Store validation results to keep statistics of data quality

In [7]:
import datetime

# Take the current time once (naive local time: now() is called without a tz).
datetime_object = datetime.datetime.now()

# Compact YYYYmmddHHMMSS stamp; it is embedded in the results file name below.
date_time_str = datetime_object.strftime('%Y%m%d%H%M%S')


In [8]:
import os

# NOTE(review): "achive" looks like a typo for "archive"; the name is kept
# unchanged so results keep landing in the same directory as before.
archive_dir = "./validation_results_achive"

# Create the archive directory on first use instead of failing with
# FileNotFoundError when it does not exist yet.
os.makedirs(archive_dir, exist_ok=True)

# One file per run, named after the timestamp string, with sorted keys so
# that archived results are easy to diff.
with open(f"{archive_dir}/validation{date_time_str}.json", "w") as my_file:
    json.dump(validation_result.to_json_dict(), my_file, sort_keys=True, indent=4)