In [1]:
import great_expectations as gx

In [124]:
import pandas as pd

In [3]:
# Obtain the Great Expectations data context used by every later cell.
context = gx.get_context()

In [10]:
# Parameters for the filesystem data source: the name it is registered
# under and the folder it reads files from.
data_source_name = "my_filesystem_data_source1"
source_folder = "Downloads/archive"

In [11]:
# Register a pandas filesystem data source on the context, rooted at
# `source_folder`.
data_source = context.data_sources.add_pandas_filesystem(
    base_directory=source_folder,
    name=data_source_name,
)

In [23]:
# Name under which the CSV data asset is registered on the data source.
asset_name = "air_passengers_csv_file"

In [24]:
# Add a CSV asset called `asset_name` to the filesystem data source.
file_csv_asset = data_source.add_csv_asset(name=asset_name)

In [26]:
# Retrieve the asset again through the context, looking it up by data
# source name and then by asset name.
file_data_asset = context.data_sources.get(data_source_name).get_asset(asset_name)

In [77]:
# Relative path to the AirPassengers CSV file that gets validated.
file_path = "Downloads/archive/AirPassengers.csv"

In [78]:
# Read the CSV into a batch through the context's default pandas data source.
# NOTE(review): this goes through `pandas_default`, not the filesystem data
# source / CSV asset configured earlier in the notebook - confirm that is
# intended.
sample_batch = context.data_sources.pandas_default.read_csv(file_path)

In [131]:
# Preview the first rows of the batch to check the file was read as expected.
sample_batch.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

     Month  #Passengers
0  1949-01          112
1  1949-02          118
2  1949-03          132
3  1949-04          129
4  1949-05          121

In [86]:
# Create an expectation suite under a fixed name; no expectations are
# attached at construction time.
suite_name = "my_expectation_suite"
suite = gx.ExpectationSuite(
    name=suite_name,
)

In [87]:
# Add the suite to the context and keep working with the object it returns.
suite = context.suites.add(suite)

In [176]:
# Collects the validation result of each expectation as it is run.
validation_results = []

In [177]:
# Define the first expectation: the maximum of "#Passengers" must fall
# between 1 and 400. (This cell only defines it; validation happens in a
# separate cell.)
expectation = gx.expectations.ExpectColumnMaxToBeBetween(
    column="#Passengers",
    min_value=1,
    max_value=400,
)

In [178]:
# Validate the first expectation against the batch that was read with pandas.
validation_result = sample_batch.validate(expectation)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

In [88]:
# Add the first expectation to the suite.
suite.add_expectation(expectation)

ExpectColumnMaxToBeBetween(id='6e28cf1e-3b66-4b34-b7f1-7d4a766338ee', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=False, rendered_content=None, windows=None, batch_id=None, column='#Passengers', row_condition=None, condition_parser=None, min_value=1.0, max_value=400.0, strict_min=False, strict_max=False)

In [179]:
# Record the first expectation's validation result.
validation_results.append(validation_result)

In [180]:
# Second expectation: the sum of "#Passengers" must fall between 40000
# and 40400.
expectation2 = gx.expectations.ExpectColumnSumToBeBetween(
    column="#Passengers", min_value=40000, max_value=40400
)

# Validate it against the batch straight away; the result is appended to
# `validation_results` in a later cell.
validation_result = sample_batch.validate(expectation2)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

In [96]:
# Add the second expectation (column sum range) to the suite.
suite.add_expectation(expectation2)

ExpectColumnSumToBeBetween(id='5aa2268a-06f7-4dac-ba8b-99bc4d8035a3', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=False, rendered_content=None, windows=None, batch_id=None, column='#Passengers', row_condition=None, condition_parser=None, min_value=40000.0, max_value=40400.0, strict_min=False, strict_max=False)

In [181]:
# Record the most recent validation result (from the column-sum expectation).
validation_results.append(validation_result)

In [182]:
# Third expectation: a column named "Month" must exist at column index 0.
expectation3 = gx.expectations.ExpectColumnToExist(column="Month", column_index=0)

# Validate it against the batch.
validation_result = sample_batch.validate(expectation3)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

In [98]:
# Add the third expectation (column existence) to the suite.
suite.add_expectation(expectation3)

ExpectColumnToExist(id='4f08f981-a648-4baa-917b-cf04861d0ca9', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=False, rendered_content=None, windows=None, batch_id=None, column='Month', column_index=0)

In [183]:
# Record the most recent validation result (from the column-existence expectation).
validation_results.append(validation_result)

In [184]:
# Fourth expectation: every value in "Month" must be unique.
expectation4 = gx.expectations.ExpectColumnValuesToBeUnique(column="Month")

# Validate it against the batch.
validation_result = sample_batch.validate(expectation4)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [102]:
# Add the fourth expectation (uniqueness of "Month") to the suite.
suite.add_expectation(expectation4)

ExpectColumnValuesToBeUnique(id='a22a6705-5e4e-41a8-b78c-b443c0727266', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='Month', mostly=1.0, row_condition=None, condition_parser=None)

In [185]:
# Record the most recent validation result (from the uniqueness expectation).
validation_results.append(validation_result)

In [186]:
# Fifth expectation: "#Passengers" must not contain null values.
expectation5 = gx.expectations.ExpectColumnValuesToNotBeNull(
    column="#Passengers",
)

# Validate it against the batch.
validation_result = sample_batch.validate(expectation5)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

In [104]:
# Add the fifth expectation (no nulls in "#Passengers") to the suite.
suite.add_expectation(expectation5)

ExpectColumnValuesToNotBeNull(id='532deb5c-49c8-4a3a-b10e-80b87d78bd10', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='#Passengers', mostly=1.0, row_condition=None, condition_parser=None)

In [187]:
# Record the most recent validation result (from the not-null expectation).
validation_results.append(validation_result)

In [188]:
# Holds one summary record per validation result, used to build the dataframe.
data_for_df = []

In [189]:
# Flatten each validation result into one summary record for the dataframe.
#
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot produce duplicate rows. The iteration variable is scoped
# to the comprehension and is deliberately not called `expectation`, which
# would overwrite the expectation object defined earlier in the notebook.
data_for_df = [
    {
        "Expectation Type": result["expectation_config"]["type"],
        "Success": result["success"],
        # Expectations without a "column" kwarg get None here instead of
        # raising KeyError.
        "Column": result["expectation_config"]["kwargs"].get("column"),
    }
    for result in validation_results
]

In [190]:
# Build the results dataframe from the collected summary records.
df_results = pd.DataFrame(data_for_df)

In [191]:
# Display the results dataframe.
df_results

Unnamed: 0,Expectation Type,Success,Column
0,expect_column_max_to_be_between,False,#Passengers
1,expect_column_sum_to_be_between,True,#Passengers
2,expect_column_to_exist,True,Month
3,expect_column_values_to_be_unique,True,Month
4,expect_column_values_to_not_be_null,True,#Passengers


In [193]:
# Save the results dataframe as a CSV file, without the index column.
df_results.to_csv('validation_results.csv', index=False)