In [2]:
from great_expectations.data_context import FileDataContext

# Root the file-backed Data Context at the notebook's working directory
project_root_dir = "./"
# NOTE(review): presumably this reuses an existing project at that root rather than overwriting it - confirm
context = FileDataContext.create(project_root_dir=project_root_dir)

In [3]:
# Define a pandas filesystem datasource covering the folder of wine-quality CSV files.
# `add_or_update_*` is used here, like the expectation-suite and checkpoint cells, so
# that re-running the notebook against the persisted file context does not fail
# because a datasource with this name is already registered.
datasource_name = "wine_quality_datasource"
path_to_folder_containing_csv_files = "data/raw/unzip_raw_wine_data"
datasource = context.sources.add_or_update_pandas_filesystem(
    name=datasource_name, base_directory=path_to_folder_containing_csv_files
)

# Define the data asset. The named group `color` in the regex is what shows up
# as the `color` batch request option when the asset is queried.
asset_name = "wine_quality_data_asset"
batching_regex = r"winequality-(?P<color>.+)\.csv"
asset = datasource.add_csv_asset(asset_name, batching_regex=batching_regex, delimiter=";")

In [4]:

# Inspect which options the asset accepts when requesting batches
print(asset.batch_request_options)

# Request only the batch whose file name matched color == "red"
options_dict = dict(color="red")
batch_request = asset.build_batch_request(options=options_dict)
batches = asset.get_batch_list_from_batch_request(batch_request)

# Show the spec of every batch that came back
for batch in batches:
    print(batch.batch_spec)

('color', 'path')
{'path': 'data/raw/unzip_raw_wine_data/winequality-red.csv', 'reader_method': 'read_csv', 'reader_options': {'delimiter': ';'}}


In [5]:
# Add (or update) the suite that will hold the red-wine expectations
expectation_suite_name = "red_wine_expectation_suite"
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

# Tie the batch request to that suite through a Validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

In [None]:
import pandas as pd

# Load the red-wine CSV (semicolon-separated) straight into pandas for a quick look
red_wine_csv = 'data/raw/unzip_raw_wine_data/winequality-red.csv'
df = pd.read_csv(red_wine_csv, sep=';')
df.head()

In [None]:
# Print a concise summary of the DataFrame (pandas `DataFrame.info`)
df.info()

In [None]:
# Display descriptive statistics for the DataFrame (pandas `DataFrame.describe`)
df.describe()

In [6]:
# Create and run an Expectation: the `chlorides` column must contain no nulls.
# The returned validation result is displayed as the cell output.
#list_of_columns = list(df.columns)
validator.expect_column_values_to_not_be_null(column='chlorides')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1599,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expect the `quality` column to have type int64; the result is shown as the cell output
validator.expect_column_values_to_be_of_type(column='quality', type_='int64')

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [None]:
# Expect the table's columns to match, in order, the columns of the DataFrame
# loaded earlier in this notebook. `list_of_columns` is defined here because it
# was not assigned anywhere else, which made this cell fail with a NameError.
list_of_columns = list(df.columns)
validator.expect_table_columns_to_match_ordered_list(column_list=list_of_columns)

In [8]:
# Save the validator's expectation suite.
# NOTE(review): going by the flag name, discard_failed_expectations=False keeps
# expectations in the suite even if they failed when run above - confirm.
validator.save_expectation_suite(discard_failed_expectations=False)

In [None]:
# Fetch the suite by name from the context and show its expectations by expectation type
my_suite = context.get_expectation_suite(expectation_suite_name)
my_suite.show_expectations_by_expectation_type()

In [9]:
# Add (or update) a Checkpoint that validates the red-wine batch against the suite
checkpoint_name = "red_wine_checkpoint"
red_wine_validation = dict(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
checkpoint = context.add_or_update_checkpoint(
    name=checkpoint_name,
    validations=[red_wine_validation],
)

In [10]:
# Run the Checkpoint and keep the result so later cells can inspect it
checkpoint_result = checkpoint.run()

# Build Data Docs; the value returned by the call is displayed as the cell output
context.build_data_docs()

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{'local_site': 'file:///home/naveen/mlops/wine-quality-mlops/gx/uncommitted/data_docs/local_site/index.html'}

In [13]:
# List the Data Docs sites known to the context together with their URLs
context.get_docs_sites_urls()

[{'site_name': 'local_site',
  'site_url': 'file:///home/naveen/mlops/wine-quality-mlops/gx/uncommitted/data_docs/local_site/index.html'}]

In [19]:
# Display the first validation result produced by the checkpoint run
checkpoint_result.list_validation_results()[0]

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "chlorides",
          "batch_id": "wine_quality_datasource-wine_quality_data_asset-color_red"
        },
        "meta": {}
      },
      "result": {
        "element_count": 1599,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "column": "quality",
          "type_": "int64",
          "batch_id": "wine_quality_datasource-