In [1]:
import numpy as np
import pandas as pd
import duckdb
import sqlalchemy
import matplotlib.pyplot as plt

In [1]:
import great_expectations as gx
from great_expectations import expectations as gxe

<div class="alert alert-block alert-info">
<b>Data testing:</b> validate the Olist geolocation dataset with Great Expectations.
</div>

In [2]:
# Identifiers used to register the geolocation CSV folder as a data source.
data_source_name = "olist_geolocation_dataset"
source_folder = "data/"

# Obtain a Great Expectations data context and attach a pandas filesystem
# data source that reads from `source_folder`.
context = gx.get_context()
data_source = context.data_sources.add_pandas_filesystem(
    base_directory=source_folder,
    name=data_source_name,
)

In [3]:
# Register a CSV asset on the data source under a fixed, reusable name.
asset_name = "olist_geolocation_dataset_files"
file_csv_asset = data_source.add_csv_asset(
    name=asset_name,
)


In [4]:
# Look the asset up again through the context, first by data source name and
# then by asset name, rather than reusing the object returned at creation.
registered_source = context.data_sources.get(data_source_name)
file_data_asset = registered_source.get_asset(asset_name)

In [5]:
# The batch definition is named after the single CSV file it points at.
batch_definition_path = "olist_geolocation_dataset.csv"
batch_definition_name = "olist_geolocation_dataset.csv"

batch_definition = file_data_asset.add_batch_definition_path(
    path=batch_definition_path,
    name=batch_definition_name,
)

# Materialise the batch so it can be previewed and validated.
batch = batch_definition.get_batch()

In [6]:
# Preview the first four rows of the batch.
first_rows = batch.head(4)
print(first_rows)

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 211.14it/s]

   geolocation_zip_code_prefix  geolocation_lat  geolocation_lng  \
0                         1037       -23.545621       -46.639292   
1                         1046       -23.546081       -46.644820   
2                         1046       -23.546129       -46.642951   
3                         1041       -23.544392       -46.639499   

  geolocation_city geolocation_state  
0        sao paulo                SP  
1        sao paulo                SP  
2        sao paulo                SP  
3        sao paulo                SP  





In [7]:
# Approximate bounding box for Brazil, in decimal degrees.
# The eastern longitude limit is -32 rather than -35: the mainland coast
# reaches roughly -34.8 (Recife, Joao Pessoa) and Fernando de Noronha lies
# near -32.4, so a cap of -35 reports valid Brazilian coordinates as
# unexpected values.
BRAZIL_LAT_MIN, BRAZIL_LAT_MAX = -35, 5
BRAZIL_LNG_MIN, BRAZIL_LNG_MAX = -75, -32

# Every latitude must fall inside the Brazil bounding box.
preset_lat_expectation = gxe.ExpectColumnValuesToBeBetween(
    column="geolocation_lat",
    min_value=BRAZIL_LAT_MIN,
    max_value=BRAZIL_LAT_MAX,
)

# Every longitude must fall inside the Brazil bounding box.
preset_long_expectation = gxe.ExpectColumnValuesToBeBetween(
    column="geolocation_lng",
    min_value=BRAZIL_LNG_MIN,
    max_value=BRAZIL_LNG_MAX,
)

# Register the suite with the context first, then attach the expectations to
# the registered suite object returned by `add`.
suite_name = "br_ecom_expectation"
suite = gx.ExpectationSuite(name=suite_name)

suite = context.suites.add(suite)
suite.add_expectation(preset_lat_expectation)
suite.add_expectation(preset_long_expectation)

# Pair the suite with the batch definition so both checks can be run together.
definition_name = "br_ecom_validation_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)


In [8]:
# Run the suite's expectations against the data behind the batch definition.
validation_results = validation_definition.run()

Calculating Metrics: 100%|██████████| 17/17 [00:00<00:00, 73.32it/s] 


In [9]:
# Print the validation outcome: an overall success flag plus one result entry
# per expectation, including counts and samples of unexpected values.
print(validation_results)

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "olist_geolocation_dataset-olist_geolocation_dataset_files",
          "column": "geolocation_lat",
          "min_value": -35.0,
          "max_value": 5.0
        },
        "meta": {},
        "id": "42cefdf9-e4cf-4f5c-bb9f-379767ad59ee"
      },
      "result": {
        "element_count": 1000163,
        "unexpected_count": 29,
        "unexpected_percent": 0.002899527377037543,
        "partial_unexpected_list": [
          28.008978338034268,
          41.61405150610495,
          42.43928591592116,
          38.38167205114709,
          43.68496096631822,
          29.40925222930908,
          21.65754744931478,
          25.995202881053302,
          25.995244980240106,
          38.3239386880374,
          38.99196259838999,
          38.26820516582393,
          45.06593318269697,
     