In [6]:
import numpy as np
import pandas as pd
import duckdb
import sqlalchemy
from pandas_gbq import read_gbq

In [None]:
import great_expectations as gx
from great_expectations import expectations as gxe

import pprint
import os

<div class="alert alert-block alert-info">
<b>Data testing:</b> validate the geolocation table with Great Expectations.
</div>

In [8]:
# --- Load the geolocation table from BigQuery into a DataFrame ---------------
project_id = "projectm2-aiess"
query = "SELECT * FROM olist_brazilian_ecommerce_DS.DS_land_geolocation"
df_geolocation = read_gbq(query, project_id=project_id)

# --- Register the DataFrame with Great Expectations --------------------------
# Names under which the data source, asset and batch definition are created.
data_source_name = "geolocation_df"
data_asset_name = "geolocation_asset"
batch_definition_name = "batch_geolocation_dataframe"

context = gx.get_context()
data_source = context.data_sources.add_pandas(name=data_source_name)
data_asset = data_source.add_dataframe_asset(name=data_asset_name)
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    name=batch_definition_name
)

# The DataFrame itself is supplied at run time through the batch parameters.
batch_parameters = {"dataframe": df_geolocation}
new_batch = batch_definition.get_batch(batch_parameters=batch_parameters)

Downloading: 100%|[32m██████████[0m|


In [9]:
# Sanity check: show the first four rows of the batch.
print(new_batch.head(n_rows=4))

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 149.41it/s]

  geolocation_zip_code_prefix  geolocation_lat  geolocation_lng  \
0                       81470       -25.571748       -49.334374   
1                       28930       -22.969370       -42.029834   
2                       87365       -24.271860       -53.069433   
3                       87365       -24.281905       -53.074516   

     geolocation_city geolocation_state  
0            * cidade                PR  
1  ...arraial do cabo                RJ  
2      4o. centenario                PR  
3       4º centenario                PR  





In [28]:
# Expectation suite holding the coordinate-range checks for the geolocation data.
suite_name = "br_ecom_expectation"
suite = gx.ExpectationSuite(name=suite_name)

# Latitude / longitude must fall inside fixed [min, max] bounds.
# NOTE(review): the bounds look like a rough bounding box around Brazil in
# decimal degrees — confirm they match the intended coverage.
preset_lat_expectation = gxe.ExpectColumnValuesToBeBetween(
    column="geolocation_lat", min_value=-35, max_value=5
)
preset_long_expectation = gxe.ExpectColumnValuesToBeBetween(
    column="geolocation_lng", min_value=-75, max_value=-35
)

# Keep working with the suite instance handed back by the context, so the
# expectations added below are attached to the stored suite.
suite = context.suites.add_or_update(suite)
suite.add_expectation(preset_lat_expectation)
suite.add_expectation(preset_long_expectation)

# Register the validation definition with the context explicitly.
# `add_or_update` keeps this cell re-runnable: a second execution overwrites
# the stored definition instead of colliding on its name.
definition_name = "br_ecom_validation_definition_V2"
validation_definition = context.validation_definitions.add_or_update(
    gx.ValidationDefinition(data=batch_definition, suite=suite, name=definition_name)
)


In [30]:
# Execute the validation definition against the DataFrame supplied in
# `batch_parameters` and keep the result object for inspection and export.
validation_results = validation_definition.run(batch_parameters=batch_parameters)

Calculating Metrics: 100%|██████████| 17/17 [00:00<00:00, 49.42it/s] 


In [31]:
# Print the full validation result: the overall success flag followed by the
# per-expectation details.
print(validation_results)

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "geolocation_df-geolocation_asset",
          "column": "geolocation_lat",
          "min_value": -35.0,
          "max_value": 5.0
        },
        "meta": {},
        "id": "154fcb0e-57d3-43e5-a30f-c456b7728c0b"
      },
      "result": {
        "element_count": 738299,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success"

In [None]:
# Save the full validation results to a text file under `gx_output/`.
output_folder = "gx_output"
os.makedirs(output_folder, exist_ok=True)  # no error if the folder already exists
result_path = os.path.join(output_folder, "gx_results_geo.txt")

# Write with an explicit UTF-8 encoding so the file content does not depend on
# the platform's locale default encoding.
with open(result_path, "w", encoding="utf-8") as f:
    f.write(pprint.pformat(validation_results))

print(f" Full GX test results saved to {result_path}")


 Full GX test results saved to gx_output/gx_full_results.txt
