In [1]:
import numpy as np
import pandas as pd
import duckdb
import sqlalchemy
from pandas_gbq import read_gbq

In [2]:
import great_expectations as gx
from great_expectations import expectations as gxe

import pprint
import os

<div class="alert alert-block alert-info">
GX_ORDERS_DELIVERY

In [3]:
context = gx.get_context()
# query bigquery
project_id = "projectm2-aiess"
query = "SELECT * FROM olist_brazilian_ecommerce_DS.DS_orders_delivery"
df_orders_delivery = read_gbq(query, project_id=project_id)

data_source_name = "orders_delivery_df"
data_source = context.data_sources.add_pandas(name=data_source_name)

# create asset
data_asset_name = "orders_delivery_asset"
data_asset = data_source.add_dataframe_asset(name=data_asset_name)

# create batch
batch_definition_name = "batch_orders_delivery_dataframe"
batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

batch_parameters = {"dataframe": df_orders_delivery}

new_batch = batch_definition.get_batch(batch_parameters=batch_parameters)

Downloading: 100%|[32m██████████[0m|


In [4]:
print(new_batch.head(4))

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 150.76it/s]

                       pk_order_sid                   fk_customer_sid  \
0  809a282bbd5dbcabb6f2f724fca862ec  622e13439d6b5a0b486c435618b2679e   
1  bfbd0f9bdef84302105ad712db648a6c  86dc2ffce2dfff336de2f386a786e574   
2  2e7a8482f6fb09756ca50c10d7bfc047  08c5351a6aca1c1589a38f244edeee9d   
3  e5215415bb6f76fe3b7cb68103a0d1c0  b6f6cbfc126f1ae6723fe2f9b3751208   

  fk_order_purchased_date_sid fk_order_approved_at_date_sid  \
0                    20160913                      20161007   
1                    20160915                      20160915   
2                    20160904                      20161007   
3                    20161022                          None   

  fk_order_delivered_carrier_date_sid fk_order_delivered_customer_date_sid  \
0                                None                                 None   
1                            20161107                             20161109   
2                            20161018                                 None   
3     




In [5]:
suite_name = "orders_delivery_expectation"
suite = gx.ExpectationSuite(name=suite_name)

# Test for column existence
actual_delivery_time_expectation = gx.expectations.ExpectColumnToExist(column="actual_delivery_time")
estimated_delivery_time_expectation = gx.expectations.ExpectColumnToExist(column="estimated delivery_time")
actual_delivery_time_minutes_expectation = gx.expectations.ExpectColumnToExist(column="actual_delivery_time_minutes")
estimated_delivery_time_minutes_expectation = gx.expectations.ExpectColumnToExist(column="estimated-delivery_time_minutes")

In [6]:
# Test for column values to be non-null
actual_delivery_time_nonNull_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(column="actual_delivery_time")
estimated_delivery_time_nonNull_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(column="estimated delivery_time")
actual_delivery_time_minutes_nonNull_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(column="actual_delivery_time_minutes")
estimated_delivery_time_minutes_nonNull_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(column="estimated-delivery_time_minutes")

In [7]:
# Test for order status value validity
order_status_validity_expectation = gx.expectations.ExpectColumnValuesToBeInSet(
        column="order_status", 
        value_set=["delivered", "shipped", "processing", "canceled", "unavailable", "invoiced", "approved", "created"]
    )

In [8]:
# Test for column data types
actual_delivery_time_type_expectation = gx.expectations.ExpectColumnValuesToBeOfType(column="actual_delivery_time", type_="object")
estimated_delivery_time_type_expectation = gx.expectations.ExpectColumnValuesToBeOfType(column="estimated delivery_time", type_="object")
actual_delivery_time_minutes_type_expectation = gx.expectations.ExpectColumnValuesToBeOfType(column="actual_delivery_time_minutes", type_="float")
estimated_delivery_time_minutes_type_expectation = gx.expectations.ExpectColumnValuesToBeOfType(column="estimated-delivery_time_minutes", type_="float")

In [9]:
# Test for column value ranges
actual_delivery_time_minutes_range_expectation = gx.expectations.ExpectColumnValuesToBeBetween(column="actual_delivery_time_minutes", min_value=0, max_value=350000)
estimated_delivery_time_minutes_range_expectation = gx.expectations.ExpectColumnValuesToBeBetween(column="estimated-delivery_time_minutes", min_value=0, max_value=350000)

In [10]:
# Test for consistency between columns using a row condition
actual_estimated_delivery_delta_expectation = gx.expectations.ExpectColumnValuesToBeBetween(
	column="actual_delivery_time_minutes",
	min_value=0,  # Set a minimum value
	max_value=350000,  # Set a maximum value
	row_condition="actual_delivery_time_minutes <= estimated-delivery_time_minutes",
	condition_parser="pandas"
)

In [11]:
context.suites.add_or_update(suite)
suite.add_expectation(actual_delivery_time_expectation)
suite.add_expectation(estimated_delivery_time_expectation)
suite.add_expectation(actual_delivery_time_minutes_expectation)
suite.add_expectation(estimated_delivery_time_minutes_expectation)
suite.add_expectation(actual_delivery_time_nonNull_expectation)
suite.add_expectation(estimated_delivery_time_nonNull_expectation)
suite.add_expectation(actual_delivery_time_minutes_nonNull_expectation)
suite.add_expectation(estimated_delivery_time_minutes_nonNull_expectation)
suite.add_expectation(order_status_validity_expectation)
suite.add_expectation(actual_delivery_time_type_expectation)
suite.add_expectation(estimated_delivery_time_type_expectation)
suite.add_expectation(actual_delivery_time_minutes_type_expectation)
suite.add_expectation(estimated_delivery_time_minutes_type_expectation)
suite.add_expectation(actual_delivery_time_minutes_range_expectation)
suite.add_expectation(estimated_delivery_time_minutes_range_expectation)
suite.add_expectation(actual_estimated_delivery_delta_expectation)

definition_name = "Orders_delivery_validation_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)

In [12]:
validation_results = validation_definition.run(batch_parameters=batch_parameters)

Calculating Metrics:  81%|████████  | 43/53 [00:00<00:00, 123.72it/s]


In [13]:
print(validation_results)

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "column": "actual_delivery_time_minutes",
          "row_condition": "actual_delivery_time_minutes <= estimated-delivery_time_minutes",
          "condition_parser": "pandas",
          "min_value": 0.0,
          "max_value": 350000.0,
          "batch_id": "orders_delivery_df-orders_delivery_asset"
        },
        "meta": {},
        "id": "123f9e4f-2811-43c2-8654-73f0b0953576"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "MetricConfigurationID(metric_name='table.column_types', metric_domain_kwargs_id='06d7815360174726c19f75f3edd3e29e', metric_value_kwargs_id='include_nested=True')": {
          "exception_traceback": "Traceback (most recent call last):\n  File \"/Users/andrew/miniconda3/envs/elt/lib/python3.10/site-packages/pandas/core/computation/scope.py\", line 23

In [14]:
# Save full results to file
output_folder = "gx_output"
os.makedirs(output_folder, exist_ok=True)
result_path = os.path.join(output_folder, "gx_results_orders_delivery.txt")

with open(result_path, "w") as f:
    f.write(pprint.pformat(validation_results))

print(f" Full GX test results saved to {result_path}")


 Full GX test results saved to gx_output/gx_results_orders_delivery.txt


<div class="alert alert-block alert-info">
GX_geo

In [15]:
context = gx.get_context()
# query bigquery
project_id = "projectm2-aiess"
query = "SELECT * FROM olist_brazilian_ecommerce_DS.DS_land_geolocation"
df_geolocation = read_gbq(query, project_id=project_id)

data_source_name = "geolocation_df"
data_source = context.data_sources.add_pandas(name=data_source_name)

# create asset
data_asset_name = "geolocation_asset"
data_asset = data_source.add_dataframe_asset(name=data_asset_name)

# create batch
batch_definition_name = "batch_geolocation_dataframe"
batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

batch_parameters = {"dataframe": df_geolocation}

new_batch = batch_definition.get_batch(batch_parameters=batch_parameters)

Downloading: 100%|[32m██████████[0m|


In [16]:
suite_name = "br_ecom_expectation"
suite = gx.ExpectationSuite(name=suite_name)

preset_lat_expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="geolocation_lat", min_value=-35, max_value=5
)

preset_long_expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="geolocation_lng", min_value=-75, max_value=-35
)

context.suites.add_or_update(suite)
suite.add_expectation(preset_lat_expectation)
suite.add_expectation(preset_long_expectation)

definition_name = "br_ecom_validation_definition_V2"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)


In [17]:
validation_results = validation_definition.run(batch_parameters=batch_parameters)
print(validation_results)

Calculating Metrics: 100%|██████████| 17/17 [00:00<00:00, 28.63it/s] 

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "geolocation_df-geolocation_asset",
          "column": "geolocation_lat",
          "min_value": -35.0,
          "max_value": 5.0
        },
        "meta": {},
        "id": "1c3a15ed-1370-44bd-b15c-f500b3924999"
      },
      "result": {
        "element_count": 738299,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success"




In [18]:
# Save full results to file
output_folder = "gx_output"
os.makedirs(output_folder, exist_ok=True)
result_path = os.path.join(output_folder, "gx_results_geo.txt")

with open(result_path, "w") as f:
    f.write(pprint.pformat(validation_results))

print(f" Full GX test results saved to {result_path}")


 Full GX test results saved to gx_output/gx_results_geo.txt
