In [1]:
import numpy as np
import pandas as pd
import duckdb
import sqlalchemy
from pandas_gbq import read_gbq

In [2]:
import great_expectations as gx
from great_expectations import expectations as gxe

import pprint
import os

<div class="alert alert-block alert-info">
data testing

In [14]:
context = gx.get_context()
# query bigquery
project_id = "projectm2-aiess"
query = "SELECT * FROM olist_brazillian_ecommerce.DIM_CUSTOMERS"
df_customers = read_gbq(query, project_id=project_id)


data_source_name = "olist.dim_customers"
data_source = context.data_sources.add_pandas(name=data_source_name)

Downloading: 100%|[32m██████████[0m|


In [15]:
# create asset
data_asset_name = "olist.dim_customers_asset"
data_asset = data_source.add_dataframe_asset(name=data_asset_name)


In [16]:
# create batch
batch_definition_name = "batch_customers_dataframe"
batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

batch_parameters = {"dataframe": df_customers}

new_batch = batch_definition.get_batch(batch_parameters=batch_parameters)

In [17]:
print(new_batch.head(4))

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 304.24it/s] 

                    pk_customer_sid                customer_unique_id  \
0  929cb7b1618f0d99c7814d79b0c5b7cc  3cd07f2cd0e3bd52f3896c0048fb37c7   
1  0f32385df13e46d88d997460208bc866  4f67110f6d6d1241111167b141bfa780   
2  f34e32b8604090d1c2a7f3b04c834807  785a8a410d6fc9661d66f7218a87229d   
3  31dbc13addc753e210692eacaea065e4  5dbba6c01268a8ad43f79157bf4454a0   

  customer_zip_code_prefix customer_city customer_state  \
0                    69900    rio branco             AC   
1                    69900    rio branco             AC   
2                    69900    rio branco             AC   
3                    69900    rio branco             AC   

                load_date  
0  2025-06-17 04:54:24 AM  
1  2025-06-17 04:54:24 AM  
2  2025-06-17 04:54:24 AM  
3  2025-06-17 04:54:24 AM  





In [18]:
# Create a new suite for all dimension tables schema validation
suite_name = "schema_dim_customers_expectation"
suite = gx.ExpectationSuite(name=suite_name)

schema_dim_customers_expectation = gx.expectations.ExpectColumnToExist(
    column="customer_sid", column_index=0
)

context.suites.add_or_update(suite)
suite.add_expectation(schema_dim_customers_expectation)

definition_name = "schema_dim_customers_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)

In [19]:
validation_results = validation_definition.run(batch_parameters=batch_parameters)
print(validation_results)

Calculating Metrics: 100%|██████████| 2/2 [00:00<00:00, 490.13it/s] 

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_to_exist",
        "kwargs": {
          "batch_id": "olist.dim_customers-olist.dim_customers_asset",
          "column": "customer_sid",
          "column_index": 0
        },
        "meta": {},
        "id": "f01884da-28cb-45bb-9c0e-05c7d4764bfe"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    }
  ],
  "suite_name": "schema_dim_customers_expectation",
  "suite_parameters": {},
  "statistics": {
    "evaluated_expectations": 1,
    "successful_expectations": 0,
    "unsuccessful_expectations": 1,
    "success_percent": 0.0
  },
  "meta": {
    "great_expectations_version": "1.5.1",
    "batch_spec": {
      "batch_data": "PandasDataFrame"
    },
    "batch_markers": {
      "ge_load_time": "20250617T045505.450707Z",
 




In [20]:
# Save full results to file
output_folder = "gx_output"
os.makedirs(output_folder, exist_ok=True)
result_path = os.path.join(output_folder, "gx_results_customers.txt")

with open(result_path, "w") as f:
    f.write(pprint.pformat(validation_results))

print(f" Full GX test results saved to {result_path}")


 Full GX test results saved to gx_output/gx_results_customers.txt


In [29]:
context = gx.get_context()
# List of GBQ tables and their expected columns with types
gbq_tables_with_columns_and_types = {
    "olist_brazillian_ecommerce.DIM_PAYMENTS": {"payment_sid": "string"},
    "olist_brazillian_ecommerce.DIM_REVIEWS": {"review_sid": "string"},
    "olist_brazillian_ecommerce.DIM_GEOLOCATION": {"geolocation_zip_code_prefix": "string"},
    "olist_brazillian_ecommerce.DIM_ORDER_ITEMS": {"item_sid": "string"},
    "olist_brazillian_ecommerce.DIM_DATE": {"date_sid": "integer"},
}

output_folder = "gx_output"
os.makedirs(output_folder, exist_ok=True)
summary_file_path = os.path.join(output_folder, "gx_dtype_summary.txt")

with open(summary_file_path, "w") as f:
    f.write("Great Expectations Full Validation Results\n")
    f.write("=" * 60 + "\n\n")

 # Iterate over the list of tables and process each one   
    for table_name, expected_columns in gbq_tables_with_columns_and_types.items():
        query = f"SELECT * FROM {table_name}"
        df_table = read_gbq(query, project_id="projectm2-aiess")

        # Generate unique names for data source and asset
        data_source_name = f"{table_name}_data_source"
        asset_name = f"{table_name}_asset"

        # Add data source
        data_source = context.data_sources.add_pandas(name=data_source_name)
        # Add DataFrame asset
        data_asset = data_source.add_dataframe_asset(name=asset_name)
        # Add batch definition
        batch_definition = data_asset.add_batch_definition_whole_dataframe(table_name)
        batch_parameters = {"dataframe": df_table}
        batch = batch_definition.get_batch(batch_parameters=batch_parameters)

        # Create Expectation Suite
        suite_name = f"{table_name}_suite"
        suite = gx.ExpectationSuite(name=suite_name)
        suite = context.suites.add(suite)
        # Add ExpectColumnValuesToBeOfType expectations for each expected column
        for column, column_type in expected_columns.items():
            expectation = gx.expectations.ExpectColumnValuesToBeOfType(
                column=column, type_=column_type
            )
            suite.add_expectation(expectation)

        validation_definition = gx.ValidationDefinition(
            data=batch_definition, suite=suite, name=f"{table_name}_validation"
        )
        results = validation_definition.run(batch_parameters=batch_parameters)

        f.write(f"Table: {table_name}\n")
        #f.write(pprint.pformat(results))
        f.write(f"Table: {table_name}\n")
        f.write("-" * 60 + "\n")

        for result in results.results:
            expectation_type = result.expectation_config.type
            kwargs = result.expectation_config.kwargs
            column = kwargs.get("column")
            expected_type = kwargs.get("type_")
            status = "PASSED" if result.success else "FAILED"
            f.write(
                f"{expectation_type} on column '{column}'"
                f" (expected type: {expected_type}): {status}\n"
            )

        f.write("\n\n")

print(f"Combined gx dytppe test results saved to {summary_file_path}")


Downloading: 100%|[32m██████████[0m|


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 341.72it/s] 


Downloading: 100%|[32m██████████[0m|


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 405.01it/s] 


Downloading: 100%|[32m██████████[0m|


Calculating Metrics:  50%|█████     | 5/10 [00:00<00:00, 20.91it/s]  


Downloading: 100%|[32m██████████[0m|


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 319.61it/s] 


Downloading: 100%|[32m██████████[0m|


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 291.31it/s] 

Combined gx dytppe test results saved to gx_output/gx_dtype_summary.txt





In [28]:
context = gx.get_context()

# Query the fact table from GBQ
project_id = "projectm2-aiess"
fact_table_name = "olist_brazillian_ecommerce.FCT_ORDERS"
query = f"SELECT * FROM {fact_table_name}"
df_fact_table = read_gbq(query, project_id=project_id)

# Generate unique names for data source and asset
data_source_name = f"{fact_table_name}_data_source"
asset_name = f"{fact_table_name}_asset"

# Add data source
data_source = context.data_sources.add_pandas(name=data_source_name)

# Add DataFrame asset
data_asset = data_source.add_dataframe_asset(name=asset_name)

# Add batch definition
batch_definition_name = fact_table_name
batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

# Get the batch and print the first few rows
batch_parameters = {"dataframe": df_fact_table}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)
print(f"Batch for {fact_table_name}:")
print(batch.head(4))

# Add column expectations
schema_fct_orders_expectation_1 = gx.expectations.ExpectColumnToExist(
    column="payment_sid", column_index=0
)
schema_fct_orders_expectation_2 = gx.expectations.ExpectColumnToExist(
    column="review_sid", column_index=1
)
schema_fct_orders_expectation_3 = gx.expectations.ExpectColumnToExist(
    column="item_sid", column_index=2
)
schema_fct_orders_expectation_4 = gx.expectations.ExpectColumnToExist(
    column="customer_sid", column_index=3
)

# Create a new suite for the fact table schema validation
suite_name = "schema_fct_orders_expectation"
suite = gx.ExpectationSuite(name=suite_name)
suite = context.suites.add(suite)

suite.add_expectation(schema_fct_orders_expectation_1)
suite.add_expectation(schema_fct_orders_expectation_2)
suite.add_expectation(schema_fct_orders_expectation_3)
suite.add_expectation(schema_fct_orders_expectation_4)

# Create validation definition
definition_name = "schema_fct_orders_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)

# Run validation
validation_results = validation_definition.run()
print(f"Validation results for {fact_table_name}:")
print(validation_results)

Downloading: 100%|[32m██████████[0m|
Batch for olist_brazillian_ecommerce.FCT_ORDERS:


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 370.75it/s] 

                        pk_order_id                       pk_item_sid  \
0  a2e4c44360b4a57bdff22f3a4630c173  a2e4c44360b4a57bdff22f3a4630c173   
1  132f1e724165a07f6362532bfb97486e  132f1e724165a07f6362532bfb97486e   
2  809a282bbd5dbcabb6f2f724fca862ec  809a282bbd5dbcabb6f2f724fca862ec   
3  e5215415bb6f76fe3b7cb68103a0d1c0  e5215415bb6f76fe3b7cb68103a0d1c0   

                     fk_payment_sid                     fk_review_sid  \
0  a2e4c44360b4a57bdff22f3a4630c173  a2e4c44360b4a57bdff22f3a4630c173   
1  132f1e724165a07f6362532bfb97486e  132f1e724165a07f6362532bfb97486e   
2  809a282bbd5dbcabb6f2f724fca862ec  809a282bbd5dbcabb6f2f724fca862ec   
3  e5215415bb6f76fe3b7cb68103a0d1c0  e5215415bb6f76fe3b7cb68103a0d1c0   

                    fk_customer_sid fk_order_purchased_date_sid  \
0  8886130db0ea6e9e70ba0b03d7c0d286                    20170206   
1  b2191912d8ad6eac2e4dc3b6e1459515                    20170425   
2  622e13439d6b5a0b486c435618b2679e                    20160913   





BuildBatchRequestError: Bad input to build_batch_request: options must contain exactly 1 key, 'dataframe'.

In [None]:
context = gx.get_context()
suite_name = "df_delivery_expectation_suite"
suite = gx.ExpectationSuite(name=suite_name)
suite = context.suites.add(suite)

# Add expectations for column existence
suite.add_expectation(
    gx.expectations.ExpectColumnToExist(column="actual_delivery_time")
)
suite.add_expectation(
    gx.expectations.ExpectColumnToExist(column="estimated_delivery_time")
)
suite.add_expectation(
    gx.expectations.ExpectColumnToExist(column="actual_delivery_time_minutes")
)
suite.add_expectation(
    gx.expectations.ExpectColumnToExist(column="estimated_delivery_time_minutes")
)

# Add expectations for column values to be non-null
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="actual_delivery_time")
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="estimated_delivery_time")
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="actual_delivery_time_minutes")
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="estimated_delivery_time_minutes")
)

# Add expectations for column values to be positive
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeGreaterThan(
        column="actual_delivery_time_minutes", value=0
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeGreaterThan(
        column="estimated_delivery_time_minutes", value=0
    )
)

# Add expectations for column data types
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="actual_delivery_time", type_="timedelta64[ns]"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="estimated_delivery_time", type_="timedelta64[ns]"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="actual_delivery_time_minutes", type_="float"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="estimated_delivery_time_minutes", type_="float"
    )
)

# Add expectations for column value ranges
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="actual_delivery_time_minutes", min_value=0, max_value=10000
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="estimated_delivery_time_minutes", min_value=0, max_value=10000
    )
)

# Add expectations for consistency between columns
suite.add_expectation(
    gx.expectations.ExpectMulticolumnValuesToBeInSet(
        columns=["actual_delivery_time_minutes", "estimated_delivery_time_minutes"],
        condition="actual_delivery_time_minutes <= estimated_delivery_time_minutes"
    )
)

# Validate the DataFrame
batch_definition = data_asset.add_batch_definition_whole_dataframe("df_delivery_batch")
batch_parameters = {"dataframe": df_fact_table}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

validation_definition = gx.ValidationDefinition(
    data=batch, suite=suite, name="df_delivery_validation"
)
validation_results = validation_definition.run()
print(validation_results)