In [1]:
import numpy as np
import pandas as pd
import duckdb
import sqlalchemy
from pandas_gbq import read_gbq

In [2]:
import great_expectations as gx
from great_expectations import expectations as gxe

import pprint
import os

<div class="alert alert-block alert-info">
data testing

In [None]:
context = gx.get_context()
# query bigquery
project_id = "projectm2-aiess"
query = "SELECT * FROM olist_brazilian_ecommerce_target_star.DIM_CUSTOMERS"
df_customers = read_gbq(query, project_id=project_id)


data_source_name = "olist.dim_customers"
data_source = context.data_sources.add_pandas(name=data_source_name)

Downloading: 100%|[32m██████████[0m|


In [8]:
# create asset
data_asset_name = "olist.dim_customers_asset"
data_asset = data_source.add_dataframe_asset(name=data_asset_name)


In [9]:
# create batch
batch_definition_name = "batch_customers_dataframe"
batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

batch_parameters = {"dataframe": df_customers}

new_batch = batch_definition.get_batch(batch_parameters=batch_parameters)

In [10]:
print(new_batch.head(4))

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 241.15it/s]

                    pk_customer_sid                customer_unique_id  \
0  10ad09201fcc1c82d181ff7234bcdb3b  94742cd1fbac9146be7e2a139b63e13c   
1  8a0108267d9258a0ec9f74381bc9b0de  7a2dc4682890550ebe3b8befcea3d55c   
2  cd281c1a7d26cd29a3ed4b029fce7270  086d6b5b5ba195a91aa0a6ec8e75d1a4   
3  31dbc13addc753e210692eacaea065e4  5dbba6c01268a8ad43f79157bf4454a0   

   customer_zip_code_prefix customer_city customer_state  \
0                     69900    rio branco             AC   
1                     69900    rio branco             AC   
2                     69900    rio branco             AC   
3                     69900    rio branco             AC   

                load_date  
0  2025-06-13 01:47:46 PM  
1  2025-06-13 01:47:46 PM  
2  2025-06-13 01:47:46 PM  
3  2025-06-13 01:47:46 PM  





In [11]:
# Create a new suite for all dimension tables schema validation
suite_name = "schema_dim_customers_expectation"
suite = gx.ExpectationSuite(name=suite_name)

schema_dim_customers_expectation = gx.expectations.ExpectColumnToExist(
    column="customer_sid", column_index=0
)

context.suites.add_or_update(suite)
suite.add_expectation(schema_dim_customers_expectation)

definition_name = "schema_dim_customers_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)

In [14]:
context = gx.get_context()
# List of GBQ tables and their expected columns with types
gbq_tables_with_columns_and_types = {
    "olist_brazilian_ecommerce_target_star.DIM_PAYMENTS": {"payment_sid": "string"},
    "olist_brazilian_ecommerce_target_star.DIM_REVIEWS": {"review_sid": "string"},
    "olist_brazilian_ecommerce_target_star.DIM_GEOLOCATION": {"geolocation_zip_code_prefix": "string"},
    "olist_brazilian_ecommerce_target_star.DIM_ITEMS": {"item_sid": "string"},
    "olist_brazilian_ecommerce_target_star.DIM_DATE": {"date_sid": "integer"},
}

# Iterate over the list of tables and process each one
for table_name, expected_columns in gbq_tables_with_columns_and_types.items():
    # Query the table from GBQ
    query = f"SELECT * FROM {table_name}"
    df_table = read_gbq(query, project_id="projectm2-aiess")

    # Generate unique names for data source and asset
    data_source_name = f"{table_name}_data_source"
    asset_name = f"{table_name}_asset"

    # Add data source
    data_source = context.data_sources.add_pandas(name=data_source_name)

    # Add DataFrame asset
    data_asset = data_source.add_dataframe_asset(name=asset_name)

    # Add batch definition
    batch_definition_name = table_name
    batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

    # Get the batch and print the first few rows
    batch_parameters = {"dataframe": df_table}
    batch = batch_definition.get_batch(batch_parameters=batch_parameters)
    print(f"Batch for {table_name}:")
    print(batch.head(4))

    # Create Expectation Suite
    suite_name = f"{table_name}_expectation_suite"
    suite = gx.ExpectationSuite(name=suite_name)
    suite = context.suites.add(suite)

    # Add ExpectColumnValuesToBeOfType expectations for each expected column
    for column, column_type in expected_columns.items():
        expectation = gx.expectations.ExpectColumnValuesToBeOfType(
            column=column, type_=column_type
        )
        suite.add_expectation(expectation)

    # Create validation definition
    definition_name = f"{table_name}_validation_definition"
    validation_definition = gx.ValidationDefinition(
        data=batch_definition, suite=suite, name=definition_name
    )

    # Run validation
    validation_results = validation_definition.run(batch_parameters=batch_parameters)
    print(f"Validation results for {table_name}:")
    print(validation_results)

Downloading: 100%|[32m██████████[0m|
Batch for olist_brazilian_ecommerce_target_star.DIM_PAYMENTS:


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 699.28it/s] 

                     pk_payment_sid  payment_sequential payment_type  \
0  09b457f1152126535a2e7d4a3b73ff6f                   1       boleto   
1  ee0829e4d1cc7547c044b8643c14da38                   1       boleto   
2  db7576b1fe440f4c0a808855aacf0948                   1       boleto   
3  6ef587afa4703fd874edb7ade8efc7b0                   1       boleto   

   payment_installments  payment_value               load_date  
0                     1          14.08  2025-06-13 01:48:07 PM  
1                     1          18.52  2025-06-13 01:48:07 PM  
2                     1          18.62  2025-06-13 01:48:07 PM  
3                     1          20.23  2025-06-13 01:48:07 PM  



Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 628.08it/s] 


Validation results for olist_brazilian_ecommerce_target_star.DIM_PAYMENTS:
{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "column": "payment_sid",
          "type_": "string",
          "batch_id": "olist_brazilian_ecommerce_target_star.DIM_PAYMENTS_data_source-olist_brazilian_ecommerce_target_star.DIM_PAYMENTS_asset"
        },
        "meta": {},
        "id": "a63c3c95-53e3-431e-ab49-9b7cbba99fc5"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "exception_traceback": "Traceback (most recent call last):\n  File \"/Users/sqa.k/miniconda3/envs/elt/lib/python3.10/site-packages/great_expectations/validator/validator.py\", line 650, in graph_validate\n    result = expectation.metrics_validate(\n  File \"/Users/sqa.k/miniconda3/envs/elt/lib/python3.10/site-packages/great_expectations/expectations/expectation.py\", line 1147, 

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 654.44it/s] 

                      pk_review_sid fk_review_creation_date_sid  review_score  \
0  69ac6a27fde9855ebeaaecac0f78058b                    20161002             1   
1  6916ca4502d6d3bfd39818759d55d536                    20161006             1   
2  49f695dffa457eaba90d388a5c37e942                    20161009             1   
3  b28309e8253951a954123e039d3242f7                    20161020             1   

  review_comment_title                             review_comment_message  \
0                 None  MEU PEDIDO NÃO FOI ENTREGUE E NÃO FOI DADA NEN...   
1                 None     nao recebi o produto e nem resposta da empresa   
2                 None  PRODUTO NÃO CHEGOU,E JÁ PASSOU O PRAZO DE ENTREGA   
3                 None  Produto não foi entregue. Não foi enviada nota...   

       review_creation_date   review_answer_timestamp               load_date  
0 2016-10-02 00:00:00+00:00 2016-10-26 12:31:00+00:00  2025-06-13 01:48:20 PM  
1 2016-10-06 00:00:00+00:00 2016-10-07 18:32:28+


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 699.87it/s] 


Validation results for olist_brazilian_ecommerce_target_star.DIM_REVIEWS:
{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "column": "review_sid",
          "type_": "string",
          "batch_id": "olist_brazilian_ecommerce_target_star.DIM_REVIEWS_data_source-olist_brazilian_ecommerce_target_star.DIM_REVIEWS_asset"
        },
        "meta": {},
        "id": "1db6a0e9-b772-424f-9e42-70ce48ad9a23"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "exception_traceback": "Traceback (most recent call last):\n  File \"/Users/sqa.k/miniconda3/envs/elt/lib/python3.10/site-packages/great_expectations/validator/validator.py\", line 650, in graph_validate\n    result = expectation.metrics_validate(\n  File \"/Users/sqa.k/miniconda3/envs/elt/lib/python3.10/site-packages/great_expectations/expectations/expectation.py\", line 1147, in m

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 626.95it/s] 

   geolocation_zip_code_prefix  geolocation_lat  geolocation_lng  \
0                        81470       -25.571748       -49.334374   
1                        28930       -22.969370       -42.029834   
2                        87365       -24.271860       -53.069433   
3                        87365       -24.277697       -53.074930   

     geolocation_city geolocation_state               load_date  
0            * cidade                PR  2025-06-13 01:47:54 PM  
1  ...arraial do cabo                RJ  2025-06-13 01:47:54 PM  
2      4o. centenario                PR  2025-06-13 01:47:54 PM  
3       4º centenario                PR  2025-06-13 01:47:54 PM  



Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 663.97it/s] 


Validation results for olist_brazilian_ecommerce_target_star.DIM_GEOLOCATION:
{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "batch_id": "olist_brazilian_ecommerce_target_star.DIM_GEOLOCATION_data_source-olist_brazilian_ecommerce_target_star.DIM_GEOLOCATION_asset",
          "column": "geolocation_zip_code_prefix",
          "type_": "string"
        },
        "meta": {},
        "id": "024c5179-8c6c-4ffe-861e-0d4018ab5db3"
      },
      "result": {
        "observed_value": "int64"
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    }
  ],
  "suite_name": "olist_brazilian_ecommerce_target_star.DIM_GEOLOCATION_expectation_suite",
  "suite_parameters": {},
  "statistics": {
    "evaluated_expectations": 1,
    "successful_expectations":

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 565.27it/s] 

                        pk_item_sid fk_shipping_limit_date_sid  order_item_id  \
0  7f39ba4c9052be115350065d07583cac                   20171024              1   
1  9dc8d1a6f16f1b89874c29c9d8d30447                   20171018              1   
2  d455a8cb295653b55abda06d434ab492                   20171012              1   
3  fc8cfb0445794a015aae85ee6bdcf12b                   20170720              1   

                         product_id                         seller_id  \
0  a2ff5a97bf95719e38ea2e3b4105bce8  0015a82c2db000af6aaaf3ae2ecb0532   
1  a2ff5a97bf95719e38ea2e3b4105bce8  0015a82c2db000af6aaaf3ae2ecb0532   
2  a2ff5a97bf95719e38ea2e3b4105bce8  0015a82c2db000af6aaaf3ae2ecb0532   
3  08574b074924071f4e201e151b152b4e  001cca7ae9ae17fb1caed9dfb1094831   

        shipping_limit_date  price  freight_value product_category_name  \
0 2017-10-24 23:56:20+00:00  895.0          21.02       eletroportateis   
1 2017-10-18 14:49:22+00:00  895.0          21.02       eletroportateis   
2 2


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 348.57it/s] 


Validation results for olist_brazilian_ecommerce_target_star.DIM_ITEMS:
{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "column": "item_sid",
          "type_": "string",
          "batch_id": "olist_brazilian_ecommerce_target_star.DIM_ITEMS_data_source-olist_brazilian_ecommerce_target_star.DIM_ITEMS_asset"
        },
        "meta": {},
        "id": "43187fa5-15dd-49f6-acbe-a47528fe4082"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "exception_traceback": "Traceback (most recent call last):\n  File \"/Users/sqa.k/miniconda3/envs/elt/lib/python3.10/site-packages/great_expectations/validator/validator.py\", line 650, in graph_validate\n    result = expectation.metrics_validate(\n  File \"/Users/sqa.k/miniconda3/envs/elt/lib/python3.10/site-packages/great_expectations/expectations/expectation.py\", line 1147, in metrics_v

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 376.37it/s] 


  pk_date_sid   full_date  year  month month_name  day_of_month  day_name  \
0    20140417  2014-04-17  2014      4      April            17  Thursday   
1    20150423  2015-04-23  2015      4      April            23  Thursday   
2    20170413  2017-04-13  2017      4      April            13  Thursday   
3    20210418  2021-04-18  2021      4      April            18    Sunday   

   is_weekday               load_date  
0           1  2025-06-13 03:46:55 PM  
1           1  2025-06-13 03:46:55 PM  
2           1  2025-06-13 03:46:55 PM  
3           0  2025-06-13 03:46:55 PM  


Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 616.90it/s] 

Validation results for olist_brazilian_ecommerce_target_star.DIM_DATE:
{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_of_type",
        "kwargs": {
          "column": "date_sid",
          "type_": "integer",
          "batch_id": "olist_brazilian_ecommerce_target_star.DIM_DATE_data_source-olist_brazilian_ecommerce_target_star.DIM_DATE_asset"
        },
        "meta": {},
        "id": "5dc22bd2-4f3b-45ba-8de8-9a09830a27d9"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "exception_traceback": "Traceback (most recent call last):\n  File \"/Users/sqa.k/miniconda3/envs/elt/lib/python3.10/site-packages/great_expectations/validator/validator.py\", line 650, in graph_validate\n    result = expectation.metrics_validate(\n  File \"/Users/sqa.k/miniconda3/envs/elt/lib/python3.10/site-packages/great_expectations/expectations/expectation.py\", line 1147, in metrics_val




In [17]:
context = gx.get_context()

# Query the fact table from GBQ
project_id = "projectm2-aiess"
fact_table_name = "olist_brazilian_ecommerce_target.FCT_ORDERS"
query = f"SELECT * FROM {fact_table_name}"
df_fact_table = read_gbq(query, project_id=project_id)

# Generate unique names for data source and asset
data_source_name = f"{fact_table_name}_data_source"
asset_name = f"{fact_table_name}_asset"

# Add data source
data_source = context.data_sources.add_pandas(name=data_source_name)

# Add DataFrame asset
data_asset = data_source.add_dataframe_asset(name=asset_name)

# Add batch definition
batch_definition_name = fact_table_name
batch_definition = data_asset.add_batch_definition_whole_dataframe(batch_definition_name)

# Get the batch and print the first few rows
batch_parameters = {"dataframe": df_fact_table}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)
print(f"Batch for {fact_table_name}:")
print(batch.head(4))

# Add column expectations
schema_fct_orders_expectation_1 = gx.expectations.ExpectColumnToExist(
    column="payment_sid", column_index=0
)
schema_fct_orders_expectation_2 = gx.expectations.ExpectColumnToExist(
    column="review_sid", column_index=1
)
schema_fct_orders_expectation_3 = gx.expectations.ExpectColumnToExist(
    column="item_sid", column_index=2
)
schema_fct_orders_expectation_4 = gx.expectations.ExpectColumnToExist(
    column="customer_sid", column_index=3
)

# Create a new suite for the fact table schema validation
suite_name = "schema_fct_orders_expectation"
suite = gx.ExpectationSuite(name=suite_name)
suite = context.suites.add(suite)

suite.add_expectation(schema_fct_orders_expectation_1)
suite.add_expectation(schema_fct_orders_expectation_2)
suite.add_expectation(schema_fct_orders_expectation_3)
suite.add_expectation(schema_fct_orders_expectation_4)

# Create validation definition
definition_name = "schema_fct_orders_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)

# Run validation
validation_results = validation_definition.run(batch_parameters=batch_parameters)
print(f"Validation results for {fact_table_name}:")
print(validation_results)

GenericGBQException: Reason: 403 POST https://bigquery.googleapis.com/bigquery/v2/projects/projectm2-aiess/queries?prettyPrint=false: Access Denied: Table olist-462315:Olist_csv.olist_orders: User does not have permission to query table olist-462315:Olist_csv.olist_orders, or perhaps it does not exist.

In [24]:
context = gx.get_context()
suite_name = "df_delivery_expectation_suite"
suite = gx.ExpectationSuite(name=suite_name)
suite = context.suites.add(suite)

# Add expectations for column existence
suite.add_expectation(
    gx.expectations.ExpectColumnToExist(column="actual_delivery_time")
)
suite.add_expectation(
    gx.expectations.ExpectColumnToExist(column="estimated_delivery_time")
)
suite.add_expectation(
    gx.expectations.ExpectColumnToExist(column="actual_delivery_time_minutes")
)
suite.add_expectation(
    gx.expectations.ExpectColumnToExist(column="estimated_delivery_time_minutes")
)

# Add expectations for column values to be non-null
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="actual_delivery_time")
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="estimated_delivery_time")
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="actual_delivery_time_minutes")
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="estimated_delivery_time_minutes")
)

# Add expectations for column values to be positive
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="actual_delivery_time_minutes", min_value=0
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="estimated_delivery_time_minutes", min_value=0
    )
)

# Add expectations for column data types
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="actual_delivery_time", type_="timedelta64[ns]"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="estimated_delivery_time", type_="timedelta64[ns]"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="actual_delivery_time_minutes", type_="float"
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="estimated_delivery_time_minutes", type_="float"
    )
)

# Add expectations for column value ranges
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="actual_delivery_time_minutes", min_value=0, max_value=10000
    )
)
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="estimated_delivery_time_minutes", min_value=0, max_value=10000
    )
)

# Validate the DataFrame
batch_definition = data_asset.add_batch_definition_whole_dataframe("df_delivery_expectation_batch")
batch_parameters = {"dataframe": df_fact_table}
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

validation_definition = gx.ValidationDefinition(
    data=batch, suite=suite, name="df_delivery_validation"
)
validation_results = validation_definition.run(batch_parameters=batch_parameters)
print(validation_results)

NameError: name 'df_fact_table' is not defined