In [1]:
import numpy as np
import pandas as pd
import duckdb
import sqlalchemy
import matplotlib.pyplot as plt

In [2]:
import great_expectations as gx
from great_expectations import expectations as gxe

<div class="alert alert-block alert-info">
<b>Data testing</b>
</div>

In [5]:
# Obtain a Great Expectations context and register a pandas filesystem
# data source whose base directory is the dataset folder.
source_folder = "../dataset/"
data_source_name = "olist_geolocation_dataset"

context = gx.get_context()
data_source = context.data_sources.add_pandas_filesystem(
    base_directory=source_folder,
    name=data_source_name,
)

In [6]:
# Register a CSV asset on the geolocation data source.
asset_name = "olist_geolocation_dataset_files"
file_csv_asset = data_source.add_csv_asset(
    name=asset_name,
)


In [7]:
# Fetch the asset back through the context, by data source name and asset name.
geolocation_source = context.data_sources.get(data_source_name)
file_data_asset = geolocation_source.get_asset(asset_name)

In [8]:
# The batch definition is named after the single CSV file it points to
# (path is relative to the data source's base directory).
batch_definition_path = "olist_geolocation_dataset.csv"
batch_definition_name = batch_definition_path

batch_definition = file_data_asset.add_batch_definition_path(
    path=batch_definition_path,
    name=batch_definition_name,
)

# Load the batch described by the definition.
batch = batch_definition.get_batch()

In [9]:
# Preview the first four rows of the geolocation batch.
preview_rows = batch.head(4)
print(preview_rows)

Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 172.45it/s]

   geolocation_zip_code_prefix  geolocation_lat  geolocation_lng  \
0                         1037       -23.545621       -46.639292   
1                         1046       -23.546081       -46.644820   
2                         1046       -23.546129       -46.642951   
3                         1041       -23.544392       -46.639499   

  geolocation_city geolocation_state  
0        sao paulo                SP  
1        sao paulo                SP  
2        sao paulo                SP  
3        sao paulo                SP  





In [10]:
# Range checks on the latitude and longitude columns.
# NOTE(review): the bounds look like a rough bounding box for Brazil — confirm.
preset_lat_expectation = gxe.ExpectColumnValuesToBeBetween(
    column="geolocation_lat",
    min_value=-35,
    max_value=5,
)
preset_long_expectation = gxe.ExpectColumnValuesToBeBetween(
    column="geolocation_lng",
    min_value=-75,
    max_value=-35,
)

# Register an empty suite with the context first, then attach both checks
# to the registered suite object returned by the context.
suite_name = "br_ecom_expectation"
suite = context.suites.add(gx.ExpectationSuite(name=suite_name))
for range_check in (preset_lat_expectation, preset_long_expectation):
    suite.add_expectation(range_check)

# Pair the suite with the geolocation batch definition.
definition_name = "br_ecom_validation_definition"
validation_definition = gx.ValidationDefinition(
    name=definition_name,
    suite=suite,
    data=batch_definition,
)


In [11]:
# Run the validation definition and keep the returned result object
# so it can be inspected in the next cell.
validation_results = validation_definition.run()

Calculating Metrics: 100%|██████████| 17/17 [00:00<00:00, 73.54it/s] 


In [12]:
# Print the validation result: an overall "success" flag plus one entry per
# expectation with its unexpected-value statistics.
print(validation_results)

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "olist_geolocation_dataset-olist_geolocation_dataset_files",
          "column": "geolocation_lat",
          "min_value": -35.0,
          "max_value": 5.0
        },
        "meta": {},
        "id": "a66b55a0-975b-43ee-a1e4-61dc5152498b"
      },
      "result": {
        "element_count": 1000163,
        "unexpected_count": 29,
        "unexpected_percent": 0.002899527377037543,
        "partial_unexpected_list": [
          28.008978338034268,
          41.61405150610495,
          42.43928591592116,
          38.38167205114709,
          43.68496096631822,
          29.40925222930908,
          21.65754744931478,
          25.995202881053302,
          25.995244980240106,
          38.3239386880374,
          38.99196259838999,
          38.26820516582393,
          45.06593318269697,
     

In [None]:
# Obtain a context and register a pandas filesystem data source for the
# customers dimension table, rooted at the dataset folder.
source_folder = "../dataset/"
data_source_name = "olist.dim_customers"

context = gx.get_context()
data_source = context.data_sources.add_pandas_filesystem(
    base_directory=source_folder,
    name=data_source_name,
)

In [None]:
# Register a CSV asset on the dim_customers data source.
asset_name = "olist.dim_customers_files"
file_csv_asset = data_source.add_csv_asset(
    name=asset_name,
)


In [None]:
# Fetch the asset back through the context, by data source name and asset name.
customers_source = context.data_sources.get(data_source_name)
file_data_asset = customers_source.get_asset(asset_name)

In [None]:
# The batch definition is named after the single CSV file it points to
# (path is relative to the data source's base directory).
batch_definition_path = "olist.dim_customers.csv"
batch_definition_name = batch_definition_path

batch_definition = file_data_asset.add_batch_definition_path(
    path=batch_definition_path,
    name=batch_definition_name,
)

# Load the batch described by the definition.
batch = batch_definition.get_batch()

In [None]:
# Preview the first four rows of the dim_customers batch.
preview_rows = batch.head(4)
print(preview_rows)

In [None]:
# Schema check: "customer_sid" must exist, and column_index=0 additionally
# pins it to the first column position.
schema_dim_customers_expectation = gxe.ExpectColumnToExist(
    column="customer_sid",
    column_index=0,
)

# Suite holding the dim_customers schema expectation; it is registered with
# the context before the expectation is attached.
suite_name = "schema_dim_customers_expectation"
suite = context.suites.add(gx.ExpectationSuite(name=suite_name))
suite.add_expectation(schema_dim_customers_expectation)

# Validation definition tying the suite to the dim_customers batch definition.
# It is only constructed here; nothing in this cell runs it.
definition_name = "schema_dim_customers_definition"
validation_definition = gx.ValidationDefinition(
    name=definition_name,
    suite=suite,
    data=batch_definition,
)

In [None]:
# Column-existence validation for every dimension-table CSV.
#
# For each file this registers a data source, CSV asset and batch definition,
# previews the batch, builds an expectation suite with one ExpectColumnToExist
# per expected column, and runs the validation. Everything happens inside the
# loop: once the loop ends, csv_file / expected_columns / batch_definition only
# hold the values of the last file, so validating after the loop would check a
# single table.
context = gx.get_context()
source_folder = "../dataset/"

# CSV file name -> columns that must be present in that file
csv_files_with_columns = {
    "olist.dim_customers.csv": ["customer_sid"],
    "olist.dim_payments.csv": ["payment_sid"],
    "olist.dim_reviews.csv": ["review_sid"],
    "olist.dim_geolocation.csv": ["geolocation_zip_code_prefix"],
    "olist.dim_items.csv": ["item_sid"],
    "olist.dim_date.csv": ["date_sid"]
}

# CSV file name -> validation result, so every table's outcome stays available
# after the loop (validation_results alone only keeps the last one).
schema_validation_results = {}

for csv_file, expected_columns in csv_files_with_columns.items():
    # Strip only the trailing ".csv". Every file name starts with "olist.", so
    # split('.')[0] produced the identical prefix "olist" for all files and
    # therefore the same data source / asset / suite name on every iteration.
    dataset_name = csv_file.rsplit(".", 1)[0]
    data_source_name = f"{dataset_name}_data_source"
    asset_name = f"{dataset_name}_asset"

    # Add data source
    data_source = context.data_sources.add_pandas_filesystem(
        name=data_source_name,
        base_directory=source_folder
    )

    # Add CSV asset
    file_csv_asset = data_source.add_csv_asset(name=asset_name)

    # Get the asset and add a batch definition named after the file
    file_data_asset = context.data_sources.get(data_source_name).get_asset(asset_name)
    batch_definition_name = csv_file
    batch_definition_path = csv_file

    batch_definition = file_data_asset.add_batch_definition_path(
        name=batch_definition_name, path=batch_definition_path
    )

    # Get the batch and print the first few rows
    batch = batch_definition.get_batch()
    print(f"Batch for {csv_file}:")
    print(batch.head(4))

    # Create the expectation suite for this file and register it with the context
    suite_name = f"{dataset_name}_expectation_suite"
    suite = gx.ExpectationSuite(name=suite_name)
    suite = context.suites.add(suite)

    # Add one ExpectColumnToExist expectation per expected column
    for column in expected_columns:
        expectation = gx.expectations.ExpectColumnToExist(column=column)
        suite.add_expectation(expectation)

    # Create validation definition
    definition_name = f"{dataset_name}_validation_definition"
    validation_definition = gx.ValidationDefinition(
        data=batch_definition, suite=suite, name=definition_name
    )

    # Run validation and record the outcome for this file
    validation_results = validation_definition.run()
    schema_validation_results[csv_file] = validation_results
    print(f"Validation results for {csv_file}:")
    print(validation_results)

In [None]:
# Obtain a context and register a pandas filesystem data source for the
# orders fact table, rooted at the dataset folder.
source_folder = "../dataset/"
data_source_name = "olist.fct_orders.csv"

context = gx.get_context()
data_source = context.data_sources.add_pandas_filesystem(
    base_directory=source_folder,
    name=data_source_name,
)

In [None]:
# Register the fact-table CSV asset, then fetch it back through the context.
asset_name = "olist.fct_orders_files"
file_csv_asset = data_source.add_csv_asset(name=asset_name)

orders_source = context.data_sources.get(data_source_name)
file_data_asset = orders_source.get_asset(asset_name)

# The batch definition is named after the single CSV file it points to.
batch_definition_path = "olist.fct_orders.csv"
batch_definition_name = batch_definition_path
batch_definition = file_data_asset.add_batch_definition_path(
    path=batch_definition_path,
    name=batch_definition_name,
)

# Load the batch and preview its first four rows.
batch = batch_definition.get_batch()
preview_rows = batch.head(4)
print(preview_rows)

In [None]:
# Schema check for the orders fact table: each surrogate-key column must exist
# at the given position (list order == expected column_index, starting at 0).
fct_orders_expected_columns = ["payment_sid", "review_sid", "item_sid", "customer_sid"]

# One ExpectColumnToExist per column. Building them in a list avoids the
# copy-paste slips of hand-numbered variables (a name assigned twice so one
# check is lost, or a name that is added to the suite but never defined).
schema_fct_orders_expectations = [
    gx.expectations.ExpectColumnToExist(column=column, column_index=column_index)
    for column_index, column in enumerate(fct_orders_expected_columns)
]

# Create a new suite for the fact-table schema validation and register it
# with the context before attaching the expectations.
suite_name = "schema_fct_orders_expectation"
suite = gx.ExpectationSuite(name=suite_name)

suite = context.suites.add(suite)
for schema_fct_orders_expectation in schema_fct_orders_expectations:
    suite.add_expectation(schema_fct_orders_expectation)

# Validation definition tying the suite to the fct_orders batch definition.
# It is only constructed here; nothing in this cell runs it.
definition_name = "schema_fct_orders_definition"
validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=suite, name=definition_name
)

In [None]:
# Column-type validation for every dimension-table CSV.
#
# For each file this registers a data source, CSV asset and batch definition,
# previews the batch, builds an expectation suite with one
# ExpectColumnValuesToBeOfType per expected column, and runs the validation.
# All steps live inside the loop: once the loop ends, csv_file /
# expected_columns / data_source only hold the values of the last file, so
# code placed after the loop would process a single table.
context = gx.get_context()
source_folder = "../dataset/"

# CSV file name -> {column name: expected type name}
# NOTE(review): the type names are passed to ExpectColumnValuesToBeOfType
# unchanged; confirm that "string" / "integer" are accepted for a pandas-backed
# batch (dtype-style names such as "str" / "int64" may be required).
csv_files_with_columns_and_types = {
    "olist.dim_customers.csv": {"customer_sid": "string"},
    "olist.dim_payments.csv": {"payment_sid": "string"},
    "olist.dim_reviews.csv": {"review_sid": "string"},
    "olist.dim_geolocation.csv": {"geolocation_zip_code_prefix": "string"},
    "olist.dim_items.csv": {"item_sid": "string"},
    "olist.dim_date.csv": {"date_sid": "integer"}
}

# CSV file name -> validation result, so every table's outcome stays available
# after the loop (validation_results alone only keeps the last one).
type_validation_results = {}

for csv_file, expected_columns in csv_files_with_columns_and_types.items():
    # Strip only the trailing ".csv". Every file name starts with "olist.", so
    # split('.')[0] produced the identical prefix "olist" for all files and
    # therefore the same data source / asset / suite name on every iteration.
    dataset_name = csv_file.rsplit(".", 1)[0]
    data_source_name = f"{dataset_name}_data_source"
    asset_name = f"{dataset_name}_asset"

    # Add data source
    data_source = context.data_sources.add_pandas_filesystem(
        name=data_source_name,
        base_directory=source_folder,
    )

    # Add CSV asset
    file_csv_asset = data_source.add_csv_asset(name=asset_name)

    # Get the asset and add a batch definition named after the file
    file_data_asset = context.data_sources.get(data_source_name).get_asset(asset_name)
    batch_definition_name = csv_file
    batch_definition_path = csv_file

    batch_definition = file_data_asset.add_batch_definition_path(
        name=batch_definition_name, path=batch_definition_path
    )

    # Get the batch and print the first few rows
    batch = batch_definition.get_batch()
    print(f"Batch for {csv_file}:")
    print(batch.head(4))

    # Create the expectation suite for this file and register it with the context
    suite_name = f"{dataset_name}_expectation_suite"
    suite = gx.ExpectationSuite(name=suite_name)
    suite = context.suites.add(suite)

    # Add one ExpectColumnValuesToBeOfType expectation per expected column
    for column, column_type in expected_columns.items():
        expectation = gx.expectations.ExpectColumnValuesToBeOfType(
            column=column, type_=column_type
        )
        suite.add_expectation(expectation)

    # Create validation definition
    definition_name = f"{dataset_name}_validation_definition"
    validation_definition = gx.ValidationDefinition(
        data=batch_definition, suite=suite, name=definition_name
    )

    # Run validation and record the outcome for this file
    validation_results = validation_definition.run()
    type_validation_results[csv_file] = validation_results
    print(f"Validation results for {csv_file}:")
    print(validation_results)