In [1]:
import great_expectations as ge
import pandas as pd
from great_expectations.core.expectation_configuration import ExpectationConfiguration

In [2]:
# Obtain the Great Expectations data context that the later cells operate on.
context = ge.get_context()

In [3]:
# Read the scraped dataset (comma separated, decoded as latin1) into a frame.
pand = pd.read_csv(
    'C:\\Users\\Wilber\\Downloads\\datasetCrystal06102023.csv',
    sep=',',
    encoding='latin1',
)
# Display the column index so the schema can be checked by eye.
pand.columns

Index(['date', 'canal', 'category', 'subcategory', 'subcategory2',
       'subcategory3', 'marca', 'modelo', 'sku', 'upc', 'item',
       'item characteristics', 'url sku', 'image', 'price', 'sale price',
       'shipment cost', 'sales flag', 'store id', 'store name',
       'store address', 'stock', 'upc wm', 'final price', 'upc wm2', 'comp',
       'composition', 'homogenized_clothing', 'homogenized_subcategory',
       'homogenized_category', 'homogenized_color', 'made_in'],
      dtype='object')

In [4]:
# List the distinct values found in the `canal` column.
pand["canal"].unique()

array(['Zara Colombia', 'Pull&Bear Colombia', 'Arturo Calle Colombia',
       'H&M Colombia', 'Tennis Colombia', 'Polito Colombia',
       'Mango Colombia', 'Offcorss Colombia', 'Bronzini Colombia'],
      dtype=object)

In [5]:
# Register (or refresh) the pandas datasource that holds the scraped data.
scrapping_ds = context.sources.add_or_update_pandas(name='scrapping')
# Dataframe asset through which in-memory frames are handed to the datasource.
scrapping_src_asset = scrapping_ds.add_dataframe_asset("scrapping_df")
# Batch request wrapping the whole scraped dataframe.
scrapping_batch = scrapping_src_asset.build_batch_request(dataframe=pand)


In [6]:
# Create the suite, or reset it when it already exists, so that this cell can
# be re-run without failing on a duplicate suite name.
scrapping_suite = context.add_or_update_expectation_suite(
    expectation_suite_name="scrapping_suite_2"
)

# First expectation - the table's columns must match this set of 32 names.
col_32_exp = ExpectationConfiguration(
    expectation_type="expect_table_columns_to_match_set",
    kwargs={
        "column_set": [
            "date", "canal", "category", "subcategory", "subcategory2",
            "subcategory3", "marca", "modelo", "sku", "upc", "item",
            "item characteristics", "url sku", "image", "price", "sale price",
            "shipment cost", "sales flag", "store id", "store name",
            "store address", "stock", "upc wm", "final price", "upc wm2", "comp",
            "composition", "homogenized_clothing", "homogenized_subcategory",
            "homogenized_category", "homogenized_color", "made_in"
        ]
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "**Archivo Fuente de Scrapping** `Valida el esquema del archivo de Scrapping`",
        }
    },
)
scrapping_suite.add_expectation(expectation_configuration=col_32_exp)

# Second expectation - format of the dates in the scrapping file.
# The format must be written with strftime directives: a literal such as
# "YYYYMMDD" contains no directives, so no real date could ever match it.
# NOTE(review): this expectation parses string values - confirm that read_csv
# does not infer `date` as an integer column, and that the file really stores
# dates as year-month-day without separators.
scrap_date_format = ExpectationConfiguration(
    expectation_type="expect_column_values_to_match_strftime_format",
    kwargs={
        "column": "date",
        "strftime_format": "%Y%m%d",
    },
    meta={
        "notes": {
            "format": "markdown",
            "content": "**Fecha Scrapping** `Valida el Formato de la Fecha de Scrapping`",
        }
    },
)
# Add the expectation and persist the suite in the context's store.
scrapping_suite.add_expectation(expectation_configuration=scrap_date_format)
context.save_expectation_suite(scrapping_suite)

In [11]:
# Display the data context's configuration.
context

{
  "anonymous_usage_statistics": {
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "enabled": true,
    "explicit_id": true,
    "explicit_url": false,
    "data_context_id": "19e1b072-80dc-4193-ae04-cc0fe5b7abcd"
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_version": 3,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "C:\\Users\\Wilber\\AppData\\Local\\Temp\\tmpteln74rl"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {},
  "evaluation_parameter_store_name": "evaluation_parameter_store",
  "expectations_store_name": "expectations_store",
  "fluent_datasources": {},
  "include_rendered_content": {
    "globally": false,
    "expectation_validation_result": false,
    "

In [7]:
# Build a validator that pairs the scrapping batch with the saved suite.
validator = context.get_validator(batch_request=scrapping_batch, expectation_suite_name="scrapping_suite_2")

In [8]:
# Pair the scrapping batch with its suite, then register (or refresh) the
# checkpoint that runs that validation.
scrapping_validations = [
    {"batch_request": scrapping_batch, "expectation_suite_name": "scrapping_suite_2"},
]
scrapping_checkpoint = context.add_or_update_checkpoint(
    name="scrapping_checkpoint",
    validations=scrapping_validations,
)

In [9]:
# Execute the checkpoint and keep its result for inspection.
scrapping_checkpoint_result = scrapping_checkpoint.run()

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [10]:
# Display the checkpoint run result (per-expectation success and details).
scrapping_checkpoint_result

{
  "run_id": {
    "run_name": null,
    "run_time": "2023-11-19T09:22:27.721889-05:00"
  },
  "run_results": {
    "ValidationResultIdentifier::scrapping_suite_2/__none__/20231119T142227.721889Z/scrapping-scrapping_df": {
      "validation_result": {
        "success": false,
        "results": [
          {
            "success": true,
            "expectation_config": {
              "expectation_type": "expect_table_columns_to_match_set",
              "kwargs": {
                "column_set": [
                  "date",
                  "canal",
                  "category",
                  "subcategory",
                  "subcategory2",
                  "subcategory3",
                  "marca",
                  "modelo",
                  "sku",
                  "upc",
                  "item",
                  "item characteristics",
                  "url sku",
                  "image",
                  "price",
                  "sale price",
                  "shi

In [None]:
# Run the onboarding data assistant against the scrapping batch request.
onboarding_result = context.assistants.onboarding.run(scrapping_batch)

In [None]:
# Plot the expectations and metrics from the onboarding assistant run.
onboarding_result.plot_expectations_and_metrics()

In [None]:
# Show the onboarding assistant's expectations grouped by domain type.
onboarding_result.show_expectations_by_domain_type()

In [None]:
# Show the onboarding assistant's expectations grouped by expectation type.
onboarding_result.show_expectations_by_expectation_type()

In [None]:
# Run the missingness data assistant against the scrapping batch request.
missing_result = context.assistants.missingness.run(scrapping_batch)

In [None]:
# Plot the expectations and metrics from the missingness assistant run.
missing_result.plot_expectations_and_metrics()

In [None]:
# Run the volume data assistant against the scrapping batch request.
volum = context.assistants.volume.run(scrapping_batch)

In [None]:
# Plot the expectations and metrics from the volume assistant run.
volum.plot_expectations_and_metrics()

In [None]:
# Load the homologation table; note that this rebinds `pand`, replacing the
# scraped dataset that the earlier cells loaded under the same name.
pand = pd.read_csv(
    'C:\\Users\\Wilber\\Downloads\\homologaciones.csv',
)
# Display the column index of the newly loaded frame.
pand.columns


In [None]:
# Register a dedicated datasource and dataframe asset for the homologation
# table, and build the batch request from that asset. The asset and batch
# must hang off `equiv_ds` / `equiv_asset`; routing them through the
# scrapping datasource would attribute this data to the wrong source.
equiv_ds = context.sources.add_or_update_pandas(name='homolog')
equiv_asset = equiv_ds.add_dataframe_asset("homolog_asset")
equiv_batch = equiv_asset.build_batch_request(dataframe=pand)


In [None]:
# Run the onboarding data assistant against `equiv_batch` (homologation data).
homol = context.assistants.onboarding.run(equiv_batch)

In [None]:
# Plot the expectations and metrics from the homologation assistant run.
homol.plot_expectations_and_metrics()

In [None]:
# Register a dedicated datasource and dataframe asset for "otallas" and build
# the batch request from that asset, so results are attributed to it rather
# than to the scrapping datasource.
# NOTE(review): no CSV is loaded in this cell, so `pand` still holds whatever
# frame the most recently executed read_csv cell produced - confirm which
# source file is intended here.
otallas_ds = context.sources.add_or_update_pandas(name='otallas')
otallas_asset = otallas_ds.add_dataframe_asset("otallas_asset")
otallas_batch = otallas_asset.build_batch_request(dataframe=pand)
# Profile the batch with the onboarding data assistant.
otallas = context.assistants.onboarding.run(otallas_batch)

In [None]:
# Plot the expectations and metrics from the "otallas" assistant run.
otallas.plot_expectations_and_metrics()

In [None]:
# Load the marcprop table; this rebinds `pand` to the new frame.
pand = pd.read_csv(
    'C:\\Users\\Wilber\\Downloads\\marcprop.csv',
)
# Display the column index of the newly loaded frame.
pand.columns

In [None]:
# Load the marcprop table and profile it through its own datasource/asset.
pand = pd.read_csv('C:\\Users\\Wilber\\Downloads\\marcprop.csv')
mprop_ds = context.sources.add_or_update_pandas(name='mprop')
mprop_asset = mprop_ds.add_dataframe_asset("mprop_asset")
# Build the batch request from `mprop_asset`; building it from the scrapping
# asset would attribute this data to the wrong datasource and asset.
mprop_batch = mprop_asset.build_batch_request(dataframe=pand)
# Profile the batch with the onboarding data assistant.
mprop = context.assistants.onboarding.run(mprop_batch)

In [None]:
# Plot the expectations and metrics from the marcprop assistant run.
mprop.plot_expectations_and_metrics()